diff --git "a/checkpoint-12000/trainer_state.json" "b/checkpoint-12000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-12000/trainer_state.json" @@ -0,0 +1,168019 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9477357571822758, + "eval_steps": 500, + "global_step": 12000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 2.702702702702703e-08, + "logits/chosen": -0.6498041152954102, + "logits/rejected": -0.5546936392784119, + "logps/chosen": -203.9995574951172, + "logps/rejected": -214.9104461669922, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 5.405405405405406e-08, + "logits/chosen": -0.5683586001396179, + "logits/rejected": -0.47829124331474304, + "logps/chosen": -139.94802856445312, + "logps/rejected": -89.8438949584961, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.0, + "learning_rate": 8.108108108108109e-08, + "logits/chosen": -0.13393600285053253, + "logits/rejected": -0.12582682073116302, + "logps/chosen": -62.24049758911133, + "logps/rejected": -7.032988548278809, + "loss": 0.6905, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.007649231236428022, + "rewards/margins": -0.007175636477768421, + "rewards/rejected": -0.00047359467134810984, + "step": 3 + }, + { + "epoch": 0.0, + "learning_rate": 1.0810810810810812e-07, + "logits/chosen": 0.10883359611034393, + "logits/rejected": 0.15941740572452545, + "logps/chosen": -60.321937561035156, + "logps/rejected": -71.6085205078125, + "loss": 0.6939, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05187797546386719, + "rewards/margins": 0.0388362891972065, + "rewards/rejected": 0.013041687197983265, + "step": 4 + }, + { + "epoch": 0.0, + "learning_rate": 1.3513513513513515e-07, + "logits/chosen": 0.09655981510877609, + "logits/rejected": 0.09719934314489365, + "logps/chosen": -8.410079002380371, + "logps/rejected": -9.139074325561523, + "loss": 0.671, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.014026832766830921, + "rewards/margins": -0.0007179258391261101, + "rewards/rejected": -0.013308906927704811, + "step": 5 + }, + { + "epoch": 0.0, + "learning_rate": 1.6216216216216218e-07, + "logits/chosen": -0.5018764138221741, + "logits/rejected": -0.5089478492736816, + "logps/chosen": -162.28591918945312, + "logps/rejected": -129.50613403320312, + "loss": 0.7639, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.01582946814596653, + "rewards/margins": -0.04836120456457138, + "rewards/rejected": 0.03253173828125, + "step": 6 + }, + { + "epoch": 0.0, + "learning_rate": 1.8918918918918921e-07, + "logits/chosen": -0.6321228742599487, + "logits/rejected": -1.0973819494247437, + "logps/chosen": -135.6925048828125, + "logps/rejected": -37.35700225830078, + "loss": 0.6942, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0060134888626635075, + "rewards/margins": 0.008611679077148438, + "rewards/rejected": -0.014625168405473232, + "step": 7 + }, + { + "epoch": 0.0, + "learning_rate": 2.1621621621621625e-07, + "logits/chosen": -0.07189406454563141, + "logits/rejected": -0.09807883203029633, + "logps/chosen": -175.12692260742188, + "logps/rejected": -130.64334106445312, + "loss": 0.6617, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0033966065384447575, + "rewards/margins": 0.02865600772202015, + "rewards/rejected": -0.03205261379480362, + "step": 8 + }, + { + "epoch": 0.0, + "learning_rate": 2.4324324324324326e-07, + "logits/chosen": -0.4548376202583313, + "logits/rejected": -0.4348132312297821, + "logps/chosen": -90.37907409667969, + "logps/rejected": -37.03586196899414, + "loss": 0.7, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.00241851806640625, + "rewards/margins": -0.01372604351490736, + "rewards/rejected": 0.01130752544850111, + "step": 9 + }, + { + "epoch": 0.0, + "learning_rate": 2.702702702702703e-07, + "logits/chosen": -0.5907372832298279, + "logits/rejected": -0.6872574090957642, + "logps/chosen": -282.7813415527344, + "logps/rejected": -106.4892578125, + "loss": 0.7129, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06922302395105362, + "rewards/margins": 0.056191254407167435, + "rewards/rejected": 0.01303176861256361, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 2.972972972972973e-07, + "logits/chosen": -0.7774422764778137, + "logits/rejected": -0.7980676293373108, + "logps/chosen": -168.11203002929688, + "logps/rejected": -147.12152099609375, + "loss": 0.7002, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.015194701962172985, + "rewards/margins": -0.02163086086511612, + "rewards/rejected": 0.03682556375861168, + "step": 11 + }, + { + "epoch": 0.0, + "learning_rate": 3.2432432432432436e-07, + "logits/chosen": -0.3884061276912689, + "logits/rejected": -0.3906358778476715, + "logps/chosen": -4.5940656661987305, + "logps/rejected": -3.533803939819336, + "loss": 0.7024, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.009688282385468483, + "rewards/margins": -0.013838721439242363, + "rewards/rejected": 0.004150438588112593, + "step": 12 + }, + { + "epoch": 0.0, + "learning_rate": 3.513513513513514e-07, + "logits/chosen": -0.4521062970161438, + "logits/rejected": -0.4931703507900238, + "logps/chosen": -146.48838806152344, + "logps/rejected": -169.53939819335938, + "loss": 0.6647, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03493957594037056, + "rewards/margins": 0.06259612739086151, + "rewards/rejected": -0.02765655517578125, + "step": 13 + }, + { + "epoch": 0.0, + "learning_rate": 3.7837837837837843e-07, + "logits/chosen": -0.32512426376342773, + "logits/rejected": -0.303484171628952, + "logps/chosen": -95.06636810302734, + "logps/rejected": -119.577880859375, + "loss": 0.7014, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.018029022961854935, + "rewards/margins": -0.041393280029296875, + "rewards/rejected": 0.02336425893008709, + "step": 14 + }, + { + "epoch": 0.0, + "learning_rate": 4.0540540540540546e-07, + "logits/chosen": -0.8186230063438416, + "logits/rejected": -0.857027530670166, + "logps/chosen": -167.0155792236328, + "logps/rejected": -186.492431640625, + "loss": 0.7143, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.02746429480612278, + "rewards/margins": -0.05318145826458931, + "rewards/rejected": 0.02571716345846653, + "step": 15 + }, + { + "epoch": 0.0, + "learning_rate": 4.324324324324325e-07, + "logits/chosen": -0.21891331672668457, + "logits/rejected": -0.18656358122825623, + "logps/chosen": -109.35826873779297, + "logps/rejected": -106.5494613647461, + "loss": 0.6914, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.02374877966940403, + "rewards/margins": -0.030660249292850494, + "rewards/rejected": 0.00691146869212389, + "step": 16 + }, + { + "epoch": 0.0, + "learning_rate": 4.5945945945945953e-07, + "logits/chosen": -0.20082136988639832, + "logits/rejected": -0.23444963991641998, + "logps/chosen": -81.04539489746094, + "logps/rejected": -170.18988037109375, + "loss": 0.7138, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.046953584998846054, + "rewards/margins": -0.0905357375741005, + "rewards/rejected": 0.04358215257525444, + "step": 17 + }, + { + "epoch": 0.0, + "learning_rate": 4.864864864864865e-07, + "logits/chosen": -0.32738712430000305, + "logits/rejected": -0.32738712430000305, + "logps/chosen": -54.252418518066406, + "logps/rejected": -54.252418518066406, + "loss": 0.6725, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.017116164788603783, + "rewards/margins": 0.0, + "rewards/rejected": 0.017116164788603783, + "step": 18 + }, + { + "epoch": 0.0, + "learning_rate": 5.135135135135135e-07, + "logits/chosen": -0.10929717123508453, + "logits/rejected": -0.10929717123508453, + "logps/chosen": -71.61502075195312, + "logps/rejected": -71.61502075195312, + "loss": 0.7221, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.04779815673828125, + "rewards/margins": 0.0, + "rewards/rejected": 0.04779815673828125, + "step": 19 + }, + { + "epoch": 0.0, + "learning_rate": 5.405405405405406e-07, + "logits/chosen": -0.4148974120616913, + "logits/rejected": -0.4252752363681793, + "logps/chosen": -84.35517883300781, + "logps/rejected": -70.3454360961914, + "loss": 0.6882, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.00379180908203125, + "rewards/margins": -0.016524504870176315, + "rewards/rejected": 0.01273269671946764, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 5.675675675675676e-07, + "logits/chosen": -0.27633753418922424, + "logits/rejected": -0.24961145222187042, + "logps/chosen": -86.89443969726562, + "logps/rejected": -99.29306030273438, + "loss": 0.6975, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.010845184326171875, + "rewards/margins": -0.044678498059511185, + "rewards/rejected": 0.03383331373333931, + "step": 21 + }, + { + "epoch": 0.0, + "learning_rate": 5.945945945945947e-07, + "logits/chosen": -0.5269609093666077, + "logits/rejected": -0.5630397796630859, + "logps/chosen": -208.6320037841797, + "logps/rejected": -56.999935150146484, + "loss": 0.7, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05099640041589737, + "rewards/margins": 0.04648246988654137, + "rewards/rejected": 0.004513931460678577, + "step": 22 + }, + { + "epoch": 0.0, + "learning_rate": 6.216216216216217e-07, + "logits/chosen": -0.5358319878578186, + "logits/rejected": -0.5356854796409607, + "logps/chosen": -82.70429992675781, + "logps/rejected": -83.46559143066406, + "loss": 0.6915, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.016956329345703125, + "rewards/margins": -0.052490998059511185, + "rewards/rejected": 0.03553466871380806, + "step": 23 + }, + { + "epoch": 0.0, + "learning_rate": 6.486486486486487e-07, + "logits/chosen": -0.07701059430837631, + "logits/rejected": -0.07868491113185883, + "logps/chosen": -92.2003173828125, + "logps/rejected": -109.24430847167969, + "loss": 0.6945, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03281555324792862, + "rewards/margins": -0.029000092297792435, + "rewards/rejected": -0.0038154602516442537, + "step": 24 + }, + { + "epoch": 0.0, + "learning_rate": 6.756756756756758e-07, + "logits/chosen": -0.7208604216575623, + "logits/rejected": -0.725371778011322, + "logps/chosen": -77.03095245361328, + "logps/rejected": -170.79388427734375, + "loss": 0.6921, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.024109650403261185, + "rewards/margins": 0.024651337414979935, + "rewards/rejected": -0.04876098781824112, + "step": 25 + }, + { + "epoch": 0.0, + "learning_rate": 7.027027027027028e-07, + "logits/chosen": -0.5477840900421143, + "logits/rejected": -0.5034233331680298, + "logps/chosen": -253.19888305664062, + "logps/rejected": -119.06398010253906, + "loss": 0.6864, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04578247293829918, + "rewards/margins": 0.01485748402774334, + "rewards/rejected": 0.03092498891055584, + "step": 26 + }, + { + "epoch": 0.0, + "learning_rate": 7.297297297297298e-07, + "logits/chosen": -0.21304064989089966, + "logits/rejected": -0.21304064989089966, + "logps/chosen": -56.244728088378906, + "logps/rejected": -56.244728088378906, + "loss": 0.6982, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0185699462890625, + "rewards/margins": 0.0, + "rewards/rejected": 0.0185699462890625, + "step": 27 + }, + { + "epoch": 0.0, + "learning_rate": 7.567567567567569e-07, + "logits/chosen": -0.1799372136592865, + "logits/rejected": -0.1799372136592865, + "logps/chosen": -20.409658432006836, + "logps/rejected": -20.409658432006836, + "loss": 0.7061, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.025806045159697533, + "rewards/margins": 0.0, + "rewards/rejected": -0.025806045159697533, + "step": 28 + }, + { + "epoch": 0.0, + "learning_rate": 7.837837837837839e-07, + "logits/chosen": -0.4208410680294037, + "logits/rejected": -0.43514594435691833, + "logps/chosen": -18.232845306396484, + "logps/rejected": -28.007917404174805, + "loss": 0.6937, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.003099823137745261, + "rewards/margins": 0.006562614813446999, + "rewards/rejected": -0.0034627914428710938, + "step": 29 + }, + { + "epoch": 0.0, + "learning_rate": 8.108108108108109e-07, + "logits/chosen": -0.37855663895606995, + "logits/rejected": -0.3099723756313324, + "logps/chosen": -129.68798828125, + "logps/rejected": -59.472957611083984, + "loss": 0.7067, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.02169647254049778, + "rewards/margins": -0.039481356739997864, + "rewards/rejected": 0.017784882336854935, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 8.37837837837838e-07, + "logits/chosen": -0.2582060992717743, + "logits/rejected": -0.20829670131206512, + "logps/chosen": -93.4700698852539, + "logps/rejected": -67.03055572509766, + "loss": 0.713, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.041413117200136185, + "rewards/margins": 0.05256805568933487, + "rewards/rejected": -0.01115493755787611, + "step": 31 + }, + { + "epoch": 0.01, + "learning_rate": 8.64864864864865e-07, + "logits/chosen": -0.37754759192466736, + "logits/rejected": -0.32134830951690674, + "logps/chosen": -61.36663818359375, + "logps/rejected": -136.7816925048828, + "loss": 0.6566, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04551391676068306, + "rewards/margins": 0.1145172119140625, + "rewards/rejected": -0.06900329887866974, + "step": 32 + }, + { + "epoch": 0.01, + "learning_rate": 8.91891891891892e-07, + "logits/chosen": -0.6691619157791138, + "logits/rejected": -0.6233839392662048, + "logps/chosen": -193.85940551757812, + "logps/rejected": -174.20819091796875, + "loss": 0.6714, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03594818338751793, + "rewards/margins": 0.10399018228054047, + "rewards/rejected": -0.06804199516773224, + "step": 33 + }, + { + "epoch": 0.01, + "learning_rate": 9.189189189189191e-07, + "logits/chosen": -0.14148908853530884, + "logits/rejected": -0.11517009884119034, + "logps/chosen": -102.1534194946289, + "logps/rejected": -91.98575592041016, + "loss": 0.692, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.027033234015107155, + "rewards/margins": -0.0125961322337389, + "rewards/rejected": 0.039629366248846054, + "step": 34 + }, + { + "epoch": 0.01, + "learning_rate": 9.459459459459461e-07, + "logits/chosen": -0.4798603057861328, + "logits/rejected": -0.47774791717529297, + "logps/chosen": -69.21839904785156, + "logps/rejected": -78.19509887695312, + "loss": 0.6617, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.019161224365234375, + "rewards/margins": 0.003919219598174095, + "rewards/rejected": 0.01524200476706028, + "step": 35 + }, + { + "epoch": 0.01, + "learning_rate": 9.72972972972973e-07, + "logits/chosen": -0.4176008999347687, + "logits/rejected": -0.41002270579338074, + "logps/chosen": -72.25230407714844, + "logps/rejected": -102.09195709228516, + "loss": 0.7045, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.021532440558075905, + "rewards/margins": -0.045049287378787994, + "rewards/rejected": 0.02351684682071209, + "step": 36 + }, + { + "epoch": 0.01, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": -0.2946321368217468, + "logits/rejected": -0.2540622651576996, + "logps/chosen": -175.62266540527344, + "logps/rejected": -169.7700958251953, + "loss": 0.6968, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08188018947839737, + "rewards/margins": 0.09403381496667862, + "rewards/rejected": -0.01215362548828125, + "step": 37 + }, + { + "epoch": 0.01, + "learning_rate": 1.027027027027027e-06, + "logits/chosen": -0.018933556973934174, + "logits/rejected": -0.06329285353422165, + "logps/chosen": -89.59103393554688, + "logps/rejected": -139.76229858398438, + "loss": 0.6602, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0287017822265625, + "rewards/margins": 0.07011261582374573, + "rewards/rejected": -0.04141082987189293, + "step": 38 + }, + { + "epoch": 0.01, + "learning_rate": 1.0540540540540542e-06, + "logits/chosen": -0.04627982899546623, + "logits/rejected": -0.06653185188770294, + "logps/chosen": -133.7671356201172, + "logps/rejected": -65.60462951660156, + "loss": 0.6416, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01565857045352459, + "rewards/margins": 0.05074310302734375, + "rewards/rejected": -0.03508453443646431, + "step": 39 + }, + { + "epoch": 0.01, + "learning_rate": 1.0810810810810812e-06, + "logits/chosen": -0.6726418137550354, + "logits/rejected": -0.665627121925354, + "logps/chosen": -183.9945526123047, + "logps/rejected": -90.5576171875, + "loss": 0.6905, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.05094451829791069, + "rewards/margins": -0.07900543510913849, + "rewards/rejected": 0.0280609130859375, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 1.1081081081081083e-06, + "logits/chosen": -0.6551176905632019, + "logits/rejected": -0.5985067486763, + "logps/chosen": -105.78885650634766, + "logps/rejected": -51.130340576171875, + "loss": 0.6682, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01623077504336834, + "rewards/margins": 0.01463470607995987, + "rewards/rejected": 0.0015960693126544356, + "step": 41 + }, + { + "epoch": 0.01, + "learning_rate": 1.1351351351351352e-06, + "logits/chosen": -0.11211523413658142, + "logits/rejected": -0.11211523413658142, + "logps/chosen": -82.69827270507812, + "logps/rejected": -82.69827270507812, + "loss": 0.7098, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.008691406808793545, + "rewards/margins": 0.0, + "rewards/rejected": 0.008691406808793545, + "step": 42 + }, + { + "epoch": 0.01, + "learning_rate": 1.1621621621621624e-06, + "logits/chosen": -0.5662931799888611, + "logits/rejected": -0.5639760494232178, + "logps/chosen": -105.44888305664062, + "logps/rejected": -32.0634880065918, + "loss": 0.7007, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.01949768140912056, + "rewards/margins": -0.026471329852938652, + "rewards/rejected": 0.006973647978156805, + "step": 43 + }, + { + "epoch": 0.01, + "learning_rate": 1.1891891891891893e-06, + "logits/chosen": -0.05197162553668022, + "logits/rejected": -0.053507763892412186, + "logps/chosen": -50.329524993896484, + "logps/rejected": -120.26408386230469, + "loss": 0.6673, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04514351114630699, + "rewards/margins": 0.08524894714355469, + "rewards/rejected": -0.040105439722537994, + "step": 44 + }, + { + "epoch": 0.01, + "learning_rate": 1.2162162162162164e-06, + "logits/chosen": -0.4379173219203949, + "logits/rejected": -0.44327402114868164, + "logps/chosen": -88.14654541015625, + "logps/rejected": -185.4222412109375, + "loss": 0.7043, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.01965637318789959, + "rewards/margins": -0.08961792290210724, + "rewards/rejected": 0.0699615478515625, + "step": 45 + }, + { + "epoch": 0.01, + "learning_rate": 1.2432432432432434e-06, + "logits/chosen": 0.038530927151441574, + "logits/rejected": 0.13283371925354004, + "logps/chosen": -103.72206115722656, + "logps/rejected": -20.944061279296875, + "loss": 0.698, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.002340698381885886, + "rewards/margins": 0.012176132760941982, + "rewards/rejected": -0.009835434146225452, + "step": 46 + }, + { + "epoch": 0.01, + "learning_rate": 1.2702702702702705e-06, + "logits/chosen": -0.8929139971733093, + "logits/rejected": -0.9548021554946899, + "logps/chosen": -145.2889404296875, + "logps/rejected": -41.74607467651367, + "loss": 0.687, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.001345825265161693, + "rewards/margins": -0.009859085083007812, + "rewards/rejected": 0.011204910464584827, + "step": 47 + }, + { + "epoch": 0.01, + "learning_rate": 1.2972972972972974e-06, + "logits/chosen": -0.25355756282806396, + "logits/rejected": -0.25355756282806396, + "logps/chosen": -40.78858184814453, + "logps/rejected": -40.78858184814453, + "loss": 0.6949, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.010754394344985485, + "rewards/margins": 0.0, + "rewards/rejected": -0.010754394344985485, + "step": 48 + }, + { + "epoch": 0.01, + "learning_rate": 1.3243243243243246e-06, + "logits/chosen": -0.31890633702278137, + "logits/rejected": -0.3480943441390991, + "logps/chosen": -283.9082946777344, + "logps/rejected": -157.343017578125, + "loss": 0.7378, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03946838527917862, + "rewards/margins": -0.12347260117530823, + "rewards/rejected": 0.08400421589612961, + "step": 49 + }, + { + "epoch": 0.01, + "learning_rate": 1.3513513513513515e-06, + "logits/chosen": -0.18487414717674255, + "logits/rejected": -0.140823632478714, + "logps/chosen": -138.37689208984375, + "logps/rejected": -85.52824401855469, + "loss": 0.6973, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.05979004129767418, + "rewards/margins": -0.07041779160499573, + "rewards/rejected": 0.01062774658203125, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 1.3783783783783786e-06, + "logits/chosen": -0.3383142352104187, + "logits/rejected": -0.3383142352104187, + "logps/chosen": -95.27639770507812, + "logps/rejected": -95.27639770507812, + "loss": 0.6848, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03266601637005806, + "rewards/margins": 0.0, + "rewards/rejected": -0.03266601637005806, + "step": 51 + }, + { + "epoch": 0.01, + "learning_rate": 1.4054054054054056e-06, + "logits/chosen": -0.3255666196346283, + "logits/rejected": -0.31641215085983276, + "logps/chosen": -120.01494598388672, + "logps/rejected": -148.6154022216797, + "loss": 0.7094, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.007149505894631147, + "rewards/margins": -0.08187179267406464, + "rewards/rejected": 0.0747222900390625, + "step": 52 + }, + { + "epoch": 0.01, + "learning_rate": 1.4324324324324327e-06, + "logits/chosen": -0.2859475910663605, + "logits/rejected": -0.2910458743572235, + "logps/chosen": -70.1623306274414, + "logps/rejected": -79.33369445800781, + "loss": 0.6918, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0010688782203942537, + "rewards/margins": -0.05813904106616974, + "rewards/rejected": 0.057070162147283554, + "step": 53 + }, + { + "epoch": 0.01, + "learning_rate": 1.4594594594594596e-06, + "logits/chosen": -0.4416763186454773, + "logits/rejected": -0.4174061119556427, + "logps/chosen": -182.4401092529297, + "logps/rejected": -239.3917236328125, + "loss": 0.7179, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.00702667236328125, + "rewards/margins": -0.04381256178021431, + "rewards/rejected": 0.03678588941693306, + "step": 54 + }, + { + "epoch": 0.01, + "learning_rate": 1.4864864864864868e-06, + "logits/chosen": -0.24622806906700134, + "logits/rejected": -0.24622806906700134, + "logps/chosen": -20.854768753051758, + "logps/rejected": -20.854768753051758, + "loss": 0.6951, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.008303642272949219, + "rewards/margins": 0.0, + "rewards/rejected": -0.008303642272949219, + "step": 55 + }, + { + "epoch": 0.01, + "learning_rate": 1.5135135135135137e-06, + "logits/chosen": -0.4491868317127228, + "logits/rejected": -0.458051472902298, + "logps/chosen": -160.12835693359375, + "logps/rejected": -138.70587158203125, + "loss": 0.6885, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002227783203125, + "rewards/margins": 0.023193359375, + "rewards/rejected": -0.025421142578125, + "step": 56 + }, + { + "epoch": 0.01, + "learning_rate": 1.5405405405405409e-06, + "logits/chosen": -0.21903136372566223, + "logits/rejected": -0.24253438413143158, + "logps/chosen": -127.47550201416016, + "logps/rejected": -90.58346557617188, + "loss": 0.7085, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.010280609130859375, + "rewards/margins": -0.06963805854320526, + "rewards/rejected": 0.059357453137636185, + "step": 57 + }, + { + "epoch": 0.01, + "learning_rate": 1.5675675675675678e-06, + "logits/chosen": -0.44356873631477356, + "logits/rejected": -0.3873465061187744, + "logps/chosen": -281.4527587890625, + "logps/rejected": -217.2810821533203, + "loss": 0.677, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0639495849609375, + "rewards/margins": -0.06084137037396431, + "rewards/rejected": -0.0031082152854651213, + "step": 58 + }, + { + "epoch": 0.01, + "learning_rate": 1.5945945945945947e-06, + "logits/chosen": -0.6618502736091614, + "logits/rejected": -0.648360550403595, + "logps/chosen": -83.97450256347656, + "logps/rejected": -206.72714233398438, + "loss": 0.7079, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.02227630652487278, + "rewards/margins": -0.05517730861902237, + "rewards/rejected": 0.03290100023150444, + "step": 59 + }, + { + "epoch": 0.01, + "learning_rate": 1.6216216216216219e-06, + "logits/chosen": -0.5040682554244995, + "logits/rejected": -0.3862899839878082, + "logps/chosen": -160.7141876220703, + "logps/rejected": -74.84893798828125, + "loss": 0.7155, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.01665496826171875, + "rewards/margins": -0.01568145677447319, + "rewards/rejected": 0.03233642503619194, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 1.6486486486486488e-06, + "logits/chosen": -0.2673022747039795, + "logits/rejected": -0.2688683569431305, + "logps/chosen": -70.01580810546875, + "logps/rejected": -89.62174987792969, + "loss": 0.7131, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.012400055304169655, + "rewards/margins": -0.009644318372011185, + "rewards/rejected": -0.0027557373978197575, + "step": 61 + }, + { + "epoch": 0.01, + "learning_rate": 1.675675675675676e-06, + "logits/chosen": -0.41226422786712646, + "logits/rejected": -0.4107739329338074, + "logps/chosen": -72.17119598388672, + "logps/rejected": -76.52075958251953, + "loss": 0.7066, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.050406645983457565, + "rewards/margins": 0.02246703952550888, + "rewards/rejected": 0.027939606457948685, + "step": 62 + }, + { + "epoch": 0.01, + "learning_rate": 1.7027027027027028e-06, + "logits/chosen": -0.6915644407272339, + "logits/rejected": -0.6376903653144836, + "logps/chosen": -71.29608154296875, + "logps/rejected": -94.66590881347656, + "loss": 0.6834, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.005260467529296875, + "rewards/margins": -0.00617218017578125, + "rewards/rejected": 0.011432647705078125, + "step": 63 + }, + { + "epoch": 0.01, + "learning_rate": 1.72972972972973e-06, + "logits/chosen": -0.0606398731470108, + "logits/rejected": -0.06146256998181343, + "logps/chosen": -9.56106185913086, + "logps/rejected": -3.8328018188476562, + "loss": 0.7254, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.00952835101634264, + "rewards/margins": -0.013506079092621803, + "rewards/rejected": 0.003977728076279163, + "step": 64 + }, + { + "epoch": 0.01, + "learning_rate": 1.756756756756757e-06, + "logits/chosen": -0.15095262229442596, + "logits/rejected": -0.11480997502803802, + "logps/chosen": -73.8264389038086, + "logps/rejected": -48.654022216796875, + "loss": 0.6904, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.017533112317323685, + "rewards/margins": -0.007102584466338158, + "rewards/rejected": 0.024635696783661842, + "step": 65 + }, + { + "epoch": 0.01, + "learning_rate": 1.783783783783784e-06, + "logits/chosen": -0.4421168863773346, + "logits/rejected": -0.37823566794395447, + "logps/chosen": -76.6505126953125, + "logps/rejected": -24.136730194091797, + "loss": 0.7007, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.02334289625287056, + "rewards/margins": -0.025038719177246094, + "rewards/rejected": 0.0016958237392827868, + "step": 66 + }, + { + "epoch": 0.01, + "learning_rate": 1.810810810810811e-06, + "logits/chosen": -0.6993662714958191, + "logits/rejected": -0.7401276230812073, + "logps/chosen": -190.11849975585938, + "logps/rejected": -42.372474670410156, + "loss": 0.6663, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.061798095703125, + "rewards/margins": 0.07329674065113068, + "rewards/rejected": -0.011498642154037952, + "step": 67 + }, + { + "epoch": 0.01, + "learning_rate": 1.8378378378378381e-06, + "logits/chosen": -0.11998128890991211, + "logits/rejected": -0.11998128890991211, + "logps/chosen": -10.056513786315918, + "logps/rejected": -10.056513786315918, + "loss": 0.7078, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.001676845597103238, + "rewards/margins": 0.0, + "rewards/rejected": -0.001676845597103238, + "step": 68 + }, + { + "epoch": 0.01, + "learning_rate": 1.864864864864865e-06, + "logits/chosen": -0.4507473111152649, + "logits/rejected": -0.4507473111152649, + "logps/chosen": -33.57309341430664, + "logps/rejected": -33.57309341430664, + "loss": 0.6987, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.022124862298369408, + "rewards/margins": 0.0, + "rewards/rejected": 0.022124862298369408, + "step": 69 + }, + { + "epoch": 0.01, + "learning_rate": 1.8918918918918922e-06, + "logits/chosen": -0.7004619240760803, + "logits/rejected": -0.6718403697013855, + "logps/chosen": -125.79046630859375, + "logps/rejected": -266.2652282714844, + "loss": 0.7132, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.06360778957605362, + "rewards/margins": -0.06918640434741974, + "rewards/rejected": 0.0055786133743822575, + "step": 70 + }, + { + "epoch": 0.01, + "learning_rate": 1.918918918918919e-06, + "logits/chosen": -0.26785990595817566, + "logits/rejected": -0.27730756998062134, + "logps/chosen": -99.31166076660156, + "logps/rejected": -94.46578216552734, + "loss": 0.6519, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06304168701171875, + "rewards/margins": 0.044065095484256744, + "rewards/rejected": 0.018976593390107155, + "step": 71 + }, + { + "epoch": 0.01, + "learning_rate": 1.945945945945946e-06, + "logits/chosen": -0.47227707505226135, + "logits/rejected": -0.4628415107727051, + "logps/chosen": -225.20803833007812, + "logps/rejected": -88.40789794921875, + "loss": 0.6968, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.003736877581104636, + "rewards/margins": -0.01261596754193306, + "rewards/rejected": 0.01635284535586834, + "step": 72 + }, + { + "epoch": 0.01, + "learning_rate": 1.9729729729729734e-06, + "logits/chosen": -0.23662738502025604, + "logits/rejected": -0.21250799298286438, + "logps/chosen": -81.14623260498047, + "logps/rejected": -160.79014587402344, + "loss": 0.6979, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.01547012384980917, + "rewards/margins": -0.024504853412508965, + "rewards/rejected": 0.009034729562699795, + "step": 73 + }, + { + "epoch": 0.01, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": 0.03662382438778877, + "logits/rejected": 0.032820604741573334, + "logps/chosen": -6.1003594398498535, + "logps/rejected": -4.745105743408203, + "loss": 0.6902, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.001505994820035994, + "rewards/margins": -0.011165952309966087, + "rewards/rejected": 0.012671947479248047, + "step": 74 + }, + { + "epoch": 0.01, + "learning_rate": 2.0270270270270273e-06, + "logits/chosen": -0.4322873651981354, + "logits/rejected": -0.43230578303337097, + "logps/chosen": -274.9582824707031, + "logps/rejected": -269.363525390625, + "loss": 0.6758, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0270538330078125, + "rewards/margins": 0.028106689453125, + "rewards/rejected": -0.0551605224609375, + "step": 75 + }, + { + "epoch": 0.01, + "learning_rate": 2.054054054054054e-06, + "logits/chosen": -0.24844281375408173, + "logits/rejected": -0.2430851012468338, + "logps/chosen": -24.240278244018555, + "logps/rejected": -27.460350036621094, + "loss": 0.6893, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.028202248737215996, + "rewards/margins": 0.016660120338201523, + "rewards/rejected": 0.011542129330337048, + "step": 76 + }, + { + "epoch": 0.01, + "learning_rate": 2.0810810810810815e-06, + "logits/chosen": -0.20377691090106964, + "logits/rejected": -0.19364161789417267, + "logps/chosen": -63.98418045043945, + "logps/rejected": -110.26387786865234, + "loss": 0.6965, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.05851097032427788, + "rewards/margins": -0.044530872255563736, + "rewards/rejected": 0.10304184257984161, + "step": 77 + }, + { + "epoch": 0.01, + "learning_rate": 2.1081081081081085e-06, + "logits/chosen": -0.5511452555656433, + "logits/rejected": -0.5735107660293579, + "logps/chosen": -116.7999267578125, + "logps/rejected": -23.535789489746094, + "loss": 0.6963, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.008586120791733265, + "rewards/margins": -0.016031455248594284, + "rewards/rejected": 0.007445335388183594, + "step": 78 + }, + { + "epoch": 0.01, + "learning_rate": 2.1351351351351354e-06, + "logits/chosen": -0.6369108557701111, + "logits/rejected": -0.4312668442726135, + "logps/chosen": -81.40458679199219, + "logps/rejected": -119.0076904296875, + "loss": 0.703, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0025268555618822575, + "rewards/margins": 0.02286529541015625, + "rewards/rejected": -0.02033844031393528, + "step": 79 + }, + { + "epoch": 0.01, + "learning_rate": 2.1621621621621623e-06, + "logits/chosen": -0.7878825068473816, + "logits/rejected": -0.7015551924705505, + "logps/chosen": -139.30020141601562, + "logps/rejected": -136.9185028076172, + "loss": 0.6932, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0031234740745276213, + "rewards/margins": -0.01567993126809597, + "rewards/rejected": 0.01255645789206028, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 2.1891891891891897e-06, + "logits/chosen": -0.7776329517364502, + "logits/rejected": -0.7736074328422546, + "logps/chosen": -100.19271850585938, + "logps/rejected": -22.697322845458984, + "loss": 0.6863, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.01601104810833931, + "rewards/margins": -0.03013134002685547, + "rewards/rejected": 0.014120292849838734, + "step": 81 + }, + { + "epoch": 0.01, + "learning_rate": 2.2162162162162166e-06, + "logits/chosen": -0.3052758574485779, + "logits/rejected": -0.3052758574485779, + "logps/chosen": -201.33554077148438, + "logps/rejected": -201.33554077148438, + "loss": 0.7177, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.03862304612994194, + "rewards/margins": 0.0, + "rewards/rejected": 0.03862304612994194, + "step": 82 + }, + { + "epoch": 0.01, + "learning_rate": 2.2432432432432435e-06, + "logits/chosen": -0.4445509910583496, + "logits/rejected": -0.4396745562553406, + "logps/chosen": -65.57904052734375, + "logps/rejected": -24.48224639892578, + "loss": 0.7003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.025223542004823685, + "rewards/margins": 0.0477357879281044, + "rewards/rejected": -0.022512245923280716, + "step": 83 + }, + { + "epoch": 0.01, + "learning_rate": 2.2702702702702705e-06, + "logits/chosen": -0.3245055377483368, + "logits/rejected": -0.3350675404071808, + "logps/chosen": -172.28411865234375, + "logps/rejected": -116.78533935546875, + "loss": 0.6772, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04752654954791069, + "rewards/margins": 0.064916230738163, + "rewards/rejected": -0.017389679327607155, + "step": 84 + }, + { + "epoch": 0.01, + "learning_rate": 2.297297297297298e-06, + "logits/chosen": -0.23778805136680603, + "logits/rejected": -0.23073835670948029, + "logps/chosen": -15.139572143554688, + "logps/rejected": -14.69031047821045, + "loss": 0.6815, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.013981247320771217, + "rewards/margins": 0.013554668985307217, + "rewards/rejected": 0.0004265785391908139, + "step": 85 + }, + { + "epoch": 0.01, + "learning_rate": 2.3243243243243247e-06, + "logits/chosen": -0.11168992519378662, + "logits/rejected": 0.03564317524433136, + "logps/chosen": -90.3279037475586, + "logps/rejected": -93.74842071533203, + "loss": 0.6967, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.033928681164979935, + "rewards/margins": -0.015267182141542435, + "rewards/rejected": 0.04919586330652237, + "step": 86 + }, + { + "epoch": 0.01, + "learning_rate": 2.3513513513513517e-06, + "logits/chosen": -0.08548595756292343, + "logits/rejected": 0.029392464086413383, + "logps/chosen": -112.23123168945312, + "logps/rejected": -18.767974853515625, + "loss": 0.7221, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.020590974017977715, + "rewards/margins": -0.00938186701387167, + "rewards/rejected": -0.011209107004106045, + "step": 87 + }, + { + "epoch": 0.01, + "learning_rate": 2.3783783783783786e-06, + "logits/chosen": -0.18220672011375427, + "logits/rejected": -0.11176535487174988, + "logps/chosen": -83.85472106933594, + "logps/rejected": -19.094385147094727, + "loss": 0.6732, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.055867768824100494, + "rewards/margins": 0.02437419816851616, + "rewards/rejected": 0.031493570655584335, + "step": 88 + }, + { + "epoch": 0.01, + "learning_rate": 2.4054054054054055e-06, + "logits/chosen": -0.7204433679580688, + "logits/rejected": -0.7041651606559753, + "logps/chosen": -125.742919921875, + "logps/rejected": -47.488059997558594, + "loss": 0.6688, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.021323395892977715, + "rewards/margins": -0.038892365992069244, + "rewards/rejected": 0.01756897009909153, + "step": 89 + }, + { + "epoch": 0.01, + "learning_rate": 2.432432432432433e-06, + "logits/chosen": -0.6219102144241333, + "logits/rejected": -0.6087803244590759, + "logps/chosen": -95.05422973632812, + "logps/rejected": -90.78324127197266, + "loss": 0.6622, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.013697815127670765, + "rewards/margins": 0.059175871312618256, + "rewards/rejected": -0.045478057116270065, + "step": 90 + }, + { + "epoch": 0.01, + "learning_rate": 2.45945945945946e-06, + "logits/chosen": -0.5646635293960571, + "logits/rejected": -0.5647862553596497, + "logps/chosen": -123.26982116699219, + "logps/rejected": -146.5943603515625, + "loss": 0.6733, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01354751642793417, + "rewards/margins": 0.00739364605396986, + "rewards/rejected": -0.02094116248190403, + "step": 91 + }, + { + "epoch": 0.01, + "learning_rate": 2.4864864864864867e-06, + "logits/chosen": 0.19721351563930511, + "logits/rejected": 0.17481629550457, + "logps/chosen": -4.898685455322266, + "logps/rejected": -81.82405090332031, + "loss": 0.683, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.009389400482177734, + "rewards/margins": 0.025450801476836205, + "rewards/rejected": -0.01606140099465847, + "step": 92 + }, + { + "epoch": 0.02, + "learning_rate": 2.5135135135135137e-06, + "logits/chosen": -0.07465352863073349, + "logits/rejected": -0.27713289856910706, + "logps/chosen": -106.40739440917969, + "logps/rejected": -108.93954467773438, + "loss": 0.7239, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0021560669410973787, + "rewards/margins": -0.039969637989997864, + "rewards/rejected": 0.037813570350408554, + "step": 93 + }, + { + "epoch": 0.02, + "learning_rate": 2.540540540540541e-06, + "logits/chosen": -0.12471901625394821, + "logits/rejected": -0.11145435273647308, + "logps/chosen": -74.40995025634766, + "logps/rejected": -117.79893493652344, + "loss": 0.7167, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.042499542236328125, + "rewards/margins": 0.01040344312787056, + "rewards/rejected": 0.032096099108457565, + "step": 94 + }, + { + "epoch": 0.02, + "learning_rate": 2.5675675675675675e-06, + "logits/chosen": -0.4100254476070404, + "logits/rejected": -0.43711620569229126, + "logps/chosen": -71.68355560302734, + "logps/rejected": -75.31361389160156, + "loss": 0.7026, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08287658542394638, + "rewards/margins": 0.02464446797966957, + "rewards/rejected": 0.05823211744427681, + "step": 95 + }, + { + "epoch": 0.02, + "learning_rate": 2.594594594594595e-06, + "logits/chosen": -0.3587365746498108, + "logits/rejected": -0.3163298964500427, + "logps/chosen": -196.40579223632812, + "logps/rejected": -223.15611267089844, + "loss": 0.689, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16533203423023224, + "rewards/margins": 0.052947998046875, + "rewards/rejected": 0.11238403618335724, + "step": 96 + }, + { + "epoch": 0.02, + "learning_rate": 2.621621621621622e-06, + "logits/chosen": -0.4132317304611206, + "logits/rejected": -0.3881774842739105, + "logps/chosen": -199.70010375976562, + "logps/rejected": -24.80718994140625, + "loss": 0.6653, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09973907470703125, + "rewards/margins": 0.07669182121753693, + "rewards/rejected": 0.023047257214784622, + "step": 97 + }, + { + "epoch": 0.02, + "learning_rate": 2.648648648648649e-06, + "logits/chosen": -0.14170585572719574, + "logits/rejected": -0.14557503163814545, + "logps/chosen": -111.9797592163086, + "logps/rejected": -99.42010498046875, + "loss": 0.7057, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.012127685360610485, + "rewards/margins": -0.0053810132667422295, + "rewards/rejected": 0.017508698627352715, + "step": 98 + }, + { + "epoch": 0.02, + "learning_rate": 2.6756756756756757e-06, + "logits/chosen": -0.2432371824979782, + "logits/rejected": -0.30796775221824646, + "logps/chosen": -75.31282043457031, + "logps/rejected": -195.74563598632812, + "loss": 0.7302, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0012947082286700606, + "rewards/margins": -0.053160857409238815, + "rewards/rejected": 0.05445556715130806, + "step": 99 + }, + { + "epoch": 0.02, + "learning_rate": 2.702702702702703e-06, + "logits/chosen": -0.33195069432258606, + "logits/rejected": -0.3139808475971222, + "logps/chosen": -229.63528442382812, + "logps/rejected": -90.03630828857422, + "loss": 0.6529, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1676177978515625, + "rewards/margins": 0.1446235626935959, + "rewards/rejected": 0.022994233295321465, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 2.72972972972973e-06, + "logits/chosen": -0.5748655796051025, + "logits/rejected": -0.6324576735496521, + "logps/chosen": -75.79373168945312, + "logps/rejected": -26.014638900756836, + "loss": 0.7137, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.006480407901108265, + "rewards/margins": 0.009002495557069778, + "rewards/rejected": -0.002522087190300226, + "step": 101 + }, + { + "epoch": 0.02, + "learning_rate": 2.7567567567567573e-06, + "logits/chosen": -0.42940181493759155, + "logits/rejected": -0.4093528687953949, + "logps/chosen": -101.52409362792969, + "logps/rejected": -103.61676025390625, + "loss": 0.7127, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.05197906494140625, + "rewards/margins": -0.04349822923541069, + "rewards/rejected": -0.008480834774672985, + "step": 102 + }, + { + "epoch": 0.02, + "learning_rate": 2.783783783783784e-06, + "logits/chosen": -0.10759754478931427, + "logits/rejected": -0.10759754478931427, + "logps/chosen": -160.2021484375, + "logps/rejected": -160.2021484375, + "loss": 0.715, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.11712341755628586, + "rewards/margins": 0.0, + "rewards/rejected": 0.11712341755628586, + "step": 103 + }, + { + "epoch": 0.02, + "learning_rate": 2.810810810810811e-06, + "logits/chosen": -0.1329001933336258, + "logits/rejected": -0.10199643671512604, + "logps/chosen": -63.741233825683594, + "logps/rejected": -63.55295944213867, + "loss": 0.6916, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10105438530445099, + "rewards/margins": 0.11574287712574005, + "rewards/rejected": -0.014688491821289062, + "step": 104 + }, + { + "epoch": 0.02, + "learning_rate": 2.837837837837838e-06, + "logits/chosen": -0.35306012630462646, + "logits/rejected": -0.3267625868320465, + "logps/chosen": -72.76805114746094, + "logps/rejected": -73.55684661865234, + "loss": 0.6524, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07141876220703125, + "rewards/margins": 0.035724639892578125, + "rewards/rejected": 0.035694122314453125, + "step": 105 + }, + { + "epoch": 0.02, + "learning_rate": 2.8648648648648654e-06, + "logits/chosen": -0.3201744556427002, + "logits/rejected": -0.32948702573776245, + "logps/chosen": -76.79370880126953, + "logps/rejected": -171.97021484375, + "loss": 0.7182, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.056960295885801315, + "rewards/margins": -0.056693267077207565, + "rewards/rejected": 0.11365356296300888, + "step": 106 + }, + { + "epoch": 0.02, + "learning_rate": 2.891891891891892e-06, + "logits/chosen": -0.4650912582874298, + "logits/rejected": -0.4396916925907135, + "logps/chosen": -74.64031982421875, + "logps/rejected": -118.55686950683594, + "loss": 0.7304, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07407074421644211, + "rewards/margins": 0.02668609842658043, + "rewards/rejected": 0.04738464578986168, + "step": 107 + }, + { + "epoch": 0.02, + "learning_rate": 2.9189189189189193e-06, + "logits/chosen": -0.3541102409362793, + "logits/rejected": -0.3493324816226959, + "logps/chosen": -110.75398254394531, + "logps/rejected": -124.63422393798828, + "loss": 0.6874, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06546936184167862, + "rewards/margins": 0.0015937834978103638, + "rewards/rejected": 0.06387557834386826, + "step": 108 + }, + { + "epoch": 0.02, + "learning_rate": 2.9459459459459462e-06, + "logits/chosen": -0.5941921472549438, + "logits/rejected": -0.5859269499778748, + "logps/chosen": -120.774658203125, + "logps/rejected": -164.85049438476562, + "loss": 0.7097, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0071426392532885075, + "rewards/margins": -0.06043548882007599, + "rewards/rejected": 0.06757812947034836, + "step": 109 + }, + { + "epoch": 0.02, + "learning_rate": 2.9729729729729736e-06, + "logits/chosen": -0.6249080300331116, + "logits/rejected": -0.6227817535400391, + "logps/chosen": -143.19712829589844, + "logps/rejected": -155.3424530029297, + "loss": 0.6929, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.10592193901538849, + "rewards/margins": -0.06345215439796448, + "rewards/rejected": 0.16937409341335297, + "step": 110 + }, + { + "epoch": 0.02, + "learning_rate": 3e-06, + "logits/chosen": -0.2902931272983551, + "logits/rejected": -0.2694794833660126, + "logps/chosen": -210.9807891845703, + "logps/rejected": -206.79049682617188, + "loss": 0.6954, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.11599426716566086, + "rewards/margins": -0.15805205702781677, + "rewards/rejected": 0.2740463316440582, + "step": 111 + }, + { + "epoch": 0.02, + "learning_rate": 3.0270270270270274e-06, + "logits/chosen": -0.414005309343338, + "logits/rejected": -0.4305447041988373, + "logps/chosen": -110.31449890136719, + "logps/rejected": -99.75860595703125, + "loss": 0.6782, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.09730072319507599, + "rewards/margins": -0.051239773631095886, + "rewards/rejected": 0.14854049682617188, + "step": 112 + }, + { + "epoch": 0.02, + "learning_rate": 3.0540540540540544e-06, + "logits/chosen": -0.1792457550764084, + "logits/rejected": -0.1681881994009018, + "logps/chosen": -47.2121467590332, + "logps/rejected": -46.007747650146484, + "loss": 0.6584, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0523834228515625, + "rewards/margins": -0.04464530944824219, + "rewards/rejected": 0.09702873229980469, + "step": 113 + }, + { + "epoch": 0.02, + "learning_rate": 3.0810810810810817e-06, + "logits/chosen": -0.3705331087112427, + "logits/rejected": -0.35561829805374146, + "logps/chosen": -135.25924682617188, + "logps/rejected": -28.344324111938477, + "loss": 0.6573, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10709228366613388, + "rewards/margins": 0.0837518647313118, + "rewards/rejected": 0.023340417072176933, + "step": 114 + }, + { + "epoch": 0.02, + "learning_rate": 3.1081081081081082e-06, + "logits/chosen": -0.4832049012184143, + "logits/rejected": -0.4531274139881134, + "logps/chosen": -99.94479370117188, + "logps/rejected": -123.3532485961914, + "loss": 0.685, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02857208251953125, + "rewards/margins": 0.0024078357964754105, + "rewards/rejected": 0.02616424672305584, + "step": 115 + }, + { + "epoch": 0.02, + "learning_rate": 3.1351351351351356e-06, + "logits/chosen": -0.44751399755477905, + "logits/rejected": -0.32192304730415344, + "logps/chosen": -161.10858154296875, + "logps/rejected": -21.911151885986328, + "loss": 0.7112, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.010018921457231045, + "rewards/margins": -0.011730575002729893, + "rewards/rejected": 0.021749496459960938, + "step": 116 + }, + { + "epoch": 0.02, + "learning_rate": 3.1621621621621625e-06, + "logits/chosen": -0.4869598150253296, + "logits/rejected": -0.4875982999801636, + "logps/chosen": -83.10441589355469, + "logps/rejected": -111.80570983886719, + "loss": 0.6318, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23198242485523224, + "rewards/margins": 0.11691894382238388, + "rewards/rejected": 0.11506348103284836, + "step": 117 + }, + { + "epoch": 0.02, + "learning_rate": 3.1891891891891894e-06, + "logits/chosen": -0.7002464532852173, + "logits/rejected": -0.6426994800567627, + "logps/chosen": -111.58619689941406, + "logps/rejected": -108.01637268066406, + "loss": 0.7007, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07104950398206711, + "rewards/margins": 0.025433354079723358, + "rewards/rejected": 0.04561614990234375, + "step": 118 + }, + { + "epoch": 0.02, + "learning_rate": 3.2162162162162164e-06, + "logits/chosen": -0.5688847899436951, + "logits/rejected": -0.5109351277351379, + "logps/chosen": -52.97403335571289, + "logps/rejected": -21.91141700744629, + "loss": 0.727, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.046834565699100494, + "rewards/margins": -0.062328532338142395, + "rewards/rejected": 0.015493965707719326, + "step": 119 + }, + { + "epoch": 0.02, + "learning_rate": 3.2432432432432437e-06, + "logits/chosen": -0.22468356788158417, + "logits/rejected": -0.17295438051223755, + "logps/chosen": -84.05821228027344, + "logps/rejected": -140.256591796875, + "loss": 0.7079, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.11628570407629013, + "rewards/margins": -0.15796661376953125, + "rewards/rejected": 0.274252325296402, + "step": 120 + }, + { + "epoch": 0.02, + "learning_rate": 3.2702702702702706e-06, + "logits/chosen": -0.6394220590591431, + "logits/rejected": -0.5986382961273193, + "logps/chosen": -146.19532775878906, + "logps/rejected": -80.96549987792969, + "loss": 0.6451, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3059372007846832, + "rewards/margins": 0.180267333984375, + "rewards/rejected": 0.12566986680030823, + "step": 121 + }, + { + "epoch": 0.02, + "learning_rate": 3.2972972972972976e-06, + "logits/chosen": -0.45012983679771423, + "logits/rejected": -0.3929145336151123, + "logps/chosen": -70.29680633544922, + "logps/rejected": -94.4585952758789, + "loss": 0.6783, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15031051635742188, + "rewards/margins": 0.035369873046875, + "rewards/rejected": 0.11494064331054688, + "step": 122 + }, + { + "epoch": 0.02, + "learning_rate": 3.3243243243243245e-06, + "logits/chosen": -0.4278462529182434, + "logits/rejected": -0.42376741766929626, + "logps/chosen": -49.40119934082031, + "logps/rejected": -41.13712692260742, + "loss": 0.6746, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.11077880859375, + "rewards/margins": -0.007896803319454193, + "rewards/rejected": 0.1186756119132042, + "step": 123 + }, + { + "epoch": 0.02, + "learning_rate": 3.351351351351352e-06, + "logits/chosen": -0.43535494804382324, + "logits/rejected": -0.46570315957069397, + "logps/chosen": -180.2056121826172, + "logps/rejected": -104.66813659667969, + "loss": 0.6911, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3000503480434418, + "rewards/margins": 0.07030944526195526, + "rewards/rejected": 0.2297409027814865, + "step": 124 + }, + { + "epoch": 0.02, + "learning_rate": 3.3783783783783788e-06, + "logits/chosen": -0.7789160013198853, + "logits/rejected": -0.8407235145568848, + "logps/chosen": -106.45538330078125, + "logps/rejected": -64.75818634033203, + "loss": 0.695, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10487212985754013, + "rewards/margins": 0.07381362468004227, + "rewards/rejected": 0.031058503314852715, + "step": 125 + }, + { + "epoch": 0.02, + "learning_rate": 3.4054054054054057e-06, + "logits/chosen": -0.43852487206459045, + "logits/rejected": -0.43852487206459045, + "logps/chosen": -86.43247985839844, + "logps/rejected": -86.43247985839844, + "loss": 0.7009, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3925064206123352, + "rewards/margins": 0.0, + "rewards/rejected": 0.3925064206123352, + "step": 126 + }, + { + "epoch": 0.02, + "learning_rate": 3.4324324324324326e-06, + "logits/chosen": -0.5254110097885132, + "logits/rejected": -0.9034656286239624, + "logps/chosen": -55.85532760620117, + "logps/rejected": -54.094329833984375, + "loss": 0.6443, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.031790923327207565, + "rewards/margins": -0.031528472900390625, + "rewards/rejected": -0.0002624511835165322, + "step": 127 + }, + { + "epoch": 0.02, + "learning_rate": 3.45945945945946e-06, + "logits/chosen": -0.5776910781860352, + "logits/rejected": -0.6462506055831909, + "logps/chosen": -261.03717041015625, + "logps/rejected": -95.26034545898438, + "loss": 0.6348, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19999085366725922, + "rewards/margins": 0.035745248198509216, + "rewards/rejected": 0.16424560546875, + "step": 128 + }, + { + "epoch": 0.02, + "learning_rate": 3.4864864864864865e-06, + "logits/chosen": -0.3982071280479431, + "logits/rejected": -0.3963896334171295, + "logps/chosen": -8.085589408874512, + "logps/rejected": -9.270894050598145, + "loss": 0.6547, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04168577119708061, + "rewards/margins": 0.016861341893672943, + "rewards/rejected": 0.02482442930340767, + "step": 129 + }, + { + "epoch": 0.02, + "learning_rate": 3.513513513513514e-06, + "logits/chosen": -0.1423712819814682, + "logits/rejected": -0.16885727643966675, + "logps/chosen": -142.3251953125, + "logps/rejected": -130.91639709472656, + "loss": 0.7427, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.370657354593277, + "rewards/margins": -0.10573729872703552, + "rewards/rejected": 0.4763946533203125, + "step": 130 + }, + { + "epoch": 0.02, + "learning_rate": 3.5405405405405408e-06, + "logits/chosen": -0.5765222907066345, + "logits/rejected": -0.6160666942596436, + "logps/chosen": -259.1087341308594, + "logps/rejected": -214.40475463867188, + "loss": 0.7284, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.32189637422561646, + "rewards/margins": -0.21131592988967896, + "rewards/rejected": 0.5332123041152954, + "step": 131 + }, + { + "epoch": 0.02, + "learning_rate": 3.567567567567568e-06, + "logits/chosen": -0.8681287169456482, + "logits/rejected": -0.8249931335449219, + "logps/chosen": -188.21218872070312, + "logps/rejected": -127.45892333984375, + "loss": 0.7288, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13236236572265625, + "rewards/margins": 0.06430816650390625, + "rewards/rejected": 0.06805419921875, + "step": 132 + }, + { + "epoch": 0.02, + "learning_rate": 3.5945945945945946e-06, + "logits/chosen": -0.5207412838935852, + "logits/rejected": -0.6421088576316833, + "logps/chosen": -174.09063720703125, + "logps/rejected": -79.69392395019531, + "loss": 0.7236, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.24029235541820526, + "rewards/margins": -0.048960134387016296, + "rewards/rejected": 0.28925248980522156, + "step": 133 + }, + { + "epoch": 0.02, + "learning_rate": 3.621621621621622e-06, + "logits/chosen": -0.8028150796890259, + "logits/rejected": -0.8186520338058472, + "logps/chosen": -86.19805908203125, + "logps/rejected": -90.125732421875, + "loss": 0.5845, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1652984619140625, + "rewards/margins": 0.001613616943359375, + "rewards/rejected": 0.16368484497070312, + "step": 134 + }, + { + "epoch": 0.02, + "learning_rate": 3.648648648648649e-06, + "logits/chosen": -0.20325087010860443, + "logits/rejected": -0.20325087010860443, + "logps/chosen": -56.607139587402344, + "logps/rejected": -56.607139587402344, + "loss": 0.8412, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0037220001686364412, + "rewards/margins": 0.0, + "rewards/rejected": -0.0037220001686364412, + "step": 135 + }, + { + "epoch": 0.02, + "learning_rate": 3.6756756756756763e-06, + "logits/chosen": -0.14370572566986084, + "logits/rejected": -0.14337722957134247, + "logps/chosen": -6.1251020431518555, + "logps/rejected": -8.173934936523438, + "loss": 0.7204, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.00868897419422865, + "rewards/margins": -0.02724938467144966, + "rewards/rejected": 0.035938359797000885, + "step": 136 + }, + { + "epoch": 0.02, + "learning_rate": 3.7027027027027028e-06, + "logits/chosen": -0.39413389563560486, + "logits/rejected": -0.39017483592033386, + "logps/chosen": -91.27951049804688, + "logps/rejected": -178.72604370117188, + "loss": 0.6356, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.03695220872759819, + "rewards/margins": -0.14367827773094177, + "rewards/rejected": 0.18063049018383026, + "step": 137 + }, + { + "epoch": 0.02, + "learning_rate": 3.72972972972973e-06, + "logits/chosen": -0.11732465773820877, + "logits/rejected": -0.08171546459197998, + "logps/chosen": -111.16372680664062, + "logps/rejected": -41.00941848754883, + "loss": 0.608, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0036178589798510075, + "rewards/margins": -0.012531280517578125, + "rewards/rejected": 0.008913422003388405, + "step": 138 + }, + { + "epoch": 0.02, + "learning_rate": 3.756756756756757e-06, + "logits/chosen": -0.48076149821281433, + "logits/rejected": -0.49718281626701355, + "logps/chosen": -64.965576171875, + "logps/rejected": -23.471662521362305, + "loss": 0.6173, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2137306183576584, + "rewards/margins": 0.08537024259567261, + "rewards/rejected": 0.12836037576198578, + "step": 139 + }, + { + "epoch": 0.02, + "learning_rate": 3.7837837837837844e-06, + "logits/chosen": -0.5931867957115173, + "logits/rejected": -0.534203052520752, + "logps/chosen": -218.2240753173828, + "logps/rejected": -66.82318878173828, + "loss": 0.6116, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2989944517612457, + "rewards/margins": 0.017102062702178955, + "rewards/rejected": 0.2818923890590668, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 3.810810810810811e-06, + "logits/chosen": -0.23604530096054077, + "logits/rejected": -0.18047088384628296, + "logps/chosen": -83.79810333251953, + "logps/rejected": -66.38246154785156, + "loss": 0.7055, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5049721002578735, + "rewards/margins": 0.20592346787452698, + "rewards/rejected": 0.29904863238334656, + "step": 141 + }, + { + "epoch": 0.02, + "learning_rate": 3.837837837837838e-06, + "logits/chosen": -0.6938999891281128, + "logits/rejected": -0.8542177677154541, + "logps/chosen": -229.1028289794922, + "logps/rejected": -25.11754035949707, + "loss": 0.5025, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2989700436592102, + "rewards/margins": 0.2895103693008423, + "rewards/rejected": 0.00945968646556139, + "step": 142 + }, + { + "epoch": 0.02, + "learning_rate": 3.864864864864865e-06, + "logits/chosen": -0.48644429445266724, + "logits/rejected": -0.48644429445266724, + "logps/chosen": -2.7404568195343018, + "logps/rejected": -2.7404568195343018, + "loss": 0.8494, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.046611238270998, + "rewards/margins": 0.0, + "rewards/rejected": 0.046611238270998, + "step": 143 + }, + { + "epoch": 0.02, + "learning_rate": 3.891891891891892e-06, + "logits/chosen": -0.6172652244567871, + "logits/rejected": -0.5523759722709656, + "logps/chosen": -71.55520629882812, + "logps/rejected": -123.30278778076172, + "loss": 0.6333, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.20458984375, + "rewards/margins": -0.2289329469203949, + "rewards/rejected": 0.4335227906703949, + "step": 144 + }, + { + "epoch": 0.02, + "learning_rate": 3.918918918918919e-06, + "logits/chosen": -0.3668178617954254, + "logits/rejected": -0.34476107358932495, + "logps/chosen": -128.392578125, + "logps/rejected": -63.95838165283203, + "loss": 0.6368, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40890198945999146, + "rewards/margins": 0.0935111939907074, + "rewards/rejected": 0.31539079546928406, + "step": 145 + }, + { + "epoch": 0.02, + "learning_rate": 3.945945945945947e-06, + "logits/chosen": -0.4189789593219757, + "logits/rejected": -0.4189789593219757, + "logps/chosen": -65.5035400390625, + "logps/rejected": -65.5035400390625, + "loss": 0.6612, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.041466522961854935, + "rewards/margins": 0.0, + "rewards/rejected": 0.041466522961854935, + "step": 146 + }, + { + "epoch": 0.02, + "learning_rate": 3.972972972972973e-06, + "logits/chosen": -0.44074806571006775, + "logits/rejected": -0.4372875988483429, + "logps/chosen": -50.51123046875, + "logps/rejected": -34.59359359741211, + "loss": 0.686, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20237694680690765, + "rewards/margins": 0.15927964448928833, + "rewards/rejected": 0.04309730604290962, + "step": 147 + }, + { + "epoch": 0.02, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": -0.45713549852371216, + "logits/rejected": -0.4430827796459198, + "logps/chosen": -91.44034576416016, + "logps/rejected": -106.02471923828125, + "loss": 0.6064, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3032020628452301, + "rewards/margins": 0.07759857177734375, + "rewards/rejected": 0.22560349106788635, + "step": 148 + }, + { + "epoch": 0.02, + "learning_rate": 4.027027027027028e-06, + "logits/chosen": -0.266280859708786, + "logits/rejected": -0.20300045609474182, + "logps/chosen": -58.25056076049805, + "logps/rejected": -76.41596984863281, + "loss": 0.6835, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26804351806640625, + "rewards/margins": 0.1059417724609375, + "rewards/rejected": 0.16210174560546875, + "step": 149 + }, + { + "epoch": 0.02, + "learning_rate": 4.0540540540540545e-06, + "logits/chosen": -0.28063246607780457, + "logits/rejected": -0.12007176876068115, + "logps/chosen": -174.41659545898438, + "logps/rejected": -125.78324890136719, + "loss": 0.6888, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.608203113079071, + "rewards/margins": 0.056932032108306885, + "rewards/rejected": 0.5512710809707642, + "step": 150 + }, + { + "epoch": 0.02, + "learning_rate": 4.0810810810810815e-06, + "logits/chosen": -0.5286501049995422, + "logits/rejected": -0.558150053024292, + "logps/chosen": -108.0134048461914, + "logps/rejected": -221.62045288085938, + "loss": 0.729, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.01348190288990736, + "rewards/margins": -0.035437773913145065, + "rewards/rejected": 0.02195587195456028, + "step": 151 + }, + { + "epoch": 0.02, + "learning_rate": 4.108108108108108e-06, + "logits/chosen": -0.19124962389469147, + "logits/rejected": -0.19341856241226196, + "logps/chosen": -5.964284420013428, + "logps/rejected": -7.372284889221191, + "loss": 0.6598, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0063898563385009766, + "rewards/margins": -0.02582254633307457, + "rewards/rejected": 0.032212402671575546, + "step": 152 + }, + { + "epoch": 0.02, + "learning_rate": 4.135135135135135e-06, + "logits/chosen": -0.39883410930633545, + "logits/rejected": -0.39042142033576965, + "logps/chosen": -102.09837341308594, + "logps/rejected": -107.97804260253906, + "loss": 0.6375, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1370033323764801, + "rewards/margins": -0.08735732734203339, + "rewards/rejected": 0.2243606597185135, + "step": 153 + }, + { + "epoch": 0.02, + "learning_rate": 4.162162162162163e-06, + "logits/chosen": -0.1644919216632843, + "logits/rejected": -0.1065155565738678, + "logps/chosen": -54.46918869018555, + "logps/rejected": -28.764083862304688, + "loss": 0.5594, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33821144700050354, + "rewards/margins": 0.32633668184280396, + "rewards/rejected": 0.011874771676957607, + "step": 154 + }, + { + "epoch": 0.03, + "learning_rate": 4.189189189189189e-06, + "logits/chosen": -0.13164207339286804, + "logits/rejected": -0.07148265093564987, + "logps/chosen": -70.51438903808594, + "logps/rejected": -50.65663528442383, + "loss": 0.5813, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2792671322822571, + "rewards/margins": 0.1573360562324524, + "rewards/rejected": 0.12193107604980469, + "step": 155 + }, + { + "epoch": 0.03, + "learning_rate": 4.216216216216217e-06, + "logits/chosen": -0.4109238386154175, + "logits/rejected": -0.37943461537361145, + "logps/chosen": -218.996826171875, + "logps/rejected": -108.79641723632812, + "loss": 0.644, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7189834713935852, + "rewards/margins": 0.05036848783493042, + "rewards/rejected": 0.6686149835586548, + "step": 156 + }, + { + "epoch": 0.03, + "learning_rate": 4.243243243243244e-06, + "logits/chosen": -0.12278363853693008, + "logits/rejected": -0.09661532193422318, + "logps/chosen": -319.29742431640625, + "logps/rejected": -192.38812255859375, + "loss": 0.9988, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.27650758624076843, + "rewards/margins": -0.5943603515625, + "rewards/rejected": 0.870867908000946, + "step": 157 + }, + { + "epoch": 0.03, + "learning_rate": 4.270270270270271e-06, + "logits/chosen": -0.3730891942977905, + "logits/rejected": -0.30327463150024414, + "logps/chosen": -140.99615478515625, + "logps/rejected": -91.59083557128906, + "loss": 0.6891, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7495529055595398, + "rewards/margins": 0.27909544110298157, + "rewards/rejected": 0.4704574644565582, + "step": 158 + }, + { + "epoch": 0.03, + "learning_rate": 4.297297297297298e-06, + "logits/chosen": -0.3464239537715912, + "logits/rejected": -0.37349557876586914, + "logps/chosen": -63.30003356933594, + "logps/rejected": -63.26586151123047, + "loss": 0.6464, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3786941468715668, + "rewards/margins": -0.4408653676509857, + "rewards/rejected": 0.8195595145225525, + "step": 159 + }, + { + "epoch": 0.03, + "learning_rate": 4.324324324324325e-06, + "logits/chosen": -0.7226946949958801, + "logits/rejected": -0.7155953049659729, + "logps/chosen": -60.91812515258789, + "logps/rejected": -14.827774047851562, + "loss": 0.5813, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6975994110107422, + "rewards/margins": 0.5850561261177063, + "rewards/rejected": 0.11254329979419708, + "step": 160 + }, + { + "epoch": 0.03, + "learning_rate": 4.351351351351352e-06, + "logits/chosen": -0.36362436413764954, + "logits/rejected": -0.3039014935493469, + "logps/chosen": -215.9579315185547, + "logps/rejected": -89.28739166259766, + "loss": 0.478, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8396682739257812, + "rewards/margins": 1.3229148387908936, + "rewards/rejected": 0.5167533755302429, + "step": 161 + }, + { + "epoch": 0.03, + "learning_rate": 4.378378378378379e-06, + "logits/chosen": -0.3755668103694916, + "logits/rejected": -0.3900182247161865, + "logps/chosen": -80.28208923339844, + "logps/rejected": -35.141021728515625, + "loss": 0.8972, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.020497893914580345, + "rewards/margins": -0.20459061861038208, + "rewards/rejected": 0.22508850693702698, + "step": 162 + }, + { + "epoch": 0.03, + "learning_rate": 4.4054054054054054e-06, + "logits/chosen": -0.35150954127311707, + "logits/rejected": -0.2975267171859741, + "logps/chosen": -105.2725830078125, + "logps/rejected": -83.23442840576172, + "loss": 0.9367, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.171916201710701, + "rewards/margins": -0.6783302426338196, + "rewards/rejected": 0.8502464294433594, + "step": 163 + }, + { + "epoch": 0.03, + "learning_rate": 4.432432432432433e-06, + "logits/chosen": -0.26578161120414734, + "logits/rejected": -0.2557249665260315, + "logps/chosen": -20.608081817626953, + "logps/rejected": -5.084046363830566, + "loss": 0.5839, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.045597076416015625, + "rewards/margins": -0.028025247156620026, + "rewards/rejected": 0.07362232357263565, + "step": 164 + }, + { + "epoch": 0.03, + "learning_rate": 4.45945945945946e-06, + "logits/chosen": -0.3853006958961487, + "logits/rejected": -0.38487449288368225, + "logps/chosen": -88.90899658203125, + "logps/rejected": -82.85183715820312, + "loss": 0.8251, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.064117431640625, + "rewards/margins": -0.34173280000686646, + "rewards/rejected": 0.40585023164749146, + "step": 165 + }, + { + "epoch": 0.03, + "learning_rate": 4.486486486486487e-06, + "logits/chosen": -0.49084994196891785, + "logits/rejected": -0.9786820411682129, + "logps/chosen": -231.0400390625, + "logps/rejected": -38.45098876953125, + "loss": 0.3114, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7028488516807556, + "rewards/margins": 0.6620422601699829, + "rewards/rejected": 0.04080658033490181, + "step": 166 + }, + { + "epoch": 0.03, + "learning_rate": 4.513513513513514e-06, + "logits/chosen": -0.24522343277931213, + "logits/rejected": -0.24353036284446716, + "logps/chosen": -2.8589372634887695, + "logps/rejected": -4.371041774749756, + "loss": 0.6488, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0571148656308651, + "rewards/margins": -0.01078636571764946, + "rewards/rejected": 0.06790123134851456, + "step": 167 + }, + { + "epoch": 0.03, + "learning_rate": 4.540540540540541e-06, + "logits/chosen": -0.10647883266210556, + "logits/rejected": -0.13186874985694885, + "logps/chosen": -86.13424682617188, + "logps/rejected": -92.98977661132812, + "loss": 0.3342, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7761184573173523, + "rewards/margins": 0.3728942573070526, + "rewards/rejected": 0.4032242000102997, + "step": 168 + }, + { + "epoch": 0.03, + "learning_rate": 4.567567567567568e-06, + "logits/chosen": -0.35560309886932373, + "logits/rejected": -0.29308122396469116, + "logps/chosen": -49.29555892944336, + "logps/rejected": -93.42361450195312, + "loss": 0.4974, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6565849184989929, + "rewards/margins": 0.2613128423690796, + "rewards/rejected": 0.39527207612991333, + "step": 169 + }, + { + "epoch": 0.03, + "learning_rate": 4.594594594594596e-06, + "logits/chosen": -0.40532663464546204, + "logits/rejected": -0.4148384630680084, + "logps/chosen": -91.16151428222656, + "logps/rejected": -67.8692398071289, + "loss": 1.0784, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4865402281284332, + "rewards/margins": -0.3812316954135895, + "rewards/rejected": 0.8677719235420227, + "step": 170 + }, + { + "epoch": 0.03, + "learning_rate": 4.621621621621622e-06, + "logits/chosen": -0.2474246472120285, + "logits/rejected": -0.23218487203121185, + "logps/chosen": -79.73190307617188, + "logps/rejected": -91.44429016113281, + "loss": 0.5947, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.971784234046936, + "rewards/margins": 0.4424629211425781, + "rewards/rejected": 0.5293213129043579, + "step": 171 + }, + { + "epoch": 0.03, + "learning_rate": 4.6486486486486495e-06, + "logits/chosen": -0.2832660973072052, + "logits/rejected": -0.24877415597438812, + "logps/chosen": -177.03244018554688, + "logps/rejected": -104.60763549804688, + "loss": 0.6848, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7508880496025085, + "rewards/margins": -0.00638580322265625, + "rewards/rejected": 0.7572738528251648, + "step": 172 + }, + { + "epoch": 0.03, + "learning_rate": 4.675675675675676e-06, + "logits/chosen": -0.3628458082675934, + "logits/rejected": -0.3709790110588074, + "logps/chosen": -76.40803527832031, + "logps/rejected": -106.74444580078125, + "loss": 0.5874, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24645309150218964, + "rewards/margins": 0.037680044770240784, + "rewards/rejected": 0.20877304673194885, + "step": 173 + }, + { + "epoch": 0.03, + "learning_rate": 4.702702702702703e-06, + "logits/chosen": -0.432625412940979, + "logits/rejected": -0.44562795758247375, + "logps/chosen": -74.16155242919922, + "logps/rejected": -76.84075927734375, + "loss": 0.8864, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.385528564453125, + "rewards/margins": -0.2843155264854431, + "rewards/rejected": 0.6698440909385681, + "step": 174 + }, + { + "epoch": 0.03, + "learning_rate": 4.72972972972973e-06, + "logits/chosen": -0.5271410346031189, + "logits/rejected": -0.5432368516921997, + "logps/chosen": -262.1299133300781, + "logps/rejected": -234.118408203125, + "loss": 0.772, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.753686547279358, + "rewards/margins": 0.03702700138092041, + "rewards/rejected": 1.7166595458984375, + "step": 175 + }, + { + "epoch": 0.03, + "learning_rate": 4.756756756756757e-06, + "logits/chosen": -0.48524972796440125, + "logits/rejected": -0.5194293856620789, + "logps/chosen": -96.34738159179688, + "logps/rejected": -73.47918701171875, + "loss": 0.7565, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4535629451274872, + "rewards/margins": -0.2921607792377472, + "rewards/rejected": 0.7457237243652344, + "step": 176 + }, + { + "epoch": 0.03, + "learning_rate": 4.783783783783784e-06, + "logits/chosen": -0.6095321178436279, + "logits/rejected": -0.6252351999282837, + "logps/chosen": -93.01603698730469, + "logps/rejected": -67.78550720214844, + "loss": 0.7469, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.89950031042099, + "rewards/margins": 0.05935746431350708, + "rewards/rejected": 0.8401428461074829, + "step": 177 + }, + { + "epoch": 0.03, + "learning_rate": 4.810810810810811e-06, + "logits/chosen": -0.4824122488498688, + "logits/rejected": -0.4417378604412079, + "logps/chosen": -74.24661254882812, + "logps/rejected": -83.44276428222656, + "loss": 0.6046, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7329498529434204, + "rewards/margins": -0.16741329431533813, + "rewards/rejected": 0.9003631472587585, + "step": 178 + }, + { + "epoch": 0.03, + "learning_rate": 4.837837837837838e-06, + "logits/chosen": -0.44060978293418884, + "logits/rejected": -0.44060978293418884, + "logps/chosen": -187.6918487548828, + "logps/rejected": -187.6918487548828, + "loss": 0.7116, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5632370114326477, + "rewards/margins": 0.0, + "rewards/rejected": 0.5632370114326477, + "step": 179 + }, + { + "epoch": 0.03, + "learning_rate": 4.864864864864866e-06, + "logits/chosen": -0.3316657841205597, + "logits/rejected": -0.3297908306121826, + "logps/chosen": -101.62100982666016, + "logps/rejected": -52.452850341796875, + "loss": 1.0196, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4539321959018707, + "rewards/margins": -0.5871139764785767, + "rewards/rejected": 1.041046142578125, + "step": 180 + }, + { + "epoch": 0.03, + "learning_rate": 4.891891891891893e-06, + "logits/chosen": -0.5974945425987244, + "logits/rejected": -0.5723721981048584, + "logps/chosen": -67.2164077758789, + "logps/rejected": -61.5245361328125, + "loss": 0.5838, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7860832214355469, + "rewards/margins": 0.17645412683486938, + "rewards/rejected": 0.6096290946006775, + "step": 181 + }, + { + "epoch": 0.03, + "learning_rate": 4.91891891891892e-06, + "logits/chosen": -0.22171048820018768, + "logits/rejected": -0.24672889709472656, + "logps/chosen": -6.729167938232422, + "logps/rejected": -95.65017700195312, + "loss": 0.7867, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22950629889965057, + "rewards/margins": 0.3941478729248047, + "rewards/rejected": -0.1646415740251541, + "step": 182 + }, + { + "epoch": 0.03, + "learning_rate": 4.9459459459459466e-06, + "logits/chosen": -0.4402918219566345, + "logits/rejected": -0.4296458959579468, + "logps/chosen": -89.40145874023438, + "logps/rejected": -58.27642822265625, + "loss": 0.8592, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3524620234966278, + "rewards/margins": 0.07607805728912354, + "rewards/rejected": 0.2763839662075043, + "step": 183 + }, + { + "epoch": 0.03, + "learning_rate": 4.9729729729729735e-06, + "logits/chosen": -0.39938247203826904, + "logits/rejected": -0.38870805501937866, + "logps/chosen": -131.6815185546875, + "logps/rejected": -94.52764129638672, + "loss": 0.3511, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5745849609375, + "rewards/margins": 1.2499504089355469, + "rewards/rejected": 0.3246345520019531, + "step": 184 + }, + { + "epoch": 0.03, + "learning_rate": 5e-06, + "logits/chosen": -0.5677275061607361, + "logits/rejected": -0.5532841086387634, + "logps/chosen": -98.20549011230469, + "logps/rejected": -72.39690399169922, + "loss": 0.5599, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4574951231479645, + "rewards/margins": 0.2599838376045227, + "rewards/rejected": 0.19751130044460297, + "step": 185 + }, + { + "epoch": 0.03, + "learning_rate": 5.027027027027027e-06, + "logits/chosen": -0.16714484989643097, + "logits/rejected": -0.1381940394639969, + "logps/chosen": -128.68344116210938, + "logps/rejected": -153.51187133789062, + "loss": 0.9654, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1109161376953125, + "rewards/margins": -1.140966773033142, + "rewards/rejected": 1.2518829107284546, + "step": 186 + }, + { + "epoch": 0.03, + "learning_rate": 5.054054054054054e-06, + "logits/chosen": -0.4187050461769104, + "logits/rejected": -0.3865879476070404, + "logps/chosen": -55.80255126953125, + "logps/rejected": -58.285926818847656, + "loss": 0.368, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.807019829750061, + "rewards/margins": 0.32818758487701416, + "rewards/rejected": 0.4788322448730469, + "step": 187 + }, + { + "epoch": 0.03, + "learning_rate": 5.081081081081082e-06, + "logits/chosen": -0.2277982085943222, + "logits/rejected": -0.03156036138534546, + "logps/chosen": -244.27928161621094, + "logps/rejected": -31.901409149169922, + "loss": 0.2121, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7035964727401733, + "rewards/margins": 1.382720947265625, + "rewards/rejected": 0.3208755552768707, + "step": 188 + }, + { + "epoch": 0.03, + "learning_rate": 5.108108108108108e-06, + "logits/chosen": -0.530383288860321, + "logits/rejected": -0.530383288860321, + "logps/chosen": -45.406715393066406, + "logps/rejected": -45.406715393066406, + "loss": 0.7322, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6578788757324219, + "rewards/margins": 0.0, + "rewards/rejected": 0.6578788757324219, + "step": 189 + }, + { + "epoch": 0.03, + "learning_rate": 5.135135135135135e-06, + "logits/chosen": -0.46107083559036255, + "logits/rejected": -0.46107083559036255, + "logps/chosen": -26.588611602783203, + "logps/rejected": -26.588611602783203, + "loss": 0.5642, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.322824090719223, + "rewards/margins": 0.0, + "rewards/rejected": 0.322824090719223, + "step": 190 + }, + { + "epoch": 0.03, + "learning_rate": 5.162162162162162e-06, + "logits/chosen": 0.12930189073085785, + "logits/rejected": 0.12711942195892334, + "logps/chosen": -13.208555221557617, + "logps/rejected": -2.5189478397369385, + "loss": 1.0119, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.011366081424057484, + "rewards/margins": -0.145709827542305, + "rewards/rejected": 0.1570759117603302, + "step": 191 + }, + { + "epoch": 0.03, + "learning_rate": 5.18918918918919e-06, + "logits/chosen": -0.5397018790245056, + "logits/rejected": -0.5148532390594482, + "logps/chosen": -137.2183074951172, + "logps/rejected": -69.62406921386719, + "loss": 0.3454, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3065185546875, + "rewards/margins": 0.6277992129325867, + "rewards/rejected": 0.6787193417549133, + "step": 192 + }, + { + "epoch": 0.03, + "learning_rate": 5.216216216216217e-06, + "logits/chosen": -0.11033254116773605, + "logits/rejected": -0.007185749709606171, + "logps/chosen": -124.92326354980469, + "logps/rejected": -62.838584899902344, + "loss": 0.4235, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2341583967208862, + "rewards/margins": 0.29033362865448, + "rewards/rejected": 0.9438247680664062, + "step": 193 + }, + { + "epoch": 0.03, + "learning_rate": 5.243243243243244e-06, + "logits/chosen": -0.43250250816345215, + "logits/rejected": -0.39114484190940857, + "logps/chosen": -85.87202453613281, + "logps/rejected": -87.0369644165039, + "loss": 0.5983, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.232074022293091, + "rewards/margins": 0.4239463806152344, + "rewards/rejected": 1.8081276416778564, + "step": 194 + }, + { + "epoch": 0.03, + "learning_rate": 5.2702702702702705e-06, + "logits/chosen": -0.5019484758377075, + "logits/rejected": -0.4686777591705322, + "logps/chosen": -219.60992431640625, + "logps/rejected": -278.68292236328125, + "loss": 1.0788, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4785979986190796, + "rewards/margins": -0.982559323310852, + "rewards/rejected": 2.4611573219299316, + "step": 195 + }, + { + "epoch": 0.03, + "learning_rate": 5.297297297297298e-06, + "logits/chosen": -0.22686974704265594, + "logits/rejected": -0.23679137229919434, + "logps/chosen": -80.14512634277344, + "logps/rejected": -45.597843170166016, + "loss": 0.5821, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2777359187602997, + "rewards/margins": 0.2909504175186157, + "rewards/rejected": -0.013214493170380592, + "step": 196 + }, + { + "epoch": 0.03, + "learning_rate": 5.324324324324324e-06, + "logits/chosen": -0.327831506729126, + "logits/rejected": -0.2663736343383789, + "logps/chosen": -90.38093566894531, + "logps/rejected": -34.111915588378906, + "loss": 0.8312, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6975822448730469, + "rewards/margins": 0.42062148451805115, + "rewards/rejected": 0.2769607603549957, + "step": 197 + }, + { + "epoch": 0.03, + "learning_rate": 5.351351351351351e-06, + "logits/chosen": -0.10995859652757645, + "logits/rejected": -0.10995859652757645, + "logps/chosen": -89.24122619628906, + "logps/rejected": -89.24122619628906, + "loss": 0.6777, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5260025262832642, + "rewards/margins": 0.0, + "rewards/rejected": 0.5260025262832642, + "step": 198 + }, + { + "epoch": 0.03, + "learning_rate": 5.378378378378378e-06, + "logits/chosen": -0.6199027299880981, + "logits/rejected": -0.5808255672454834, + "logps/chosen": -195.969482421875, + "logps/rejected": -259.1502685546875, + "loss": 0.5219, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0311646461486816, + "rewards/margins": -0.21514582633972168, + "rewards/rejected": 2.2463104724884033, + "step": 199 + }, + { + "epoch": 0.03, + "learning_rate": 5.405405405405406e-06, + "logits/chosen": -0.6558588147163391, + "logits/rejected": -0.4452419877052307, + "logps/chosen": -93.61932373046875, + "logps/rejected": -63.95402908325195, + "loss": 0.9685, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.525347888469696, + "rewards/margins": 0.12613102793693542, + "rewards/rejected": 0.3992168605327606, + "step": 200 + }, + { + "epoch": 0.03, + "learning_rate": 5.432432432432433e-06, + "logits/chosen": -0.5460140705108643, + "logits/rejected": -0.45593687891960144, + "logps/chosen": -95.54086303710938, + "logps/rejected": -123.65989685058594, + "loss": 0.7153, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0840721130371094, + "rewards/margins": -0.19529342651367188, + "rewards/rejected": 1.2793655395507812, + "step": 201 + }, + { + "epoch": 0.03, + "learning_rate": 5.45945945945946e-06, + "logits/chosen": -0.44681593775749207, + "logits/rejected": -0.37062421441078186, + "logps/chosen": -65.87855529785156, + "logps/rejected": -92.7637939453125, + "loss": 1.1922, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.52779620885849, + "rewards/margins": -1.1659576892852783, + "rewards/rejected": 1.6937538385391235, + "step": 202 + }, + { + "epoch": 0.03, + "learning_rate": 5.486486486486487e-06, + "logits/chosen": -0.4099199175834656, + "logits/rejected": -0.3897953927516937, + "logps/chosen": -128.39859008789062, + "logps/rejected": -86.62623596191406, + "loss": 1.2435, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.05333099514245987, + "rewards/margins": -1.0132194757461548, + "rewards/rejected": 0.9598884582519531, + "step": 203 + }, + { + "epoch": 0.03, + "learning_rate": 5.513513513513515e-06, + "logits/chosen": 0.004122351296246052, + "logits/rejected": 0.00878084171563387, + "logps/chosen": -52.20966339111328, + "logps/rejected": -18.511863708496094, + "loss": 0.5984, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09960594028234482, + "rewards/margins": 0.011361122131347656, + "rewards/rejected": 0.08824481815099716, + "step": 204 + }, + { + "epoch": 0.03, + "learning_rate": 5.540540540540541e-06, + "logits/chosen": -0.029416177421808243, + "logits/rejected": -0.029416177421808243, + "logps/chosen": -85.04864501953125, + "logps/rejected": -85.04864501953125, + "loss": 0.8433, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.09983062744140625, + "rewards/margins": 0.0, + "rewards/rejected": 0.09983062744140625, + "step": 205 + }, + { + "epoch": 0.03, + "learning_rate": 5.567567567567568e-06, + "logits/chosen": -0.3968685567378998, + "logits/rejected": -0.2838006317615509, + "logps/chosen": -140.42282104492188, + "logps/rejected": -23.51209831237793, + "loss": 0.5254, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.20451202988624573, + "rewards/margins": -0.3897928297519684, + "rewards/rejected": 0.18528079986572266, + "step": 206 + }, + { + "epoch": 0.03, + "learning_rate": 5.5945945945945945e-06, + "logits/chosen": -0.45610326528549194, + "logits/rejected": -0.23061521351337433, + "logps/chosen": -176.4048309326172, + "logps/rejected": -16.905866622924805, + "loss": 1.0725, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.216375708580017, + "rewards/margins": 1.1198762655258179, + "rewards/rejected": 0.09649944305419922, + "step": 207 + }, + { + "epoch": 0.03, + "learning_rate": 5.621621621621622e-06, + "logits/chosen": -0.7665066123008728, + "logits/rejected": -0.3889113962650299, + "logps/chosen": -90.9207763671875, + "logps/rejected": -133.72616577148438, + "loss": 0.7164, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5695175528526306, + "rewards/margins": -0.584460437297821, + "rewards/rejected": 1.1539779901504517, + "step": 208 + }, + { + "epoch": 0.03, + "learning_rate": 5.648648648648649e-06, + "logits/chosen": -0.46441346406936646, + "logits/rejected": -0.4896722435951233, + "logps/chosen": -33.076622009277344, + "logps/rejected": -13.204696655273438, + "loss": 0.7443, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.000875473022461, + "rewards/margins": 0.5299698114395142, + "rewards/rejected": 0.47090569138526917, + "step": 209 + }, + { + "epoch": 0.03, + "learning_rate": 5.675675675675676e-06, + "logits/chosen": -0.73723965883255, + "logits/rejected": -0.7580863833427429, + "logps/chosen": -88.95182800292969, + "logps/rejected": -84.82159423828125, + "loss": 0.7687, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8045341372489929, + "rewards/margins": -0.6523033976554871, + "rewards/rejected": 1.45683753490448, + "step": 210 + }, + { + "epoch": 0.03, + "learning_rate": 5.702702702702702e-06, + "logits/chosen": -0.3644505441188812, + "logits/rejected": -0.35873159766197205, + "logps/chosen": -29.09490966796875, + "logps/rejected": -49.29778289794922, + "loss": 0.439, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9700366854667664, + "rewards/margins": -0.0033206939697265625, + "rewards/rejected": 0.9733573794364929, + "step": 211 + }, + { + "epoch": 0.03, + "learning_rate": 5.729729729729731e-06, + "logits/chosen": -0.2239336222410202, + "logits/rejected": -0.09465984255075455, + "logps/chosen": -48.84666442871094, + "logps/rejected": -21.819196701049805, + "loss": 0.4216, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2244106531143188, + "rewards/margins": 1.0941575765609741, + "rewards/rejected": 0.13025303184986115, + "step": 212 + }, + { + "epoch": 0.03, + "learning_rate": 5.756756756756757e-06, + "logits/chosen": -0.43609267473220825, + "logits/rejected": -0.43609267473220825, + "logps/chosen": -42.07761001586914, + "logps/rejected": -42.07761001586914, + "loss": 0.6503, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.10236358642578125, + "rewards/margins": 0.0, + "rewards/rejected": 0.10236358642578125, + "step": 213 + }, + { + "epoch": 0.03, + "learning_rate": 5.783783783783784e-06, + "logits/chosen": -0.7076036930084229, + "logits/rejected": -0.639098584651947, + "logps/chosen": -61.81955337524414, + "logps/rejected": -163.08001708984375, + "loss": 0.6691, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5045658349990845, + "rewards/margins": -0.04410207271575928, + "rewards/rejected": 0.5486679077148438, + "step": 214 + }, + { + "epoch": 0.03, + "learning_rate": 5.810810810810811e-06, + "logits/chosen": -0.4888700246810913, + "logits/rejected": -0.4888700246810913, + "logps/chosen": -9.202495574951172, + "logps/rejected": -9.202495574951172, + "loss": 0.5058, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3832721710205078, + "rewards/margins": 0.0, + "rewards/rejected": 0.3832721710205078, + "step": 215 + }, + { + "epoch": 0.04, + "learning_rate": 5.837837837837839e-06, + "logits/chosen": -0.6351229548454285, + "logits/rejected": -0.6213122606277466, + "logps/chosen": -110.57669830322266, + "logps/rejected": -69.03643798828125, + "loss": 1.1673, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2312217950820923, + "rewards/margins": -0.3048560619354248, + "rewards/rejected": 1.536077857017517, + "step": 216 + }, + { + "epoch": 0.04, + "learning_rate": 5.8648648648648655e-06, + "logits/chosen": -0.3118152916431427, + "logits/rejected": -0.2871686518192291, + "logps/chosen": -113.5634994506836, + "logps/rejected": -132.2198944091797, + "loss": 1.0209, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.158516764640808, + "rewards/margins": -0.7293448448181152, + "rewards/rejected": 1.8878616094589233, + "step": 217 + }, + { + "epoch": 0.04, + "learning_rate": 5.8918918918918924e-06, + "logits/chosen": -0.6266109347343445, + "logits/rejected": -0.5990840792655945, + "logps/chosen": -47.042240142822266, + "logps/rejected": -70.89442443847656, + "loss": 0.7284, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6756710410118103, + "rewards/margins": -0.22505831718444824, + "rewards/rejected": 0.9007293581962585, + "step": 218 + }, + { + "epoch": 0.04, + "learning_rate": 5.9189189189189185e-06, + "logits/chosen": -0.44157737493515015, + "logits/rejected": -0.32074663043022156, + "logps/chosen": -109.67311096191406, + "logps/rejected": -78.32131958007812, + "loss": 0.725, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.20706939697265625, + "rewards/margins": -0.3215179443359375, + "rewards/rejected": 0.5285873413085938, + "step": 219 + }, + { + "epoch": 0.04, + "learning_rate": 5.945945945945947e-06, + "logits/chosen": -0.6278207302093506, + "logits/rejected": -0.6112365126609802, + "logps/chosen": -124.8326644897461, + "logps/rejected": -115.2213363647461, + "loss": 0.9405, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7603355646133423, + "rewards/margins": -1.0253875255584717, + "rewards/rejected": 1.785723090171814, + "step": 220 + }, + { + "epoch": 0.04, + "learning_rate": 5.972972972972973e-06, + "logits/chosen": -0.26352062821388245, + "logits/rejected": -0.22804851830005646, + "logps/chosen": -58.074859619140625, + "logps/rejected": -19.203006744384766, + "loss": 0.941, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.88433837890625, + "rewards/margins": 0.7970386743545532, + "rewards/rejected": 0.08729972690343857, + "step": 221 + }, + { + "epoch": 0.04, + "learning_rate": 6e-06, + "logits/chosen": -0.8018525838851929, + "logits/rejected": -0.7828300595283508, + "logps/chosen": -52.74665451049805, + "logps/rejected": -21.87416648864746, + "loss": 0.3057, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.142921805381775, + "rewards/margins": 0.914406955242157, + "rewards/rejected": 0.2285148650407791, + "step": 222 + }, + { + "epoch": 0.04, + "learning_rate": 6.027027027027027e-06, + "logits/chosen": -0.6558531522750854, + "logits/rejected": -0.6633104085922241, + "logps/chosen": -56.594696044921875, + "logps/rejected": -76.10774230957031, + "loss": 0.7613, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0991486310958862, + "rewards/margins": -0.25614845752716064, + "rewards/rejected": 1.3552970886230469, + "step": 223 + }, + { + "epoch": 0.04, + "learning_rate": 6.054054054054055e-06, + "logits/chosen": -0.5184029936790466, + "logits/rejected": -0.4257000684738159, + "logps/chosen": -186.2162322998047, + "logps/rejected": -223.58657836914062, + "loss": 1.1575, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6324630975723267, + "rewards/margins": -0.7439299821853638, + "rewards/rejected": 2.3763930797576904, + "step": 224 + }, + { + "epoch": 0.04, + "learning_rate": 6.081081081081082e-06, + "logits/chosen": -0.5012893080711365, + "logits/rejected": -0.4682873487472534, + "logps/chosen": -165.57894897460938, + "logps/rejected": -38.147621154785156, + "loss": 0.1431, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0879533290863037, + "rewards/margins": 1.8528367280960083, + "rewards/rejected": 0.23511658608913422, + "step": 225 + }, + { + "epoch": 0.04, + "learning_rate": 6.108108108108109e-06, + "logits/chosen": -0.1703360378742218, + "logits/rejected": -0.05899113789200783, + "logps/chosen": -153.6275634765625, + "logps/rejected": -74.43595123291016, + "loss": 1.2833, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.06465759128332138, + "rewards/margins": -1.9666626453399658, + "rewards/rejected": 1.9020050764083862, + "step": 226 + }, + { + "epoch": 0.04, + "learning_rate": 6.135135135135135e-06, + "logits/chosen": -0.09606818854808807, + "logits/rejected": -0.10380269587039948, + "logps/chosen": -107.85609436035156, + "logps/rejected": -111.49337005615234, + "loss": 0.8885, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.577801525592804, + "rewards/margins": -0.3192809820175171, + "rewards/rejected": 0.897082507610321, + "step": 227 + }, + { + "epoch": 0.04, + "learning_rate": 6.162162162162163e-06, + "logits/chosen": -0.6679939031600952, + "logits/rejected": -0.5980499982833862, + "logps/chosen": -85.62117004394531, + "logps/rejected": -142.50534057617188, + "loss": 0.7909, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0825774669647217, + "rewards/margins": -0.5484864711761475, + "rewards/rejected": 2.631063938140869, + "step": 228 + }, + { + "epoch": 0.04, + "learning_rate": 6.1891891891891895e-06, + "logits/chosen": -0.4002380967140198, + "logits/rejected": -0.41836047172546387, + "logps/chosen": -49.155277252197266, + "logps/rejected": -81.14459228515625, + "loss": 0.8066, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8835930228233337, + "rewards/margins": -0.8573173880577087, + "rewards/rejected": 1.7409104108810425, + "step": 229 + }, + { + "epoch": 0.04, + "learning_rate": 6.2162162162162164e-06, + "logits/chosen": -0.5616676807403564, + "logits/rejected": -0.48742276430130005, + "logps/chosen": -64.29045867919922, + "logps/rejected": -102.63634490966797, + "loss": 0.998, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8119087219238281, + "rewards/margins": -0.9369766712188721, + "rewards/rejected": 1.7488853931427002, + "step": 230 + }, + { + "epoch": 0.04, + "learning_rate": 6.243243243243243e-06, + "logits/chosen": -0.37961775064468384, + "logits/rejected": -0.39208653569221497, + "logps/chosen": -42.14962387084961, + "logps/rejected": -5.354367733001709, + "loss": 0.5919, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.09659957885742188, + "rewards/margins": -0.3401297628879547, + "rewards/rejected": 0.24353018403053284, + "step": 231 + }, + { + "epoch": 0.04, + "learning_rate": 6.270270270270271e-06, + "logits/chosen": -0.5783861875534058, + "logits/rejected": -0.5769264101982117, + "logps/chosen": -174.91390991210938, + "logps/rejected": -153.86123657226562, + "loss": 0.717, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2106918096542358, + "rewards/margins": -0.22735750675201416, + "rewards/rejected": 1.43804931640625, + "step": 232 + }, + { + "epoch": 0.04, + "learning_rate": 6.297297297297298e-06, + "logits/chosen": -0.5690034627914429, + "logits/rejected": -0.5792561769485474, + "logps/chosen": -51.35363006591797, + "logps/rejected": -109.36624145507812, + "loss": 0.5611, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25248488783836365, + "rewards/margins": 0.1568610966205597, + "rewards/rejected": 0.09562378376722336, + "step": 233 + }, + { + "epoch": 0.04, + "learning_rate": 6.324324324324325e-06, + "logits/chosen": -0.41137516498565674, + "logits/rejected": -0.27592867612838745, + "logps/chosen": -70.94331359863281, + "logps/rejected": -20.01375961303711, + "loss": 0.5011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2283538579940796, + "rewards/margins": 1.0154752731323242, + "rewards/rejected": 0.21287861466407776, + "step": 234 + }, + { + "epoch": 0.04, + "learning_rate": 6.351351351351351e-06, + "logits/chosen": -0.648924708366394, + "logits/rejected": -0.648924708366394, + "logps/chosen": -90.79373168945312, + "logps/rejected": -90.79373168945312, + "loss": 0.5709, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9667602777481079, + "rewards/margins": 0.0, + "rewards/rejected": 0.9667602777481079, + "step": 235 + }, + { + "epoch": 0.04, + "learning_rate": 6.378378378378379e-06, + "logits/chosen": -0.5514968633651733, + "logits/rejected": -0.5186258554458618, + "logps/chosen": -57.84584045410156, + "logps/rejected": -148.16326904296875, + "loss": 0.2878, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.171075463294983, + "rewards/margins": 0.780975341796875, + "rewards/rejected": 0.3901000916957855, + "step": 236 + }, + { + "epoch": 0.04, + "learning_rate": 6.405405405405406e-06, + "logits/chosen": -0.3020959496498108, + "logits/rejected": -0.22074739634990692, + "logps/chosen": -87.28330993652344, + "logps/rejected": -113.90668487548828, + "loss": 0.7924, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.178233340382576, + "rewards/margins": -0.0496978759765625, + "rewards/rejected": 0.2279312163591385, + "step": 237 + }, + { + "epoch": 0.04, + "learning_rate": 6.432432432432433e-06, + "logits/chosen": -0.3646334707736969, + "logits/rejected": -0.3524111211299896, + "logps/chosen": -66.67977142333984, + "logps/rejected": -95.65327453613281, + "loss": 1.0722, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8063408136367798, + "rewards/margins": -0.7171608209609985, + "rewards/rejected": 1.5235016345977783, + "step": 238 + }, + { + "epoch": 0.04, + "learning_rate": 6.45945945945946e-06, + "logits/chosen": -0.42921173572540283, + "logits/rejected": -0.3898155689239502, + "logps/chosen": -164.2279052734375, + "logps/rejected": -73.22964477539062, + "loss": 0.2335, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4433655738830566, + "rewards/margins": 1.0725624561309814, + "rewards/rejected": 1.3708031177520752, + "step": 239 + }, + { + "epoch": 0.04, + "learning_rate": 6.486486486486487e-06, + "logits/chosen": -0.061583537608385086, + "logits/rejected": -0.02204938232898712, + "logps/chosen": -65.92337799072266, + "logps/rejected": -66.271240234375, + "loss": 0.4389, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4421707391738892, + "rewards/margins": 0.23659825325012207, + "rewards/rejected": 1.205572485923767, + "step": 240 + }, + { + "epoch": 0.04, + "learning_rate": 6.513513513513514e-06, + "logits/chosen": -0.4213923513889313, + "logits/rejected": -0.37677037715911865, + "logps/chosen": -46.17170715332031, + "logps/rejected": -166.16127014160156, + "loss": 1.501, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8764320611953735, + "rewards/margins": -1.9355477094650269, + "rewards/rejected": 2.8119797706604004, + "step": 241 + }, + { + "epoch": 0.04, + "learning_rate": 6.540540540540541e-06, + "logits/chosen": -0.18132369220256805, + "logits/rejected": -0.18132369220256805, + "logps/chosen": -43.82415008544922, + "logps/rejected": -43.82415008544922, + "loss": 0.9941, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.13422317802906036, + "rewards/margins": 0.0, + "rewards/rejected": 0.13422317802906036, + "step": 242 + }, + { + "epoch": 0.04, + "learning_rate": 6.567567567567567e-06, + "logits/chosen": -0.0281538013368845, + "logits/rejected": -0.0176337119191885, + "logps/chosen": -2.1231753826141357, + "logps/rejected": -9.883855819702148, + "loss": 0.6701, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2677726149559021, + "rewards/margins": 0.05458958446979523, + "rewards/rejected": 0.21318303048610687, + "step": 243 + }, + { + "epoch": 0.04, + "learning_rate": 6.594594594594595e-06, + "logits/chosen": -0.46993309259414673, + "logits/rejected": -0.5287684202194214, + "logps/chosen": -122.69329833984375, + "logps/rejected": -119.63229370117188, + "loss": 0.6452, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4435333013534546, + "rewards/margins": -0.16474461555480957, + "rewards/rejected": 1.6082779169082642, + "step": 244 + }, + { + "epoch": 0.04, + "learning_rate": 6.621621621621622e-06, + "logits/chosen": -0.5822150707244873, + "logits/rejected": -0.4961172342300415, + "logps/chosen": -115.43033599853516, + "logps/rejected": -29.923030853271484, + "loss": 1.0287, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09178924560546875, + "rewards/margins": 0.04367256164550781, + "rewards/rejected": 0.04811668395996094, + "step": 245 + }, + { + "epoch": 0.04, + "learning_rate": 6.648648648648649e-06, + "logits/chosen": -0.306241899728775, + "logits/rejected": -0.293308824300766, + "logps/chosen": -3.5920727252960205, + "logps/rejected": -21.456296920776367, + "loss": 0.7155, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29816582798957825, + "rewards/margins": 0.2953297793865204, + "rewards/rejected": 0.002836036728695035, + "step": 246 + }, + { + "epoch": 0.04, + "learning_rate": 6.675675675675676e-06, + "logits/chosen": -0.5634353160858154, + "logits/rejected": -0.42872199416160583, + "logps/chosen": -164.69512939453125, + "logps/rejected": -84.9027099609375, + "loss": 0.314, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.140023946762085, + "rewards/margins": 0.2211083173751831, + "rewards/rejected": 1.9189156293869019, + "step": 247 + }, + { + "epoch": 0.04, + "learning_rate": 6.702702702702704e-06, + "logits/chosen": -0.6783609986305237, + "logits/rejected": -0.6282975077629089, + "logps/chosen": -270.560302734375, + "logps/rejected": -170.6841583251953, + "loss": 0.4998, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03936767578125, + "rewards/margins": -0.2838699221611023, + "rewards/rejected": 0.2445022612810135, + "step": 248 + }, + { + "epoch": 0.04, + "learning_rate": 6.729729729729731e-06, + "logits/chosen": -0.15011727809906006, + "logits/rejected": -0.1617453545331955, + "logps/chosen": -22.965059280395508, + "logps/rejected": -26.81301498413086, + "loss": 0.6334, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.18887920677661896, + "rewards/margins": -0.08449401706457138, + "rewards/rejected": -0.10438518971204758, + "step": 249 + }, + { + "epoch": 0.04, + "learning_rate": 6.7567567567567575e-06, + "logits/chosen": -0.3496231138706207, + "logits/rejected": -0.3235044479370117, + "logps/chosen": -61.18611145019531, + "logps/rejected": -43.80776596069336, + "loss": 0.7697, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3430015742778778, + "rewards/margins": -0.2857349216938019, + "rewards/rejected": 0.6287364959716797, + "step": 250 + }, + { + "epoch": 0.04, + "learning_rate": 6.783783783783784e-06, + "logits/chosen": -0.3616917133331299, + "logits/rejected": -0.32686424255371094, + "logps/chosen": -148.93878173828125, + "logps/rejected": -78.43911743164062, + "loss": 0.3442, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.211491346359253, + "rewards/margins": 0.34481799602508545, + "rewards/rejected": 1.8666733503341675, + "step": 251 + }, + { + "epoch": 0.04, + "learning_rate": 6.810810810810811e-06, + "logits/chosen": -0.5381073951721191, + "logits/rejected": -0.5263601541519165, + "logps/chosen": -96.33840942382812, + "logps/rejected": -100.66444396972656, + "loss": 0.7953, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4710053205490112, + "rewards/margins": -0.3739844560623169, + "rewards/rejected": 1.8449897766113281, + "step": 252 + }, + { + "epoch": 0.04, + "learning_rate": 6.837837837837838e-06, + "logits/chosen": -0.3888112008571625, + "logits/rejected": -0.37421783804893494, + "logps/chosen": -121.62832641601562, + "logps/rejected": -56.21634292602539, + "loss": 1.1268, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8447647094726562, + "rewards/margins": 1.4741417169570923, + "rewards/rejected": 0.37062302231788635, + "step": 253 + }, + { + "epoch": 0.04, + "learning_rate": 6.864864864864865e-06, + "logits/chosen": -0.41287684440612793, + "logits/rejected": -0.36942052841186523, + "logps/chosen": -50.232757568359375, + "logps/rejected": -62.0122184753418, + "loss": 0.6261, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.440998911857605, + "rewards/margins": 0.07104611396789551, + "rewards/rejected": 1.3699527978897095, + "step": 254 + }, + { + "epoch": 0.04, + "learning_rate": 6.891891891891892e-06, + "logits/chosen": -0.5951519012451172, + "logits/rejected": -0.6335293054580688, + "logps/chosen": -178.17568969726562, + "logps/rejected": -120.1450424194336, + "loss": 1.0902, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.679931640625, + "rewards/margins": 0.37310099601745605, + "rewards/rejected": 1.306830644607544, + "step": 255 + }, + { + "epoch": 0.04, + "learning_rate": 6.91891891891892e-06, + "logits/chosen": -0.5097212195396423, + "logits/rejected": -0.4970630407333374, + "logps/chosen": -139.24790954589844, + "logps/rejected": -96.02497863769531, + "loss": 0.3032, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.014012098312378, + "rewards/margins": 0.3142164945602417, + "rewards/rejected": 1.6997956037521362, + "step": 256 + }, + { + "epoch": 0.04, + "learning_rate": 6.945945945945947e-06, + "logits/chosen": -0.41313740611076355, + "logits/rejected": -0.40211567282676697, + "logps/chosen": -46.31581115722656, + "logps/rejected": -34.00297164916992, + "loss": 0.4817, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2586860656738281, + "rewards/margins": 0.035671234130859375, + "rewards/rejected": 1.2230148315429688, + "step": 257 + }, + { + "epoch": 0.04, + "learning_rate": 6.972972972972973e-06, + "logits/chosen": -0.33021101355552673, + "logits/rejected": -0.2778749465942383, + "logps/chosen": -75.69873809814453, + "logps/rejected": -22.350170135498047, + "loss": 0.9469, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47225189208984375, + "rewards/margins": 0.22040003538131714, + "rewards/rejected": 0.2518518567085266, + "step": 258 + }, + { + "epoch": 0.04, + "learning_rate": 7e-06, + "logits/chosen": -0.5045824646949768, + "logits/rejected": -0.4962257742881775, + "logps/chosen": -133.764404296875, + "logps/rejected": -125.12527465820312, + "loss": 0.5814, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.147149682044983, + "rewards/margins": -0.3947204351425171, + "rewards/rejected": 1.5418701171875, + "step": 259 + }, + { + "epoch": 0.04, + "learning_rate": 7.027027027027028e-06, + "logits/chosen": -0.3030274212360382, + "logits/rejected": -0.27360308170318604, + "logps/chosen": -65.45365905761719, + "logps/rejected": -25.603897094726562, + "loss": 1.319, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7678176760673523, + "rewards/margins": -0.6121334433555603, + "rewards/rejected": 1.3799511194229126, + "step": 260 + }, + { + "epoch": 0.04, + "learning_rate": 7.054054054054055e-06, + "logits/chosen": -0.4462277889251709, + "logits/rejected": -0.358822762966156, + "logps/chosen": -167.16554260253906, + "logps/rejected": -69.79096984863281, + "loss": 0.2519, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.519709825515747, + "rewards/margins": 0.6160858273506165, + "rewards/rejected": 0.9036239981651306, + "step": 261 + }, + { + "epoch": 0.04, + "learning_rate": 7.0810810810810815e-06, + "logits/chosen": -0.5293858647346497, + "logits/rejected": -0.4347819685935974, + "logps/chosen": -148.29312133789062, + "logps/rejected": -152.40399169921875, + "loss": 1.867, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.41286012530326843, + "rewards/margins": -2.4757492542266846, + "rewards/rejected": 2.8886094093322754, + "step": 262 + }, + { + "epoch": 0.04, + "learning_rate": 7.1081081081081085e-06, + "logits/chosen": -0.4818858802318573, + "logits/rejected": -0.3934015929698944, + "logps/chosen": -140.39459228515625, + "logps/rejected": -66.250732421875, + "loss": 1.0515, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.860302686691284, + "rewards/margins": 1.8153678178787231, + "rewards/rejected": 1.044934868812561, + "step": 263 + }, + { + "epoch": 0.04, + "learning_rate": 7.135135135135136e-06, + "logits/chosen": -0.419750839471817, + "logits/rejected": -0.38671672344207764, + "logps/chosen": -117.01793670654297, + "logps/rejected": -121.44218444824219, + "loss": 0.7664, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9144248962402344, + "rewards/margins": 0.6569068431854248, + "rewards/rejected": 1.2575180530548096, + "step": 264 + }, + { + "epoch": 0.04, + "learning_rate": 7.162162162162163e-06, + "logits/chosen": -0.10959067940711975, + "logits/rejected": -0.16007813811302185, + "logps/chosen": -61.504947662353516, + "logps/rejected": -38.655399322509766, + "loss": 0.8844, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4380809962749481, + "rewards/margins": -1.0254600048065186, + "rewards/rejected": 1.463541030883789, + "step": 265 + }, + { + "epoch": 0.04, + "learning_rate": 7.189189189189189e-06, + "logits/chosen": -0.8217937350273132, + "logits/rejected": -0.5897722244262695, + "logps/chosen": -136.2455596923828, + "logps/rejected": -184.05706787109375, + "loss": 0.7491, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7133285999298096, + "rewards/margins": -1.1948699951171875, + "rewards/rejected": 3.908198595046997, + "step": 266 + }, + { + "epoch": 0.04, + "learning_rate": 7.216216216216216e-06, + "logits/chosen": -0.3303723633289337, + "logits/rejected": -0.2991628050804138, + "logps/chosen": -199.38668823242188, + "logps/rejected": -206.53854370117188, + "loss": 0.5356, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9314544200897217, + "rewards/margins": 1.3268370628356934, + "rewards/rejected": 2.6046173572540283, + "step": 267 + }, + { + "epoch": 0.04, + "learning_rate": 7.243243243243244e-06, + "logits/chosen": -0.5145074725151062, + "logits/rejected": -0.48301440477371216, + "logps/chosen": -118.58301544189453, + "logps/rejected": -133.4744415283203, + "loss": 0.5987, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9162964224815369, + "rewards/margins": 0.06120610237121582, + "rewards/rejected": 0.855090320110321, + "step": 268 + }, + { + "epoch": 0.04, + "learning_rate": 7.270270270270271e-06, + "logits/chosen": -0.6645691394805908, + "logits/rejected": -0.5625127553939819, + "logps/chosen": -121.33076477050781, + "logps/rejected": -20.44670867919922, + "loss": 0.1206, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4828643798828125, + "rewards/margins": 2.244030475616455, + "rewards/rejected": 0.23883400857448578, + "step": 269 + }, + { + "epoch": 0.04, + "learning_rate": 7.297297297297298e-06, + "logits/chosen": -0.5796568989753723, + "logits/rejected": -0.559922456741333, + "logps/chosen": -112.19935607910156, + "logps/rejected": -69.89230346679688, + "loss": 0.845, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.746210515499115, + "rewards/margins": -1.3247222900390625, + "rewards/rejected": 2.0709328651428223, + "step": 270 + }, + { + "epoch": 0.04, + "learning_rate": 7.324324324324325e-06, + "logits/chosen": -0.4267655313014984, + "logits/rejected": -0.3737596869468689, + "logps/chosen": -35.18921661376953, + "logps/rejected": -13.71591854095459, + "loss": 0.4293, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7916290163993835, + "rewards/margins": -0.038120388984680176, + "rewards/rejected": 0.8297494053840637, + "step": 271 + }, + { + "epoch": 0.04, + "learning_rate": 7.3513513513513525e-06, + "logits/chosen": -0.49337664246559143, + "logits/rejected": -0.43890833854675293, + "logps/chosen": -60.42906951904297, + "logps/rejected": -87.84027099609375, + "loss": 0.6551, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0054664611816406, + "rewards/margins": -0.47071385383605957, + "rewards/rejected": 1.4761803150177002, + "step": 272 + }, + { + "epoch": 0.04, + "learning_rate": 7.3783783783783794e-06, + "logits/chosen": -0.12027273327112198, + "logits/rejected": -0.12027273327112198, + "logps/chosen": -72.90975952148438, + "logps/rejected": -72.90975952148438, + "loss": 0.7566, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.913928210735321, + "rewards/margins": 0.0, + "rewards/rejected": 0.913928210735321, + "step": 273 + }, + { + "epoch": 0.04, + "learning_rate": 7.4054054054054055e-06, + "logits/chosen": -0.6866872310638428, + "logits/rejected": -0.6838214993476868, + "logps/chosen": -66.29534912109375, + "logps/rejected": -29.236085891723633, + "loss": 0.8468, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7634018659591675, + "rewards/margins": 0.6372072696685791, + "rewards/rejected": 1.1261945962905884, + "step": 274 + }, + { + "epoch": 0.04, + "learning_rate": 7.4324324324324324e-06, + "logits/chosen": -0.5301135778427124, + "logits/rejected": -0.4469902515411377, + "logps/chosen": -103.37748718261719, + "logps/rejected": -162.56736755371094, + "loss": 0.6862, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.018702745437622, + "rewards/margins": -0.3854949474334717, + "rewards/rejected": 2.4041976928710938, + "step": 275 + }, + { + "epoch": 0.04, + "learning_rate": 7.45945945945946e-06, + "logits/chosen": -0.35047227144241333, + "logits/rejected": -0.37482646107673645, + "logps/chosen": -56.27423095703125, + "logps/rejected": -77.22215270996094, + "loss": 0.4166, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.56805419921875, + "rewards/margins": 0.2980484068393707, + "rewards/rejected": 0.2700057923793793, + "step": 276 + }, + { + "epoch": 0.04, + "learning_rate": 7.486486486486487e-06, + "logits/chosen": -0.5765478610992432, + "logits/rejected": -0.5005326271057129, + "logps/chosen": -77.9109115600586, + "logps/rejected": -76.31784057617188, + "loss": 1.2638, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2618201971054077, + "rewards/margins": -0.2947777509689331, + "rewards/rejected": 1.5565979480743408, + "step": 277 + }, + { + "epoch": 0.05, + "learning_rate": 7.513513513513514e-06, + "logits/chosen": -0.11391573399305344, + "logits/rejected": -0.1579018384218216, + "logps/chosen": -53.8552131652832, + "logps/rejected": -72.2514877319336, + "loss": 0.8456, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.768606960773468, + "rewards/margins": -0.6932826638221741, + "rewards/rejected": 1.461889624595642, + "step": 278 + }, + { + "epoch": 0.05, + "learning_rate": 7.540540540540541e-06, + "logits/chosen": -0.8761853575706482, + "logits/rejected": -0.7881235480308533, + "logps/chosen": -140.78567504882812, + "logps/rejected": -111.49227142333984, + "loss": 1.3263, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9456207752227783, + "rewards/margins": -1.7854743003845215, + "rewards/rejected": 3.7310950756073, + "step": 279 + }, + { + "epoch": 0.05, + "learning_rate": 7.567567567567569e-06, + "logits/chosen": -0.46765846014022827, + "logits/rejected": -0.362008273601532, + "logps/chosen": -182.8135223388672, + "logps/rejected": -213.86094665527344, + "loss": 0.5356, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.071789503097534, + "rewards/margins": -0.5803146362304688, + "rewards/rejected": 3.652104139328003, + "step": 280 + }, + { + "epoch": 0.05, + "learning_rate": 7.594594594594596e-06, + "logits/chosen": -0.32429561018943787, + "logits/rejected": -0.2891373932361603, + "logps/chosen": -65.37638854980469, + "logps/rejected": -70.685546875, + "loss": 0.2714, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1183899641036987, + "rewards/margins": 0.5297546982765198, + "rewards/rejected": 0.588635265827179, + "step": 281 + }, + { + "epoch": 0.05, + "learning_rate": 7.621621621621622e-06, + "logits/chosen": -0.4569520950317383, + "logits/rejected": -0.48044052720069885, + "logps/chosen": -75.54312133789062, + "logps/rejected": -91.20838928222656, + "loss": 0.6835, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4993385076522827, + "rewards/margins": 0.038819074630737305, + "rewards/rejected": 1.4605194330215454, + "step": 282 + }, + { + "epoch": 0.05, + "learning_rate": 7.648648648648649e-06, + "logits/chosen": -0.5300353765487671, + "logits/rejected": -0.5467429757118225, + "logps/chosen": -116.62547302246094, + "logps/rejected": -81.73649597167969, + "loss": 0.6794, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6563690304756165, + "rewards/margins": -0.5139251351356506, + "rewards/rejected": 1.170294165611267, + "step": 283 + }, + { + "epoch": 0.05, + "learning_rate": 7.675675675675676e-06, + "logits/chosen": -0.454026997089386, + "logits/rejected": -0.4164271354675293, + "logps/chosen": -139.10494995117188, + "logps/rejected": -128.2135467529297, + "loss": 1.1876, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8006699085235596, + "rewards/margins": 1.2162888050079346, + "rewards/rejected": 0.584381103515625, + "step": 284 + }, + { + "epoch": 0.05, + "learning_rate": 7.702702702702704e-06, + "logits/chosen": 0.06395858526229858, + "logits/rejected": 0.061525698751211166, + "logps/chosen": -14.114311218261719, + "logps/rejected": -5.117259979248047, + "loss": 0.4662, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.22850589454174042, + "rewards/margins": -0.35113197565078735, + "rewards/rejected": 0.12262606620788574, + "step": 285 + }, + { + "epoch": 0.05, + "learning_rate": 7.72972972972973e-06, + "logits/chosen": -0.3764650225639343, + "logits/rejected": -0.3764650225639343, + "logps/chosen": -25.52888298034668, + "logps/rejected": -25.52888298034668, + "loss": 0.845, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0306615829467773, + "rewards/margins": 0.0, + "rewards/rejected": 1.0306615829467773, + "step": 286 + }, + { + "epoch": 0.05, + "learning_rate": 7.756756756756756e-06, + "logits/chosen": -0.4152284264564514, + "logits/rejected": -0.3167814016342163, + "logps/chosen": -131.38531494140625, + "logps/rejected": -87.55381774902344, + "loss": 0.1075, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.8602447509765625, + "rewards/margins": 3.1529860496520996, + "rewards/rejected": 1.7072585821151733, + "step": 287 + }, + { + "epoch": 0.05, + "learning_rate": 7.783783783783784e-06, + "logits/chosen": -0.24283216893672943, + "logits/rejected": -0.19698242843151093, + "logps/chosen": -43.753623962402344, + "logps/rejected": -39.819114685058594, + "loss": 1.9685, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7786914706230164, + "rewards/margins": 0.0033614635467529297, + "rewards/rejected": 0.7753300070762634, + "step": 288 + }, + { + "epoch": 0.05, + "learning_rate": 7.810810810810812e-06, + "logits/chosen": -0.1951722949743271, + "logits/rejected": -0.1951722949743271, + "logps/chosen": -31.60072135925293, + "logps/rejected": -31.60072135925293, + "loss": 1.9587, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.27795812487602234, + "rewards/margins": 0.0, + "rewards/rejected": 0.27795812487602234, + "step": 289 + }, + { + "epoch": 0.05, + "learning_rate": 7.837837837837838e-06, + "logits/chosen": -0.6177215576171875, + "logits/rejected": -0.6319472789764404, + "logps/chosen": -115.8067855834961, + "logps/rejected": -100.31466674804688, + "loss": 1.9071, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.16336289048194885, + "rewards/margins": -1.9101661443710327, + "rewards/rejected": 2.073529005050659, + "step": 290 + }, + { + "epoch": 0.05, + "learning_rate": 7.864864864864866e-06, + "logits/chosen": -0.9212203621864319, + "logits/rejected": -0.9275326132774353, + "logps/chosen": -56.95507049560547, + "logps/rejected": -79.95561218261719, + "loss": 0.1606, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.139646291732788, + "rewards/margins": 1.8602495193481445, + "rewards/rejected": 0.27939683198928833, + "step": 291 + }, + { + "epoch": 0.05, + "learning_rate": 7.891891891891894e-06, + "logits/chosen": -0.31271007657051086, + "logits/rejected": -0.25306838750839233, + "logps/chosen": -149.69686889648438, + "logps/rejected": -155.1887969970703, + "loss": 1.4921, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.383676141500473, + "rewards/margins": -2.5701661109924316, + "rewards/rejected": 2.9538421630859375, + "step": 292 + }, + { + "epoch": 0.05, + "learning_rate": 7.91891891891892e-06, + "logits/chosen": -0.43429678678512573, + "logits/rejected": -0.44065508246421814, + "logps/chosen": -3.3223018646240234, + "logps/rejected": -11.006916046142578, + "loss": 0.5411, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06927838176488876, + "rewards/margins": 0.0016486644744873047, + "rewards/rejected": 0.06762971729040146, + "step": 293 + }, + { + "epoch": 0.05, + "learning_rate": 7.945945945945946e-06, + "logits/chosen": -0.5737243890762329, + "logits/rejected": -0.5492347478866577, + "logps/chosen": -64.08099365234375, + "logps/rejected": -61.52092742919922, + "loss": 0.4244, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8340644836425781, + "rewards/margins": 0.40911865234375, + "rewards/rejected": 1.4249458312988281, + "step": 294 + }, + { + "epoch": 0.05, + "learning_rate": 7.972972972972974e-06, + "logits/chosen": -0.3465483486652374, + "logits/rejected": -0.3261931240558624, + "logps/chosen": -76.35345458984375, + "logps/rejected": -88.81427001953125, + "loss": 0.3205, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8406036496162415, + "rewards/margins": 0.6007232666015625, + "rewards/rejected": 0.23988036811351776, + "step": 295 + }, + { + "epoch": 0.05, + "learning_rate": 8.000000000000001e-06, + "logits/chosen": -0.4961633086204529, + "logits/rejected": -0.41706302762031555, + "logps/chosen": -101.70747375488281, + "logps/rejected": -15.664767265319824, + "loss": 1.5452, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7545806765556335, + "rewards/margins": 0.20532596111297607, + "rewards/rejected": 0.5492547154426575, + "step": 296 + }, + { + "epoch": 0.05, + "learning_rate": 8.027027027027027e-06, + "logits/chosen": -0.8575960397720337, + "logits/rejected": -0.8921543955802917, + "logps/chosen": -127.20940399169922, + "logps/rejected": -57.94717788696289, + "loss": 1.0408, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.035813141614198685, + "rewards/margins": -0.2820712924003601, + "rewards/rejected": 0.3178844451904297, + "step": 297 + }, + { + "epoch": 0.05, + "learning_rate": 8.054054054054055e-06, + "logits/chosen": -0.06280798465013504, + "logits/rejected": -0.06929250061511993, + "logps/chosen": -143.85675048828125, + "logps/rejected": -59.697479248046875, + "loss": 0.5406, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1078659296035767, + "rewards/margins": -0.5561492443084717, + "rewards/rejected": 1.6640151739120483, + "step": 298 + }, + { + "epoch": 0.05, + "learning_rate": 8.081081081081081e-06, + "logits/chosen": -0.7494542002677917, + "logits/rejected": -0.680593729019165, + "logps/chosen": -122.91658020019531, + "logps/rejected": -51.6643180847168, + "loss": 0.5365, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9156143069267273, + "rewards/margins": 0.0829501748085022, + "rewards/rejected": 0.8326641321182251, + "step": 299 + }, + { + "epoch": 0.05, + "learning_rate": 8.108108108108109e-06, + "logits/chosen": -0.4717361629009247, + "logits/rejected": -0.36663109064102173, + "logps/chosen": -64.86531066894531, + "logps/rejected": -59.55781173706055, + "loss": 0.3702, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.027264356613159, + "rewards/margins": 0.4210246801376343, + "rewards/rejected": 1.606239676475525, + "step": 300 + }, + { + "epoch": 0.05, + "learning_rate": 8.135135135135137e-06, + "logits/chosen": -0.4598202705383301, + "logits/rejected": -0.4214410185813904, + "logps/chosen": -199.19430541992188, + "logps/rejected": -134.08157348632812, + "loss": 0.5801, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4714691638946533, + "rewards/margins": -0.04375004768371582, + "rewards/rejected": 3.515219211578369, + "step": 301 + }, + { + "epoch": 0.05, + "learning_rate": 8.162162162162163e-06, + "logits/chosen": -0.25537124276161194, + "logits/rejected": -0.25537124276161194, + "logps/chosen": -43.272621154785156, + "logps/rejected": -43.272621154785156, + "loss": 0.6361, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.201886773109436, + "rewards/margins": 0.0, + "rewards/rejected": 1.201886773109436, + "step": 302 + }, + { + "epoch": 0.05, + "learning_rate": 8.189189189189189e-06, + "logits/chosen": -0.15899893641471863, + "logits/rejected": -0.15786297619342804, + "logps/chosen": -3.1423957347869873, + "logps/rejected": -1.8352115154266357, + "loss": 0.4171, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2103487253189087, + "rewards/margins": -0.030344441533088684, + "rewards/rejected": 0.24069316685199738, + "step": 303 + }, + { + "epoch": 0.05, + "learning_rate": 8.216216216216217e-06, + "logits/chosen": -0.7261577844619751, + "logits/rejected": -0.7218644618988037, + "logps/chosen": -58.07544708251953, + "logps/rejected": -75.53610229492188, + "loss": 0.6669, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3272148370742798, + "rewards/margins": -0.7612007856369019, + "rewards/rejected": 2.0884156227111816, + "step": 304 + }, + { + "epoch": 0.05, + "learning_rate": 8.243243243243245e-06, + "logits/chosen": -0.5513230562210083, + "logits/rejected": -0.4586784839630127, + "logps/chosen": -53.443328857421875, + "logps/rejected": -84.36640930175781, + "loss": 1.6415, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4760208129882812, + "rewards/margins": -2.351802110671997, + "rewards/rejected": 3.8278229236602783, + "step": 305 + }, + { + "epoch": 0.05, + "learning_rate": 8.27027027027027e-06, + "logits/chosen": -0.5031084418296814, + "logits/rejected": -0.4758879542350769, + "logps/chosen": -83.66217041015625, + "logps/rejected": -76.66629791259766, + "loss": 0.6737, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3349594175815582, + "rewards/margins": 0.2539558410644531, + "rewards/rejected": 0.0810035690665245, + "step": 306 + }, + { + "epoch": 0.05, + "learning_rate": 8.297297297297298e-06, + "logits/chosen": -0.6138209104537964, + "logits/rejected": -0.5590410232543945, + "logps/chosen": -114.84789276123047, + "logps/rejected": -67.81812286376953, + "loss": 1.3994, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2710762023925781, + "rewards/margins": -0.8674416542053223, + "rewards/rejected": 2.1385178565979004, + "step": 307 + }, + { + "epoch": 0.05, + "learning_rate": 8.324324324324326e-06, + "logits/chosen": 0.11230668425559998, + "logits/rejected": 0.13095642626285553, + "logps/chosen": -4.604063510894775, + "logps/rejected": -8.898099899291992, + "loss": 0.7762, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12973637878894806, + "rewards/margins": 0.0640440508723259, + "rewards/rejected": 0.06569232791662216, + "step": 308 + }, + { + "epoch": 0.05, + "learning_rate": 8.351351351351352e-06, + "logits/chosen": -0.496889591217041, + "logits/rejected": -0.49604061245918274, + "logps/chosen": -66.01382446289062, + "logps/rejected": -66.33280944824219, + "loss": 0.7353, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9988746643066406, + "rewards/margins": -0.10877692699432373, + "rewards/rejected": 1.1076515913009644, + "step": 309 + }, + { + "epoch": 0.05, + "learning_rate": 8.378378378378378e-06, + "logits/chosen": -0.30541861057281494, + "logits/rejected": -0.2964977025985718, + "logps/chosen": -61.83455276489258, + "logps/rejected": -97.37834167480469, + "loss": 0.5745, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7152767181396484, + "rewards/margins": 0.45639458298683167, + "rewards/rejected": 0.2588821351528168, + "step": 310 + }, + { + "epoch": 0.05, + "learning_rate": 8.405405405405406e-06, + "logits/chosen": -0.37490323185920715, + "logits/rejected": -0.4017786383628845, + "logps/chosen": -69.94365692138672, + "logps/rejected": -43.80485153198242, + "loss": 1.0263, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6321701407432556, + "rewards/margins": -0.576519787311554, + "rewards/rejected": 1.2086899280548096, + "step": 311 + }, + { + "epoch": 0.05, + "learning_rate": 8.432432432432434e-06, + "logits/chosen": -0.46352964639663696, + "logits/rejected": -0.4257551431655884, + "logps/chosen": -126.91966247558594, + "logps/rejected": -62.720298767089844, + "loss": 1.8353, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3145294189453125, + "rewards/margins": -0.9944695234298706, + "rewards/rejected": 1.308998942375183, + "step": 312 + }, + { + "epoch": 0.05, + "learning_rate": 8.45945945945946e-06, + "logits/chosen": -0.43662917613983154, + "logits/rejected": -0.33315309882164, + "logps/chosen": -102.99406433105469, + "logps/rejected": -19.96674919128418, + "loss": 0.5317, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2115448713302612, + "rewards/margins": 0.7167927026748657, + "rewards/rejected": 0.4947521388530731, + "step": 313 + }, + { + "epoch": 0.05, + "learning_rate": 8.486486486486488e-06, + "logits/chosen": -0.34311607480049133, + "logits/rejected": -0.3408963084220886, + "logps/chosen": -25.545333862304688, + "logps/rejected": -22.305702209472656, + "loss": 1.2928, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.12831497192382812, + "rewards/margins": -0.5796711444854736, + "rewards/rejected": 0.4513561427593231, + "step": 314 + }, + { + "epoch": 0.05, + "learning_rate": 8.513513513513514e-06, + "logits/chosen": -0.6797809600830078, + "logits/rejected": -0.7320390343666077, + "logps/chosen": -143.3366241455078, + "logps/rejected": -91.69696044921875, + "loss": 0.2441, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.741209506988525, + "rewards/margins": 0.49324655532836914, + "rewards/rejected": 4.247962951660156, + "step": 315 + }, + { + "epoch": 0.05, + "learning_rate": 8.540540540540542e-06, + "logits/chosen": -0.7199043035507202, + "logits/rejected": -0.6471121311187744, + "logps/chosen": -90.09335327148438, + "logps/rejected": -74.01091003417969, + "loss": 0.7623, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7194008231163025, + "rewards/margins": -0.8857811093330383, + "rewards/rejected": 1.6051819324493408, + "step": 316 + }, + { + "epoch": 0.05, + "learning_rate": 8.567567567567568e-06, + "logits/chosen": -0.48920655250549316, + "logits/rejected": -0.31943610310554504, + "logps/chosen": -53.37885284423828, + "logps/rejected": -13.669550895690918, + "loss": 0.1437, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4963982105255127, + "rewards/margins": 2.0251317024230957, + "rewards/rejected": 0.4712664783000946, + "step": 317 + }, + { + "epoch": 0.05, + "learning_rate": 8.594594594594595e-06, + "logits/chosen": -0.49429982900619507, + "logits/rejected": -0.4314880073070526, + "logps/chosen": -113.6270980834961, + "logps/rejected": -17.679527282714844, + "loss": 0.3789, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0713013410568237, + "rewards/margins": 0.6815162897109985, + "rewards/rejected": 0.3897850215435028, + "step": 318 + }, + { + "epoch": 0.05, + "learning_rate": 8.621621621621622e-06, + "logits/chosen": -0.48401787877082825, + "logits/rejected": -0.3914058208465576, + "logps/chosen": -117.02652740478516, + "logps/rejected": -184.55007934570312, + "loss": 1.2627, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0060951709747314, + "rewards/margins": -2.4032890796661377, + "rewards/rejected": 3.409384250640869, + "step": 319 + }, + { + "epoch": 0.05, + "learning_rate": 8.64864864864865e-06, + "logits/chosen": -0.9566572904586792, + "logits/rejected": -0.9709179401397705, + "logps/chosen": -80.78709411621094, + "logps/rejected": -63.23046112060547, + "loss": 1.3393, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.248577833175659, + "rewards/margins": 1.8300285339355469, + "rewards/rejected": 0.4185493588447571, + "step": 320 + }, + { + "epoch": 0.05, + "learning_rate": 8.675675675675677e-06, + "logits/chosen": -0.5753427743911743, + "logits/rejected": -0.618905782699585, + "logps/chosen": -139.14999389648438, + "logps/rejected": -117.58961486816406, + "loss": 0.1509, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5589141845703125, + "rewards/margins": 1.11031174659729, + "rewards/rejected": 2.4486024379730225, + "step": 321 + }, + { + "epoch": 0.05, + "learning_rate": 8.702702702702703e-06, + "logits/chosen": -0.4251304864883423, + "logits/rejected": -0.43055909872055054, + "logps/chosen": -10.32309341430664, + "logps/rejected": -2.1006107330322266, + "loss": 1.2093, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.12319622188806534, + "rewards/margins": -0.390415757894516, + "rewards/rejected": 0.26721954345703125, + "step": 322 + }, + { + "epoch": 0.05, + "learning_rate": 8.72972972972973e-06, + "logits/chosen": -0.4913829565048218, + "logits/rejected": -0.48254838585853577, + "logps/chosen": -39.90951156616211, + "logps/rejected": -77.63725280761719, + "loss": 0.6551, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5762557983398438, + "rewards/margins": -0.6690413951873779, + "rewards/rejected": 2.2452971935272217, + "step": 323 + }, + { + "epoch": 0.05, + "learning_rate": 8.756756756756759e-06, + "logits/chosen": -1.450566053390503, + "logits/rejected": -1.4697514772415161, + "logps/chosen": -30.721986770629883, + "logps/rejected": -21.698925018310547, + "loss": 0.4356, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1152594089508057, + "rewards/margins": 0.16003155708312988, + "rewards/rejected": 0.9552278518676758, + "step": 324 + }, + { + "epoch": 0.05, + "learning_rate": 8.783783783783785e-06, + "logits/chosen": -0.26981717348098755, + "logits/rejected": -0.27157914638519287, + "logps/chosen": -6.665643215179443, + "logps/rejected": -3.2234973907470703, + "loss": 2.7283, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3622414171695709, + "rewards/margins": 0.0439453125, + "rewards/rejected": 0.3182961046695709, + "step": 325 + }, + { + "epoch": 0.05, + "learning_rate": 8.810810810810811e-06, + "logits/chosen": -0.698486864566803, + "logits/rejected": -0.5895633101463318, + "logps/chosen": -306.4192810058594, + "logps/rejected": -163.27056884765625, + "loss": 1.5014, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3076629638671875, + "rewards/margins": -2.89794921875, + "rewards/rejected": 5.2056121826171875, + "step": 326 + }, + { + "epoch": 0.05, + "learning_rate": 8.837837837837839e-06, + "logits/chosen": -0.5324114561080933, + "logits/rejected": -0.5324114561080933, + "logps/chosen": -64.591064453125, + "logps/rejected": -64.591064453125, + "loss": 0.6052, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1268905401229858, + "rewards/margins": 0.0, + "rewards/rejected": 1.1268905401229858, + "step": 327 + }, + { + "epoch": 0.05, + "learning_rate": 8.864864864864866e-06, + "logits/chosen": -0.6786962151527405, + "logits/rejected": -0.6170807480812073, + "logps/chosen": -70.346923828125, + "logps/rejected": -83.83063507080078, + "loss": 0.7554, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4483659267425537, + "rewards/margins": -0.0879354476928711, + "rewards/rejected": 2.536301374435425, + "step": 328 + }, + { + "epoch": 0.05, + "learning_rate": 8.891891891891893e-06, + "logits/chosen": -0.5584442019462585, + "logits/rejected": -0.416290819644928, + "logps/chosen": -68.97056579589844, + "logps/rejected": -74.76520538330078, + "loss": 0.2777, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.149066209793091, + "rewards/margins": 0.4377021789550781, + "rewards/rejected": 1.7113640308380127, + "step": 329 + }, + { + "epoch": 0.05, + "learning_rate": 8.91891891891892e-06, + "logits/chosen": -0.5550109148025513, + "logits/rejected": -0.5088756084442139, + "logps/chosen": -65.20993041992188, + "logps/rejected": -76.40754699707031, + "loss": 0.6581, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5776108503341675, + "rewards/margins": -0.48010551929473877, + "rewards/rejected": 2.0577163696289062, + "step": 330 + }, + { + "epoch": 0.05, + "learning_rate": 8.945945945945946e-06, + "logits/chosen": -0.49624887108802795, + "logits/rejected": -0.36985790729522705, + "logps/chosen": -63.520957946777344, + "logps/rejected": -18.121450424194336, + "loss": 0.2956, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3825302124023438, + "rewards/margins": 2.1618659496307373, + "rewards/rejected": 0.22066421806812286, + "step": 331 + }, + { + "epoch": 0.05, + "learning_rate": 8.972972972972974e-06, + "logits/chosen": -0.21551167964935303, + "logits/rejected": -0.2314784675836563, + "logps/chosen": -28.809009552001953, + "logps/rejected": -22.295949935913086, + "loss": 0.566, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.44886454939842224, + "rewards/margins": -0.43738630414009094, + "rewards/rejected": -0.011478233151137829, + "step": 332 + }, + { + "epoch": 0.05, + "learning_rate": 9e-06, + "logits/chosen": -0.6883687376976013, + "logits/rejected": -0.6348608732223511, + "logps/chosen": -60.47346115112305, + "logps/rejected": -43.545082092285156, + "loss": 1.2493, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9930286407470703, + "rewards/margins": -0.25416719913482666, + "rewards/rejected": 1.247195839881897, + "step": 333 + }, + { + "epoch": 0.05, + "learning_rate": 9.027027027027028e-06, + "logits/chosen": -0.48168089985847473, + "logits/rejected": -0.3941109776496887, + "logps/chosen": -131.2890167236328, + "logps/rejected": -83.94004821777344, + "loss": 1.5877, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9691330194473267, + "rewards/margins": -0.6331809759140015, + "rewards/rejected": 1.6023139953613281, + "step": 334 + }, + { + "epoch": 0.05, + "learning_rate": 9.054054054054054e-06, + "logits/chosen": -0.6787780523300171, + "logits/rejected": -0.6867193579673767, + "logps/chosen": -13.580673217773438, + "logps/rejected": -1.4941753149032593, + "loss": 1.1631, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.2151205986738205, + "rewards/margins": -0.3837011158466339, + "rewards/rejected": 0.16858051717281342, + "step": 335 + }, + { + "epoch": 0.05, + "learning_rate": 9.081081081081082e-06, + "logits/chosen": -0.38995882868766785, + "logits/rejected": -0.3968876600265503, + "logps/chosen": -57.224769592285156, + "logps/rejected": -91.63072204589844, + "loss": 0.9434, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7745246887207031, + "rewards/margins": -0.48016583919525146, + "rewards/rejected": 1.2546905279159546, + "step": 336 + }, + { + "epoch": 0.05, + "learning_rate": 9.10810810810811e-06, + "logits/chosen": -0.5275914669036865, + "logits/rejected": -0.5199727416038513, + "logps/chosen": -166.86630249023438, + "logps/rejected": -193.94216918945312, + "loss": 0.6772, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1065690517425537, + "rewards/margins": -0.9948241710662842, + "rewards/rejected": 4.101393222808838, + "step": 337 + }, + { + "epoch": 0.05, + "learning_rate": 9.135135135135136e-06, + "logits/chosen": -0.7459405660629272, + "logits/rejected": -0.7025982141494751, + "logps/chosen": -88.59495544433594, + "logps/rejected": -111.82389831542969, + "loss": 0.5568, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.590252637863159, + "rewards/margins": 0.4387679100036621, + "rewards/rejected": 3.151484727859497, + "step": 338 + }, + { + "epoch": 0.06, + "learning_rate": 9.162162162162162e-06, + "logits/chosen": -0.17929622530937195, + "logits/rejected": -0.22249449789524078, + "logps/chosen": -82.63052368164062, + "logps/rejected": -147.5269317626953, + "loss": 0.9608, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7769317626953125, + "rewards/margins": 1.2119308710098267, + "rewards/rejected": -0.4349990785121918, + "step": 339 + }, + { + "epoch": 0.06, + "learning_rate": 9.189189189189191e-06, + "logits/chosen": -0.6256023049354553, + "logits/rejected": -0.5614812970161438, + "logps/chosen": -38.40522003173828, + "logps/rejected": -7.395800590515137, + "loss": 1.6429, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6748287677764893, + "rewards/margins": 0.9740585684776306, + "rewards/rejected": 0.7007701992988586, + "step": 340 + }, + { + "epoch": 0.06, + "learning_rate": 9.216216216216217e-06, + "logits/chosen": -0.4284626543521881, + "logits/rejected": -0.3838454484939575, + "logps/chosen": -64.06928253173828, + "logps/rejected": -45.80847930908203, + "loss": 1.2767, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.790814995765686, + "rewards/margins": -0.790473222732544, + "rewards/rejected": 1.58128821849823, + "step": 341 + }, + { + "epoch": 0.06, + "learning_rate": 9.243243243243243e-06, + "logits/chosen": -0.39201006293296814, + "logits/rejected": -0.39993107318878174, + "logps/chosen": -19.755977630615234, + "logps/rejected": -3.9521641731262207, + "loss": 0.709, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0045883655548096, + "rewards/margins": 0.7407395839691162, + "rewards/rejected": 0.26384878158569336, + "step": 342 + }, + { + "epoch": 0.06, + "learning_rate": 9.270270270270271e-06, + "logits/chosen": -0.5174297094345093, + "logits/rejected": -0.559615969657898, + "logps/chosen": -51.45703125, + "logps/rejected": -67.31831359863281, + "loss": 1.0604, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7262687683105469, + "rewards/margins": -0.8200356960296631, + "rewards/rejected": 2.54630446434021, + "step": 343 + }, + { + "epoch": 0.06, + "learning_rate": 9.297297297297299e-06, + "logits/chosen": -0.36874499917030334, + "logits/rejected": -0.36503270268440247, + "logps/chosen": -6.674489498138428, + "logps/rejected": -3.690516710281372, + "loss": 0.5751, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.10461373627185822, + "rewards/margins": -0.13861939311027527, + "rewards/rejected": 0.24323312938213348, + "step": 344 + }, + { + "epoch": 0.06, + "learning_rate": 9.324324324324325e-06, + "logits/chosen": -0.2065931260585785, + "logits/rejected": -0.21736155450344086, + "logps/chosen": -82.31595611572266, + "logps/rejected": -69.97293853759766, + "loss": 0.7454, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3818405866622925, + "rewards/margins": -0.8723992109298706, + "rewards/rejected": 2.254239797592163, + "step": 345 + }, + { + "epoch": 0.06, + "learning_rate": 9.351351351351353e-06, + "logits/chosen": -0.4815380275249481, + "logits/rejected": -0.47798001766204834, + "logps/chosen": -92.28070831298828, + "logps/rejected": -103.99251556396484, + "loss": 0.7878, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1243599653244019, + "rewards/margins": 0.652687132358551, + "rewards/rejected": 0.47167283296585083, + "step": 346 + }, + { + "epoch": 0.06, + "learning_rate": 9.378378378378379e-06, + "logits/chosen": -0.4057600796222687, + "logits/rejected": -0.5315192341804504, + "logps/chosen": -60.77193832397461, + "logps/rejected": -63.17349624633789, + "loss": 0.9552, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6640575528144836, + "rewards/margins": -0.11445391178131104, + "rewards/rejected": 0.7785114645957947, + "step": 347 + }, + { + "epoch": 0.06, + "learning_rate": 9.405405405405407e-06, + "logits/chosen": -0.6153685450553894, + "logits/rejected": -0.5773225426673889, + "logps/chosen": -46.40425109863281, + "logps/rejected": -25.81399917602539, + "loss": 1.2695, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.09406738728284836, + "rewards/margins": -0.5731827020645142, + "rewards/rejected": 0.6672500967979431, + "step": 348 + }, + { + "epoch": 0.06, + "learning_rate": 9.432432432432433e-06, + "logits/chosen": -0.20983672142028809, + "logits/rejected": -0.20886193215847015, + "logps/chosen": -5.000154495239258, + "logps/rejected": -18.958303451538086, + "loss": 0.4407, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46401387453079224, + "rewards/margins": 0.07201233506202698, + "rewards/rejected": 0.39200153946876526, + "step": 349 + }, + { + "epoch": 0.06, + "learning_rate": 9.45945945945946e-06, + "logits/chosen": -0.4890599250793457, + "logits/rejected": -0.4877145290374756, + "logps/chosen": -100.36521911621094, + "logps/rejected": -104.75135803222656, + "loss": 0.7618, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2970016598701477, + "rewards/margins": 0.23106995224952698, + "rewards/rejected": 0.06593170017004013, + "step": 350 + }, + { + "epoch": 0.06, + "learning_rate": 9.486486486486487e-06, + "logits/chosen": -0.6829345226287842, + "logits/rejected": -0.6727765798568726, + "logps/chosen": -264.56591796875, + "logps/rejected": -145.3578643798828, + "loss": 0.2446, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.918429851531982, + "rewards/margins": 0.8365039825439453, + "rewards/rejected": 4.081925868988037, + "step": 351 + }, + { + "epoch": 0.06, + "learning_rate": 9.513513513513514e-06, + "logits/chosen": -0.2049439400434494, + "logits/rejected": -0.2243448942899704, + "logps/chosen": -56.430259704589844, + "logps/rejected": -52.198822021484375, + "loss": 0.7733, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1411956548690796, + "rewards/margins": -1.1274970769882202, + "rewards/rejected": 2.2686927318573, + "step": 352 + }, + { + "epoch": 0.06, + "learning_rate": 9.540540540540542e-06, + "logits/chosen": -0.6326491832733154, + "logits/rejected": -0.5943217277526855, + "logps/chosen": -65.40185546875, + "logps/rejected": -124.55590057373047, + "loss": 3.3495, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.307870626449585, + "rewards/margins": -2.9946205615997314, + "rewards/rejected": 5.302491188049316, + "step": 353 + }, + { + "epoch": 0.06, + "learning_rate": 9.567567567567568e-06, + "logits/chosen": -0.38731369376182556, + "logits/rejected": -0.38731369376182556, + "logps/chosen": -28.24608612060547, + "logps/rejected": -28.24608612060547, + "loss": 0.4276, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2899872064590454, + "rewards/margins": 0.0, + "rewards/rejected": 1.2899872064590454, + "step": 354 + }, + { + "epoch": 0.06, + "learning_rate": 9.594594594594594e-06, + "logits/chosen": -0.243162602186203, + "logits/rejected": -0.2574932873249054, + "logps/chosen": -51.27798843383789, + "logps/rejected": -34.245758056640625, + "loss": 0.9204, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.44439926743507385, + "rewards/margins": -0.3757217228412628, + "rewards/rejected": 0.8201209902763367, + "step": 355 + }, + { + "epoch": 0.06, + "learning_rate": 9.621621621621622e-06, + "logits/chosen": -0.6676461696624756, + "logits/rejected": -0.6663308143615723, + "logps/chosen": -155.33978271484375, + "logps/rejected": -165.0880584716797, + "loss": 0.8298, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.304208278656006, + "rewards/margins": 0.4055953025817871, + "rewards/rejected": 3.8986129760742188, + "step": 356 + }, + { + "epoch": 0.06, + "learning_rate": 9.64864864864865e-06, + "logits/chosen": -0.9680883884429932, + "logits/rejected": -0.9422879815101624, + "logps/chosen": -94.29291534423828, + "logps/rejected": -141.2462921142578, + "loss": 1.5241, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7429512739181519, + "rewards/margins": 1.7476799488067627, + "rewards/rejected": -0.0047286986373364925, + "step": 357 + }, + { + "epoch": 0.06, + "learning_rate": 9.675675675675676e-06, + "logits/chosen": -0.6155257821083069, + "logits/rejected": -0.43187013268470764, + "logps/chosen": -89.30162048339844, + "logps/rejected": -19.779857635498047, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.338761806488037, + "rewards/margins": 4.671536922454834, + "rewards/rejected": 0.6672247052192688, + "step": 358 + }, + { + "epoch": 0.06, + "learning_rate": 9.702702702702704e-06, + "logits/chosen": -0.5955896377563477, + "logits/rejected": -0.5558439493179321, + "logps/chosen": -102.1772689819336, + "logps/rejected": -37.07979965209961, + "loss": 0.6753, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4533836543560028, + "rewards/margins": 0.07266885042190552, + "rewards/rejected": 0.3807148039340973, + "step": 359 + }, + { + "epoch": 0.06, + "learning_rate": 9.729729729729732e-06, + "logits/chosen": -0.2428319752216339, + "logits/rejected": -0.24733129143714905, + "logps/chosen": -75.43546295166016, + "logps/rejected": -122.94139099121094, + "loss": 1.0188, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.231150820851326, + "rewards/margins": 0.011708065867424011, + "rewards/rejected": 0.21944275498390198, + "step": 360 + }, + { + "epoch": 0.06, + "learning_rate": 9.756756756756758e-06, + "logits/chosen": -0.6800702214241028, + "logits/rejected": -0.6554678082466125, + "logps/chosen": -59.40484619140625, + "logps/rejected": -38.267494201660156, + "loss": 0.5739, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.45224228501319885, + "rewards/margins": -0.46936914324760437, + "rewards/rejected": 0.9216114282608032, + "step": 361 + }, + { + "epoch": 0.06, + "learning_rate": 9.783783783783785e-06, + "logits/chosen": -0.6174260377883911, + "logits/rejected": -0.6496756076812744, + "logps/chosen": -90.08123016357422, + "logps/rejected": -51.421546936035156, + "loss": 0.6233, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.121608018875122, + "rewards/margins": -0.6566978693008423, + "rewards/rejected": 1.7783058881759644, + "step": 362 + }, + { + "epoch": 0.06, + "learning_rate": 9.810810810810811e-06, + "logits/chosen": -0.6743595004081726, + "logits/rejected": -0.6691867709159851, + "logps/chosen": -71.1065444946289, + "logps/rejected": -84.1654052734375, + "loss": 0.6657, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7230339050292969, + "rewards/margins": -0.6199805736541748, + "rewards/rejected": 2.3430144786834717, + "step": 363 + }, + { + "epoch": 0.06, + "learning_rate": 9.83783783783784e-06, + "logits/chosen": -0.5149006247520447, + "logits/rejected": -0.5149006247520447, + "logps/chosen": -0.7066545486450195, + "logps/rejected": -0.7066545486450195, + "loss": 0.6009, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.18547387421131134, + "rewards/margins": 0.0, + "rewards/rejected": 0.18547387421131134, + "step": 364 + }, + { + "epoch": 0.06, + "learning_rate": 9.864864864864865e-06, + "logits/chosen": -0.6667061448097229, + "logits/rejected": -0.6719810962677002, + "logps/chosen": -79.56892395019531, + "logps/rejected": -79.50450134277344, + "loss": 1.1545, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9977859854698181, + "rewards/margins": -1.84810471534729, + "rewards/rejected": 2.845890760421753, + "step": 365 + }, + { + "epoch": 0.06, + "learning_rate": 9.891891891891893e-06, + "logits/chosen": -0.8543103933334351, + "logits/rejected": -0.788999617099762, + "logps/chosen": -105.53317260742188, + "logps/rejected": -95.43896484375, + "loss": 1.5586, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.730183482170105, + "rewards/margins": -2.7412872314453125, + "rewards/rejected": 4.471470832824707, + "step": 366 + }, + { + "epoch": 0.06, + "learning_rate": 9.91891891891892e-06, + "logits/chosen": -0.4200245141983032, + "logits/rejected": -0.37365785241127014, + "logps/chosen": -64.25897216796875, + "logps/rejected": -57.26987838745117, + "loss": 0.3981, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6199111938476562, + "rewards/margins": 0.09955096244812012, + "rewards/rejected": 1.5203602313995361, + "step": 367 + }, + { + "epoch": 0.06, + "learning_rate": 9.945945945945947e-06, + "logits/chosen": -0.9090340733528137, + "logits/rejected": -0.8372100591659546, + "logps/chosen": -61.28740692138672, + "logps/rejected": -76.99655151367188, + "loss": 1.011, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7258399724960327, + "rewards/margins": -1.4836434125900269, + "rewards/rejected": 3.2094833850860596, + "step": 368 + }, + { + "epoch": 0.06, + "learning_rate": 9.972972972972975e-06, + "logits/chosen": -0.4040333330631256, + "logits/rejected": -0.3175643980503082, + "logps/chosen": -40.018428802490234, + "logps/rejected": -48.146995544433594, + "loss": 0.525, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.379943609237671, + "rewards/margins": 1.6510319709777832, + "rewards/rejected": 0.7289115786552429, + "step": 369 + }, + { + "epoch": 0.06, + "learning_rate": 1e-05, + "logits/chosen": -0.45652851462364197, + "logits/rejected": -0.34402841329574585, + "logps/chosen": -64.89192199707031, + "logps/rejected": -10.187543869018555, + "loss": 0.1064, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5968093872070312, + "rewards/margins": 1.998934268951416, + "rewards/rejected": 0.5978750586509705, + "step": 370 + }, + { + "epoch": 0.06, + "learning_rate": 9.999999827273657e-06, + "logits/chosen": -0.6211113333702087, + "logits/rejected": -0.5366843342781067, + "logps/chosen": -83.47526550292969, + "logps/rejected": -134.87350463867188, + "loss": 1.3081, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6608489751815796, + "rewards/margins": 1.483866810798645, + "rewards/rejected": 0.176982119679451, + "step": 371 + }, + { + "epoch": 0.06, + "learning_rate": 9.999999309094633e-06, + "logits/chosen": -0.42778998613357544, + "logits/rejected": -0.46493837237358093, + "logps/chosen": -292.25439453125, + "logps/rejected": -67.40526580810547, + "loss": 0.3083, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3812530040740967, + "rewards/margins": 0.605863094329834, + "rewards/rejected": 1.7753899097442627, + "step": 372 + }, + { + "epoch": 0.06, + "learning_rate": 9.999998445462969e-06, + "logits/chosen": -0.5584471821784973, + "logits/rejected": -0.5798459649085999, + "logps/chosen": -89.85227966308594, + "logps/rejected": -174.52621459960938, + "loss": 1.3002, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2193939685821533, + "rewards/margins": -1.8985307216644287, + "rewards/rejected": 5.117924690246582, + "step": 373 + }, + { + "epoch": 0.06, + "learning_rate": 9.999997236378723e-06, + "logits/chosen": -0.3905222415924072, + "logits/rejected": -0.19199161231517792, + "logps/chosen": -103.94282531738281, + "logps/rejected": -66.38993835449219, + "loss": 0.086, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.382688999176025, + "rewards/margins": 2.28641676902771, + "rewards/rejected": 2.0962722301483154, + "step": 374 + }, + { + "epoch": 0.06, + "learning_rate": 9.999995681841979e-06, + "logits/chosen": -1.0716191530227661, + "logits/rejected": -1.0431020259857178, + "logps/chosen": -89.55866241455078, + "logps/rejected": -181.06007385253906, + "loss": 0.9053, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.176770806312561, + "rewards/margins": -0.8035697937011719, + "rewards/rejected": 1.980340600013733, + "step": 375 + }, + { + "epoch": 0.06, + "learning_rate": 9.999993781852842e-06, + "logits/chosen": -0.3008856475353241, + "logits/rejected": -0.09651866555213928, + "logps/chosen": -72.88656616210938, + "logps/rejected": -112.69959259033203, + "loss": 2.4995, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3063850402832031, + "rewards/margins": -1.7203004360198975, + "rewards/rejected": 3.0266854763031006, + "step": 376 + }, + { + "epoch": 0.06, + "learning_rate": 9.999991536411447e-06, + "logits/chosen": -0.3377732038497925, + "logits/rejected": -0.3377732038497925, + "logps/chosen": -82.28802490234375, + "logps/rejected": -82.28802490234375, + "loss": 0.372, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6157439947128296, + "rewards/margins": 0.0, + "rewards/rejected": 1.6157439947128296, + "step": 377 + }, + { + "epoch": 0.06, + "learning_rate": 9.999988945517944e-06, + "logits/chosen": -0.19720743596553802, + "logits/rejected": -0.15118253231048584, + "logps/chosen": -108.56809997558594, + "logps/rejected": -58.91706466674805, + "loss": 1.3118, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3461517095565796, + "rewards/margins": -0.4904972314834595, + "rewards/rejected": 1.836648941040039, + "step": 378 + }, + { + "epoch": 0.06, + "learning_rate": 9.999986009172517e-06, + "logits/chosen": -0.44962847232818604, + "logits/rejected": -0.391620934009552, + "logps/chosen": -84.76294708251953, + "logps/rejected": -66.20850372314453, + "loss": 1.3378, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.291815996170044, + "rewards/margins": -1.209191083908081, + "rewards/rejected": 2.501007080078125, + "step": 379 + }, + { + "epoch": 0.06, + "learning_rate": 9.999982727375367e-06, + "logits/chosen": -0.7721155881881714, + "logits/rejected": -0.6911370754241943, + "logps/chosen": -59.282466888427734, + "logps/rejected": -23.899147033691406, + "loss": 0.1624, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6167758703231812, + "rewards/margins": 1.4152781963348389, + "rewards/rejected": 0.2014976590871811, + "step": 380 + }, + { + "epoch": 0.06, + "learning_rate": 9.999979100126723e-06, + "logits/chosen": -0.8855939507484436, + "logits/rejected": -0.8854695558547974, + "logps/chosen": -93.7998046875, + "logps/rejected": -137.75802612304688, + "loss": 3.04, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6637222170829773, + "rewards/margins": -4.125839710235596, + "rewards/rejected": 4.789561748504639, + "step": 381 + }, + { + "epoch": 0.06, + "learning_rate": 9.999975127426831e-06, + "logits/chosen": -0.6106223464012146, + "logits/rejected": -0.530592143535614, + "logps/chosen": -51.244178771972656, + "logps/rejected": -76.84086608886719, + "loss": 0.9322, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8475837707519531, + "rewards/margins": -0.78204345703125, + "rewards/rejected": 2.629627227783203, + "step": 382 + }, + { + "epoch": 0.06, + "learning_rate": 9.999970809275968e-06, + "logits/chosen": -0.9958774447441101, + "logits/rejected": -0.9284671545028687, + "logps/chosen": -82.2879409790039, + "logps/rejected": -75.94694519042969, + "loss": 0.5128, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.878683567047119, + "rewards/margins": -0.42203831672668457, + "rewards/rejected": 3.3007218837738037, + "step": 383 + }, + { + "epoch": 0.06, + "learning_rate": 9.999966145674433e-06, + "logits/chosen": -0.7262387871742249, + "logits/rejected": -0.7262387871742249, + "logps/chosen": -2.0570602416992188, + "logps/rejected": -2.0570602416992188, + "loss": 0.4837, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6485873460769653, + "rewards/margins": 0.0, + "rewards/rejected": 0.6485873460769653, + "step": 384 + }, + { + "epoch": 0.06, + "learning_rate": 9.999961136622548e-06, + "logits/chosen": -0.8149638772010803, + "logits/rejected": -0.6667890548706055, + "logps/chosen": -140.89529418945312, + "logps/rejected": -73.05198669433594, + "loss": 0.0841, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.583645820617676, + "rewards/margins": 3.854383945465088, + "rewards/rejected": 1.7292617559432983, + "step": 385 + }, + { + "epoch": 0.06, + "learning_rate": 9.999955782120656e-06, + "logits/chosen": -1.09994637966156, + "logits/rejected": -1.047536849975586, + "logps/chosen": -87.62387084960938, + "logps/rejected": -59.899559020996094, + "loss": 2.0689, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2750412225723267, + "rewards/margins": -1.8131386041641235, + "rewards/rejected": 3.08817982673645, + "step": 386 + }, + { + "epoch": 0.06, + "learning_rate": 9.999950082169132e-06, + "logits/chosen": -0.9503659009933472, + "logits/rejected": -0.911098062992096, + "logps/chosen": -248.10427856445312, + "logps/rejected": -42.033111572265625, + "loss": 0.0195, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.951620578765869, + "rewards/margins": 4.505518913269043, + "rewards/rejected": 0.44610175490379333, + "step": 387 + }, + { + "epoch": 0.06, + "learning_rate": 9.999944036768366e-06, + "logits/chosen": -1.0511187314987183, + "logits/rejected": -1.0276294946670532, + "logps/chosen": -82.48953247070312, + "logps/rejected": -85.93863677978516, + "loss": 0.5952, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6512062549591064, + "rewards/margins": -0.5694968700408936, + "rewards/rejected": 2.220703125, + "step": 388 + }, + { + "epoch": 0.06, + "learning_rate": 9.999937645918777e-06, + "logits/chosen": -0.5561637282371521, + "logits/rejected": -0.585045576095581, + "logps/chosen": -164.99365234375, + "logps/rejected": -67.21111297607422, + "loss": 0.3732, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.465445041656494, + "rewards/margins": 1.6610016822814941, + "rewards/rejected": 0.804443359375, + "step": 389 + }, + { + "epoch": 0.06, + "learning_rate": 9.999930909620807e-06, + "logits/chosen": -1.3555892705917358, + "logits/rejected": -1.2339868545532227, + "logps/chosen": -70.390869140625, + "logps/rejected": -99.54331970214844, + "loss": 1.4418, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1808273792266846, + "rewards/margins": -1.7311341762542725, + "rewards/rejected": 4.911961555480957, + "step": 390 + }, + { + "epoch": 0.06, + "learning_rate": 9.999923827874922e-06, + "logits/chosen": -0.8472676873207092, + "logits/rejected": -0.8642930388450623, + "logps/chosen": -52.434112548828125, + "logps/rejected": -51.23066711425781, + "loss": 0.6343, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1445239782333374, + "rewards/margins": -0.877142071723938, + "rewards/rejected": 2.0216660499572754, + "step": 391 + }, + { + "epoch": 0.06, + "learning_rate": 9.999916400681608e-06, + "logits/chosen": -0.6753020882606506, + "logits/rejected": -0.5554223656654358, + "logps/chosen": -111.0845947265625, + "logps/rejected": -41.858680725097656, + "loss": 0.6205, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6855742931365967, + "rewards/margins": 3.389127254486084, + "rewards/rejected": 0.2964470088481903, + "step": 392 + }, + { + "epoch": 0.06, + "learning_rate": 9.999908628041382e-06, + "logits/chosen": -0.8002011179924011, + "logits/rejected": -0.688954770565033, + "logps/chosen": -40.652374267578125, + "logps/rejected": -55.31637191772461, + "loss": 0.6331, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3905857801437378, + "rewards/margins": -0.28992760181427, + "rewards/rejected": 1.6805133819580078, + "step": 393 + }, + { + "epoch": 0.06, + "learning_rate": 9.999900509954779e-06, + "logits/chosen": -0.6659075617790222, + "logits/rejected": -0.6659075617790222, + "logps/chosen": -51.81819152832031, + "logps/rejected": -51.81819152832031, + "loss": 0.3692, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.024118423461914062, + "rewards/margins": 0.0, + "rewards/rejected": -0.024118423461914062, + "step": 394 + }, + { + "epoch": 0.06, + "learning_rate": 9.99989204642236e-06, + "logits/chosen": -0.4294048249721527, + "logits/rejected": -0.4606705605983734, + "logps/chosen": -52.80230712890625, + "logps/rejected": -70.98595428466797, + "loss": 0.7398, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.062058210372925, + "rewards/margins": -0.5907869338989258, + "rewards/rejected": 2.6528451442718506, + "step": 395 + }, + { + "epoch": 0.06, + "learning_rate": 9.99988323744471e-06, + "logits/chosen": -0.9282757639884949, + "logits/rejected": -0.8860697746276855, + "logps/chosen": -100.5160140991211, + "logps/rejected": -50.73949432373047, + "loss": 0.7916, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.49018630385398865, + "rewards/margins": -0.4516460597515106, + "rewards/rejected": 0.9418323636054993, + "step": 396 + }, + { + "epoch": 0.06, + "learning_rate": 9.999874083022437e-06, + "logits/chosen": -0.7548536658287048, + "logits/rejected": -0.8750247359275818, + "logps/chosen": -215.0258331298828, + "logps/rejected": -138.83657836914062, + "loss": 1.728, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2703323364257812, + "rewards/margins": -2.787397861480713, + "rewards/rejected": 6.057730197906494, + "step": 397 + }, + { + "epoch": 0.06, + "learning_rate": 9.999864583156176e-06, + "logits/chosen": -0.7353742718696594, + "logits/rejected": -0.646490752696991, + "logps/chosen": -87.63317108154297, + "logps/rejected": -80.13179779052734, + "loss": 0.2466, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.534259796142578, + "rewards/margins": 1.3668243885040283, + "rewards/rejected": 2.16743540763855, + "step": 398 + }, + { + "epoch": 0.06, + "learning_rate": 9.99985473784658e-06, + "logits/chosen": -0.7873914837837219, + "logits/rejected": -0.759459912776947, + "logps/chosen": -111.84617614746094, + "logps/rejected": -65.00360107421875, + "loss": 1.2909, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7865874767303467, + "rewards/margins": -1.023594856262207, + "rewards/rejected": 3.8101823329925537, + "step": 399 + }, + { + "epoch": 0.06, + "learning_rate": 9.999844547094331e-06, + "logits/chosen": -0.5938705205917358, + "logits/rejected": -0.5938705205917358, + "logps/chosen": -121.09465026855469, + "logps/rejected": -121.09465026855469, + "loss": 0.3656, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4742110967636108, + "rewards/margins": 0.0, + "rewards/rejected": 1.4742110967636108, + "step": 400 + }, + { + "epoch": 0.07, + "learning_rate": 9.999834010900132e-06, + "logits/chosen": -0.627332329750061, + "logits/rejected": -0.5646047592163086, + "logps/chosen": -53.0100212097168, + "logps/rejected": -142.82174682617188, + "loss": 1.9973, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4535038471221924, + "rewards/margins": -1.506547212600708, + "rewards/rejected": 4.9600510597229, + "step": 401 + }, + { + "epoch": 0.07, + "learning_rate": 9.999823129264712e-06, + "logits/chosen": -0.6219439506530762, + "logits/rejected": -0.5913634896278381, + "logps/chosen": -22.630128860473633, + "logps/rejected": -110.27232360839844, + "loss": 1.6935, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.548039436340332, + "rewards/margins": -3.108398199081421, + "rewards/rejected": 3.656437635421753, + "step": 402 + }, + { + "epoch": 0.07, + "learning_rate": 9.999811902188823e-06, + "logits/chosen": -0.1884494125843048, + "logits/rejected": -0.18283551931381226, + "logps/chosen": -18.656482696533203, + "logps/rejected": -21.655803680419922, + "loss": 0.6234, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.03280143812298775, + "rewards/margins": -0.025915909558534622, + "rewards/rejected": 0.05871734768152237, + "step": 403 + }, + { + "epoch": 0.07, + "learning_rate": 9.999800329673241e-06, + "logits/chosen": -0.5637255907058716, + "logits/rejected": -0.5249358415603638, + "logps/chosen": -72.46772766113281, + "logps/rejected": -49.975223541259766, + "loss": 0.3087, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0419601202011108, + "rewards/margins": 0.2749324440956116, + "rewards/rejected": 0.7670276761054993, + "step": 404 + }, + { + "epoch": 0.07, + "learning_rate": 9.999788411718764e-06, + "logits/chosen": -0.456184059381485, + "logits/rejected": -0.41730549931526184, + "logps/chosen": -82.32405090332031, + "logps/rejected": -84.71533966064453, + "loss": 1.4098, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.594197988510132, + "rewards/margins": -1.1159942150115967, + "rewards/rejected": 3.7101922035217285, + "step": 405 + }, + { + "epoch": 0.07, + "learning_rate": 9.999776148326216e-06, + "logits/chosen": -0.5294751524925232, + "logits/rejected": -0.47915202379226685, + "logps/chosen": -48.14524841308594, + "logps/rejected": -33.63380432128906, + "loss": 0.3534, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2690773010253906, + "rewards/margins": 0.9390525817871094, + "rewards/rejected": 1.3300247192382812, + "step": 406 + }, + { + "epoch": 0.07, + "learning_rate": 9.999763539496444e-06, + "logits/chosen": -0.6199308633804321, + "logits/rejected": -0.6296709179878235, + "logps/chosen": -215.40524291992188, + "logps/rejected": -59.40437316894531, + "loss": 1.2193, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.614309787750244, + "rewards/margins": 3.4105072021484375, + "rewards/rejected": 1.203802466392517, + "step": 407 + }, + { + "epoch": 0.07, + "learning_rate": 9.999750585230322e-06, + "logits/chosen": -0.6350545883178711, + "logits/rejected": -0.6393775939941406, + "logps/chosen": -140.47579956054688, + "logps/rejected": -167.34286499023438, + "loss": 0.8984, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.005031108856201, + "rewards/margins": -1.1529312133789062, + "rewards/rejected": 5.157962322235107, + "step": 408 + }, + { + "epoch": 0.07, + "learning_rate": 9.999737285528738e-06, + "logits/chosen": -0.3426356315612793, + "logits/rejected": -0.35006827116012573, + "logps/chosen": -2.893448829650879, + "logps/rejected": -1.5152199268341064, + "loss": 0.6994, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2764627933502197, + "rewards/margins": -0.024847477674484253, + "rewards/rejected": 0.301310271024704, + "step": 409 + }, + { + "epoch": 0.07, + "learning_rate": 9.99972364039262e-06, + "logits/chosen": -0.5888041257858276, + "logits/rejected": -0.4156319499015808, + "logps/chosen": -121.20465087890625, + "logps/rejected": -30.260974884033203, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.604419231414795, + "rewards/margins": 4.152353286743164, + "rewards/rejected": 0.452066034078598, + "step": 410 + }, + { + "epoch": 0.07, + "learning_rate": 9.999709649822904e-06, + "logits/chosen": -0.1223377212882042, + "logits/rejected": -0.11865762621164322, + "logps/chosen": -0.8169300556182861, + "logps/rejected": -10.60782527923584, + "loss": 0.4378, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27376487851142883, + "rewards/margins": 0.1851564198732376, + "rewards/rejected": 0.08860845863819122, + "step": 411 + }, + { + "epoch": 0.07, + "learning_rate": 9.99969531382056e-06, + "logits/chosen": -0.35870644450187683, + "logits/rejected": -0.37023234367370605, + "logps/chosen": -13.100363731384277, + "logps/rejected": -1.2675409317016602, + "loss": 0.783, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.18929986655712128, + "rewards/margins": -0.4864753484725952, + "rewards/rejected": 0.29717549681663513, + "step": 412 + }, + { + "epoch": 0.07, + "learning_rate": 9.999680632386578e-06, + "logits/chosen": -0.6101425290107727, + "logits/rejected": -0.5845712423324585, + "logps/chosen": -219.05535888671875, + "logps/rejected": -123.21439361572266, + "loss": 0.3718, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.063955783843994, + "rewards/margins": 1.7017709016799927, + "rewards/rejected": 1.3621848821640015, + "step": 413 + }, + { + "epoch": 0.07, + "learning_rate": 9.99966560552197e-06, + "logits/chosen": -0.48435553908348083, + "logits/rejected": -0.4151197373867035, + "logps/chosen": -30.54161262512207, + "logps/rejected": -5.972098350524902, + "loss": 0.4818, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7427946329116821, + "rewards/margins": 0.28931552171707153, + "rewards/rejected": 0.4534791111946106, + "step": 414 + }, + { + "epoch": 0.07, + "learning_rate": 9.999650233227776e-06, + "logits/chosen": -0.45657724142074585, + "logits/rejected": -0.46911484003067017, + "logps/chosen": -15.834884643554688, + "logps/rejected": -47.87289047241211, + "loss": 1.2186, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5315044522285461, + "rewards/margins": 0.17025834321975708, + "rewards/rejected": 0.36124610900878906, + "step": 415 + }, + { + "epoch": 0.07, + "learning_rate": 9.99963451550506e-06, + "logits/chosen": -0.5411736369132996, + "logits/rejected": -0.5371279120445251, + "logps/chosen": -141.2910919189453, + "logps/rejected": -89.89002990722656, + "loss": 1.0146, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5167099237442017, + "rewards/margins": -0.6385055780410767, + "rewards/rejected": 2.1552155017852783, + "step": 416 + }, + { + "epoch": 0.07, + "learning_rate": 9.999618452354904e-06, + "logits/chosen": -0.15422503650188446, + "logits/rejected": -0.16407038271427155, + "logps/chosen": -2.9078328609466553, + "logps/rejected": -28.092233657836914, + "loss": 0.3798, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30066803097724915, + "rewards/margins": 0.3437645733356476, + "rewards/rejected": -0.04309654235839844, + "step": 417 + }, + { + "epoch": 0.07, + "learning_rate": 9.99960204377842e-06, + "logits/chosen": -0.9740189909934998, + "logits/rejected": -0.9205109477043152, + "logps/chosen": -130.16122436523438, + "logps/rejected": -203.47874450683594, + "loss": 0.1693, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.058325290679932, + "rewards/margins": 1.7046599388122559, + "rewards/rejected": 4.353665351867676, + "step": 418 + }, + { + "epoch": 0.07, + "learning_rate": 9.999585289776741e-06, + "logits/chosen": -0.6839978098869324, + "logits/rejected": -0.6421657800674438, + "logps/chosen": -263.7747802734375, + "logps/rejected": -52.89398193359375, + "loss": 1.0758, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.815704345703125, + "rewards/margins": -0.01839911937713623, + "rewards/rejected": 1.8341034650802612, + "step": 419 + }, + { + "epoch": 0.07, + "learning_rate": 9.999568190351025e-06, + "logits/chosen": -0.5060741305351257, + "logits/rejected": -0.47986072301864624, + "logps/chosen": -70.7144775390625, + "logps/rejected": -38.57166290283203, + "loss": 0.5185, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1755036115646362, + "rewards/margins": -0.2685633897781372, + "rewards/rejected": 1.4440670013427734, + "step": 420 + }, + { + "epoch": 0.07, + "learning_rate": 9.999550745502455e-06, + "logits/chosen": -0.9507628679275513, + "logits/rejected": -1.0421583652496338, + "logps/chosen": -152.6170654296875, + "logps/rejected": -259.3061218261719, + "loss": 1.9445, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9756653308868408, + "rewards/margins": -3.021658182144165, + "rewards/rejected": 4.997323513031006, + "step": 421 + }, + { + "epoch": 0.07, + "learning_rate": 9.999532955232234e-06, + "logits/chosen": -0.5770989060401917, + "logits/rejected": -0.5509093403816223, + "logps/chosen": -65.5550308227539, + "logps/rejected": -44.85157012939453, + "loss": 0.4452, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.334388017654419, + "rewards/margins": 0.04392516613006592, + "rewards/rejected": 1.290462851524353, + "step": 422 + }, + { + "epoch": 0.07, + "learning_rate": 9.999514819541591e-06, + "logits/chosen": -0.3963908553123474, + "logits/rejected": -0.42522355914115906, + "logps/chosen": -153.662109375, + "logps/rejected": -46.61937713623047, + "loss": 0.2416, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7361481189727783, + "rewards/margins": 0.6053977012634277, + "rewards/rejected": 2.1307504177093506, + "step": 423 + }, + { + "epoch": 0.07, + "learning_rate": 9.999496338431782e-06, + "logits/chosen": -0.27434736490249634, + "logits/rejected": -0.31386783719062805, + "logps/chosen": -10.04806900024414, + "logps/rejected": -79.49365234375, + "loss": 2.2322, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.261728972196579, + "rewards/margins": -3.7480289936065674, + "rewards/rejected": 4.009757995605469, + "step": 424 + }, + { + "epoch": 0.07, + "learning_rate": 9.99947751190408e-06, + "logits/chosen": -0.7671518325805664, + "logits/rejected": -0.7257546186447144, + "logps/chosen": -107.57484436035156, + "logps/rejected": -82.59727478027344, + "loss": 0.0768, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.752510070800781, + "rewards/margins": 2.630394697189331, + "rewards/rejected": 3.12211537361145, + "step": 425 + }, + { + "epoch": 0.07, + "learning_rate": 9.999458339959787e-06, + "logits/chosen": -0.1699574887752533, + "logits/rejected": -0.17620447278022766, + "logps/chosen": -85.91279602050781, + "logps/rejected": -54.84676742553711, + "loss": 0.3254, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0406990051269531, + "rewards/margins": 0.348318874835968, + "rewards/rejected": 0.6923801302909851, + "step": 426 + }, + { + "epoch": 0.07, + "learning_rate": 9.99943882260023e-06, + "logits/chosen": -0.5487872362136841, + "logits/rejected": -0.4887520670890808, + "logps/chosen": -80.78262329101562, + "logps/rejected": -79.95294189453125, + "loss": 0.657, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7373809814453125, + "rewards/margins": 0.2496809959411621, + "rewards/rejected": 2.4876999855041504, + "step": 427 + }, + { + "epoch": 0.07, + "learning_rate": 9.999418959826752e-06, + "logits/chosen": -0.7658848166465759, + "logits/rejected": -0.6895476579666138, + "logps/chosen": -62.9686393737793, + "logps/rejected": -88.50628662109375, + "loss": 0.1894, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8663747310638428, + "rewards/margins": 0.8884503841400146, + "rewards/rejected": 1.9779243469238281, + "step": 428 + }, + { + "epoch": 0.07, + "learning_rate": 9.999398751640733e-06, + "logits/chosen": -0.8743526935577393, + "logits/rejected": -0.8908799290657043, + "logps/chosen": -86.58353424072266, + "logps/rejected": -131.9100341796875, + "loss": 1.6829, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6867721676826477, + "rewards/margins": -3.1430482864379883, + "rewards/rejected": 3.829820394515991, + "step": 429 + }, + { + "epoch": 0.07, + "learning_rate": 9.999378198043561e-06, + "logits/chosen": -0.385913223028183, + "logits/rejected": -0.385913223028183, + "logps/chosen": -4.47573184967041, + "logps/rejected": -4.47573184967041, + "loss": 0.8232, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.040885329246521, + "rewards/margins": 0.0, + "rewards/rejected": 1.040885329246521, + "step": 430 + }, + { + "epoch": 0.07, + "learning_rate": 9.999357299036663e-06, + "logits/chosen": -0.07354751974344254, + "logits/rejected": -0.07446512579917908, + "logps/chosen": -4.395656585693359, + "logps/rejected": -9.800917625427246, + "loss": 0.6056, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15999893844127655, + "rewards/margins": 0.33401408791542053, + "rewards/rejected": -0.17401514947414398, + "step": 431 + }, + { + "epoch": 0.07, + "learning_rate": 9.99933605462148e-06, + "logits/chosen": -0.5802794694900513, + "logits/rejected": -0.5115866661071777, + "logps/chosen": -78.2546615600586, + "logps/rejected": -48.03901672363281, + "loss": 0.8089, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3545615673065186, + "rewards/margins": 1.4208709001541138, + "rewards/rejected": 1.9336906671524048, + "step": 432 + }, + { + "epoch": 0.07, + "learning_rate": 9.999314464799478e-06, + "logits/chosen": -0.7746233940124512, + "logits/rejected": -0.8100113272666931, + "logps/chosen": -74.78851318359375, + "logps/rejected": -99.40338134765625, + "loss": 1.0104, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9629493951797485, + "rewards/margins": -1.698900580406189, + "rewards/rejected": 3.6618499755859375, + "step": 433 + }, + { + "epoch": 0.07, + "learning_rate": 9.999292529572152e-06, + "logits/chosen": -0.37511757016181946, + "logits/rejected": -0.4017540514469147, + "logps/chosen": -34.39573287963867, + "logps/rejected": -82.96332550048828, + "loss": 0.912, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2679424285888672, + "rewards/margins": -0.15394097566604614, + "rewards/rejected": 0.42188340425491333, + "step": 434 + }, + { + "epoch": 0.07, + "learning_rate": 9.999270248941014e-06, + "logits/chosen": -0.7129276394844055, + "logits/rejected": -0.6950557231903076, + "logps/chosen": -71.69268035888672, + "logps/rejected": -106.98333740234375, + "loss": 0.8132, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.400429606437683, + "rewards/margins": -1.071546196937561, + "rewards/rejected": 2.471975803375244, + "step": 435 + }, + { + "epoch": 0.07, + "learning_rate": 9.999247622907607e-06, + "logits/chosen": -0.6763436794281006, + "logits/rejected": -0.6606556177139282, + "logps/chosen": -132.42547607421875, + "logps/rejected": -93.8411636352539, + "loss": 0.8279, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3871827125549316, + "rewards/margins": -1.2677955627441406, + "rewards/rejected": 4.654978275299072, + "step": 436 + }, + { + "epoch": 0.07, + "learning_rate": 9.999224651473492e-06, + "logits/chosen": -0.5910436511039734, + "logits/rejected": -0.577771782875061, + "logps/chosen": -64.33863830566406, + "logps/rejected": -64.68930053710938, + "loss": 0.4714, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4366950988769531, + "rewards/margins": -0.059311747550964355, + "rewards/rejected": 1.4960068464279175, + "step": 437 + }, + { + "epoch": 0.07, + "learning_rate": 9.999201334640256e-06, + "logits/chosen": -0.746282160282135, + "logits/rejected": -0.6192538142204285, + "logps/chosen": -53.17189407348633, + "logps/rejected": -27.48844337463379, + "loss": 0.2486, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.982554316520691, + "rewards/margins": 0.6452871561050415, + "rewards/rejected": 1.3372671604156494, + "step": 438 + }, + { + "epoch": 0.07, + "learning_rate": 9.999177672409512e-06, + "logits/chosen": -0.5662052631378174, + "logits/rejected": -0.5662052631378174, + "logps/chosen": -63.49863815307617, + "logps/rejected": -63.49863815307617, + "loss": 0.759, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6630924344062805, + "rewards/margins": 0.0, + "rewards/rejected": 0.6630924344062805, + "step": 439 + }, + { + "epoch": 0.07, + "learning_rate": 9.999153664782893e-06, + "logits/chosen": -0.542739987373352, + "logits/rejected": -0.5413082242012024, + "logps/chosen": -7.9556474685668945, + "logps/rejected": -3.776505470275879, + "loss": 0.4003, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3406323492527008, + "rewards/margins": -0.13288649916648865, + "rewards/rejected": 0.47351884841918945, + "step": 440 + }, + { + "epoch": 0.07, + "learning_rate": 9.999129311762057e-06, + "logits/chosen": -0.7572247982025146, + "logits/rejected": -0.6148703694343567, + "logps/chosen": -197.89244079589844, + "logps/rejected": -248.7156524658203, + "loss": 2.1216, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4272308349609375, + "rewards/margins": -2.748927593231201, + "rewards/rejected": 6.176158428192139, + "step": 441 + }, + { + "epoch": 0.07, + "learning_rate": 9.99910461334869e-06, + "logits/chosen": -0.4828752279281616, + "logits/rejected": -0.47758224606513977, + "logps/chosen": -69.75314331054688, + "logps/rejected": -60.40615463256836, + "loss": 0.2575, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.452693223953247, + "rewards/margins": 0.801334798336029, + "rewards/rejected": 0.651358425617218, + "step": 442 + }, + { + "epoch": 0.07, + "learning_rate": 9.999079569544494e-06, + "logits/chosen": -0.8536330461502075, + "logits/rejected": -0.8464754819869995, + "logps/chosen": -40.15966033935547, + "logps/rejected": -59.23655700683594, + "loss": 0.2116, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4442901611328125, + "rewards/margins": 0.7505760192871094, + "rewards/rejected": 0.6937141418457031, + "step": 443 + }, + { + "epoch": 0.07, + "learning_rate": 9.999054180351203e-06, + "logits/chosen": -0.5909495949745178, + "logits/rejected": -0.5909495949745178, + "logps/chosen": -0.2434171438217163, + "logps/rejected": -0.2434171438217163, + "loss": 0.6722, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.06352955847978592, + "rewards/margins": 0.0, + "rewards/rejected": 0.06352955847978592, + "step": 444 + }, + { + "epoch": 0.07, + "learning_rate": 9.999028445770569e-06, + "logits/chosen": -0.5312553644180298, + "logits/rejected": -0.5584549903869629, + "logps/chosen": -24.674942016601562, + "logps/rejected": -33.70439910888672, + "loss": 0.4898, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4938930571079254, + "rewards/margins": -0.07639274001121521, + "rewards/rejected": 0.5702857971191406, + "step": 445 + }, + { + "epoch": 0.07, + "learning_rate": 9.99900236580437e-06, + "logits/chosen": -0.641033411026001, + "logits/rejected": -0.6506518721580505, + "logps/chosen": -74.60064697265625, + "logps/rejected": -84.462890625, + "loss": 0.6059, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3057907223701477, + "rewards/margins": -0.7403731942176819, + "rewards/rejected": 1.0461639165878296, + "step": 446 + }, + { + "epoch": 0.07, + "learning_rate": 9.99897594045441e-06, + "logits/chosen": -1.125346064567566, + "logits/rejected": -1.0541346073150635, + "logps/chosen": -118.26776885986328, + "logps/rejected": -38.411659240722656, + "loss": 0.3041, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.659088134765625, + "rewards/margins": 0.4876556396484375, + "rewards/rejected": 0.1714324951171875, + "step": 447 + }, + { + "epoch": 0.07, + "learning_rate": 9.998949169722513e-06, + "logits/chosen": -0.39366796612739563, + "logits/rejected": -0.4087410867214203, + "logps/chosen": -89.53416442871094, + "logps/rejected": -44.186161041259766, + "loss": 1.1156, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.30882492661476135, + "rewards/margins": -0.9318341016769409, + "rewards/rejected": 1.2406589984893799, + "step": 448 + }, + { + "epoch": 0.07, + "learning_rate": 9.99892205361053e-06, + "logits/chosen": -0.4083171486854553, + "logits/rejected": -0.43549346923828125, + "logps/chosen": -40.75157928466797, + "logps/rejected": -55.79447555541992, + "loss": 1.2312, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2353614568710327, + "rewards/margins": -1.2780927419662476, + "rewards/rejected": 2.5134541988372803, + "step": 449 + }, + { + "epoch": 0.07, + "learning_rate": 9.99889459212033e-06, + "logits/chosen": -0.7419711351394653, + "logits/rejected": -0.7394808530807495, + "logps/chosen": -89.3780517578125, + "logps/rejected": -35.75257110595703, + "loss": 0.6967, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.29877015948295593, + "rewards/margins": -1.0999813079833984, + "rewards/rejected": 1.3987514972686768, + "step": 450 + }, + { + "epoch": 0.07, + "learning_rate": 9.998866785253815e-06, + "logits/chosen": -0.7496198415756226, + "logits/rejected": -0.9102134704589844, + "logps/chosen": -154.39920043945312, + "logps/rejected": -93.80066680908203, + "loss": 3.6512, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.012765503488481045, + "rewards/margins": -6.857809066772461, + "rewards/rejected": 6.870574474334717, + "step": 451 + }, + { + "epoch": 0.07, + "learning_rate": 9.998838633012906e-06, + "logits/chosen": -0.35321274399757385, + "logits/rejected": -0.30154556035995483, + "logps/chosen": -44.191795349121094, + "logps/rejected": -16.452571868896484, + "loss": 0.3983, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0534194707870483, + "rewards/margins": 0.6428905129432678, + "rewards/rejected": 0.4105289578437805, + "step": 452 + }, + { + "epoch": 0.07, + "learning_rate": 9.998810135399545e-06, + "logits/chosen": -1.0572917461395264, + "logits/rejected": -1.0098872184753418, + "logps/chosen": -119.19740295410156, + "logps/rejected": -116.77297973632812, + "loss": 0.971, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.65557861328125, + "rewards/margins": 0.3422638177871704, + "rewards/rejected": 1.3133147954940796, + "step": 453 + }, + { + "epoch": 0.07, + "learning_rate": 9.998781292415705e-06, + "logits/chosen": -0.6269446015357971, + "logits/rejected": -0.5832369923591614, + "logps/chosen": -99.8155288696289, + "logps/rejected": -44.478424072265625, + "loss": 0.0679, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.360802412033081, + "rewards/margins": 2.1309878826141357, + "rewards/rejected": 1.2298145294189453, + "step": 454 + }, + { + "epoch": 0.07, + "learning_rate": 9.998752104063376e-06, + "logits/chosen": -0.5126296877861023, + "logits/rejected": -0.5126296877861023, + "logps/chosen": -19.381614685058594, + "logps/rejected": -19.381614685058594, + "loss": 0.8144, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.06304416805505753, + "rewards/margins": 0.0, + "rewards/rejected": 0.06304416805505753, + "step": 455 + }, + { + "epoch": 0.07, + "learning_rate": 9.998722570344575e-06, + "logits/chosen": -0.8120833039283752, + "logits/rejected": -0.7247847318649292, + "logps/chosen": -121.1960678100586, + "logps/rejected": -68.6119384765625, + "loss": 0.3399, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1494271755218506, + "rewards/margins": 0.4586632251739502, + "rewards/rejected": 2.6907639503479004, + "step": 456 + }, + { + "epoch": 0.07, + "learning_rate": 9.998692691261343e-06, + "logits/chosen": -0.6251131892204285, + "logits/rejected": -0.5324440598487854, + "logps/chosen": -129.79974365234375, + "logps/rejected": -98.88525390625, + "loss": 0.6275, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.033590793609619, + "rewards/margins": 2.5488717555999756, + "rewards/rejected": 2.4847190380096436, + "step": 457 + }, + { + "epoch": 0.07, + "learning_rate": 9.998662466815743e-06, + "logits/chosen": -0.2111230343580246, + "logits/rejected": -0.16830573976039886, + "logps/chosen": -51.39461135864258, + "logps/rejected": -108.15548706054688, + "loss": 0.7832, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.161404013633728, + "rewards/margins": 0.5099543929100037, + "rewards/rejected": 0.6514496207237244, + "step": 458 + }, + { + "epoch": 0.07, + "learning_rate": 9.998631897009866e-06, + "logits/chosen": -0.8797751665115356, + "logits/rejected": -0.8326905369758606, + "logps/chosen": -87.2584228515625, + "logps/rejected": -59.278709411621094, + "loss": 0.3382, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7853050231933594, + "rewards/margins": 1.2791671752929688, + "rewards/rejected": 0.5061378479003906, + "step": 459 + }, + { + "epoch": 0.07, + "learning_rate": 9.998600981845821e-06, + "logits/chosen": -0.8631202578544617, + "logits/rejected": -0.8940739035606384, + "logps/chosen": -151.85084533691406, + "logps/rejected": -145.96685791015625, + "loss": 2.2162, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.758363246917725, + "rewards/margins": -0.41504669189453125, + "rewards/rejected": 5.173409938812256, + "step": 460 + }, + { + "epoch": 0.07, + "learning_rate": 9.998569721325746e-06, + "logits/chosen": -0.7685750722885132, + "logits/rejected": -0.6474487781524658, + "logps/chosen": -167.032958984375, + "logps/rejected": -35.77128601074219, + "loss": 0.081, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.750328063964844, + "rewards/margins": 3.4881911277770996, + "rewards/rejected": 1.2621368169784546, + "step": 461 + }, + { + "epoch": 0.07, + "learning_rate": 9.998538115451798e-06, + "logits/chosen": -0.719516396522522, + "logits/rejected": -0.7231394052505493, + "logps/chosen": -41.186641693115234, + "logps/rejected": -19.310762405395508, + "loss": 0.7101, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9156387448310852, + "rewards/margins": 0.5601367950439453, + "rewards/rejected": 0.3555019497871399, + "step": 462 + }, + { + "epoch": 0.08, + "learning_rate": 9.998506164226167e-06, + "logits/chosen": -0.3215147852897644, + "logits/rejected": -0.3125801086425781, + "logps/chosen": -80.94648742675781, + "logps/rejected": -62.80645751953125, + "loss": 0.6723, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2458603382110596, + "rewards/margins": -0.8280854225158691, + "rewards/rejected": 2.0739457607269287, + "step": 463 + }, + { + "epoch": 0.08, + "learning_rate": 9.998473867651053e-06, + "logits/chosen": -0.41489797830581665, + "logits/rejected": -0.4086804986000061, + "logps/chosen": -37.24509811401367, + "logps/rejected": -27.103342056274414, + "loss": 0.4208, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1923442929983139, + "rewards/margins": -0.23883266746997833, + "rewards/rejected": 0.43117696046829224, + "step": 464 + }, + { + "epoch": 0.08, + "learning_rate": 9.998441225728693e-06, + "logits/chosen": -0.8223674297332764, + "logits/rejected": -0.6210578680038452, + "logps/chosen": -122.62875366210938, + "logps/rejected": -16.551687240600586, + "loss": 1.2305, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.156735420227051, + "rewards/margins": 4.6918230056762695, + "rewards/rejected": 0.4649122357368469, + "step": 465 + }, + { + "epoch": 0.08, + "learning_rate": 9.99840823846134e-06, + "logits/chosen": -0.4947105944156647, + "logits/rejected": -0.562181830406189, + "logps/chosen": -92.83828735351562, + "logps/rejected": -171.95211791992188, + "loss": 1.807, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.75398588180542, + "rewards/margins": 0.16662931442260742, + "rewards/rejected": 4.5873565673828125, + "step": 466 + }, + { + "epoch": 0.08, + "learning_rate": 9.99837490585127e-06, + "logits/chosen": -0.5617824196815491, + "logits/rejected": -0.5770268440246582, + "logps/chosen": -25.008455276489258, + "logps/rejected": -57.40690231323242, + "loss": 0.6278, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9475561380386353, + "rewards/margins": 0.36054056882858276, + "rewards/rejected": 0.5870155692100525, + "step": 467 + }, + { + "epoch": 0.08, + "learning_rate": 9.998341227900792e-06, + "logits/chosen": -0.9396131038665771, + "logits/rejected": -0.7887718677520752, + "logps/chosen": -130.33670043945312, + "logps/rejected": -42.678192138671875, + "loss": 0.1983, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.22927713394165, + "rewards/margins": 4.311688423156738, + "rewards/rejected": 1.9175888299942017, + "step": 468 + }, + { + "epoch": 0.08, + "learning_rate": 9.99830720461223e-06, + "logits/chosen": -0.6752126812934875, + "logits/rejected": -0.6870279312133789, + "logps/chosen": -98.35821533203125, + "logps/rejected": -95.10501098632812, + "loss": 1.1572, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8665527701377869, + "rewards/margins": -0.3238510489463806, + "rewards/rejected": 1.1904038190841675, + "step": 469 + }, + { + "epoch": 0.08, + "learning_rate": 9.998272835987933e-06, + "logits/chosen": -0.9047024250030518, + "logits/rejected": -0.9281625151634216, + "logps/chosen": -103.79441833496094, + "logps/rejected": -122.78941345214844, + "loss": 1.8392, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5696853995323181, + "rewards/margins": -1.852961778640747, + "rewards/rejected": 2.42264723777771, + "step": 470 + }, + { + "epoch": 0.08, + "learning_rate": 9.998238122030277e-06, + "logits/chosen": -0.4720640778541565, + "logits/rejected": -0.6142868995666504, + "logps/chosen": -102.09748840332031, + "logps/rejected": -108.4295654296875, + "loss": 3.4669, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.08212127536535263, + "rewards/margins": -5.073473930358887, + "rewards/rejected": 5.155595302581787, + "step": 471 + }, + { + "epoch": 0.08, + "learning_rate": 9.99820306274166e-06, + "logits/chosen": -0.3179042637348175, + "logits/rejected": -0.3809886872768402, + "logps/chosen": -98.02729797363281, + "logps/rejected": -94.60031127929688, + "loss": 1.5039, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2093994617462158, + "rewards/margins": -0.9938316345214844, + "rewards/rejected": 2.2032310962677, + "step": 472 + }, + { + "epoch": 0.08, + "learning_rate": 9.998167658124507e-06, + "logits/chosen": -0.9047409296035767, + "logits/rejected": -0.7901554107666016, + "logps/chosen": -80.18000793457031, + "logps/rejected": -45.61363983154297, + "loss": 0.2745, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2579925060272217, + "rewards/margins": 1.343739628791809, + "rewards/rejected": 0.9142528772354126, + "step": 473 + }, + { + "epoch": 0.08, + "learning_rate": 9.998131908181262e-06, + "logits/chosen": -0.4556410312652588, + "logits/rejected": -0.5034669041633606, + "logps/chosen": -105.7313232421875, + "logps/rejected": -137.26535034179688, + "loss": 2.2627, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9327606558799744, + "rewards/margins": -4.181241035461426, + "rewards/rejected": 5.114001750946045, + "step": 474 + }, + { + "epoch": 0.08, + "learning_rate": 9.998095812914392e-06, + "logits/chosen": -0.20671994984149933, + "logits/rejected": -0.17752812802791595, + "logps/chosen": -65.75345611572266, + "logps/rejected": -47.535614013671875, + "loss": 0.8563, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6901581287384033, + "rewards/margins": 0.533240556716919, + "rewards/rejected": 1.1569175720214844, + "step": 475 + }, + { + "epoch": 0.08, + "learning_rate": 9.998059372326396e-06, + "logits/chosen": -0.5915430188179016, + "logits/rejected": -0.563319981098175, + "logps/chosen": -79.05455017089844, + "logps/rejected": -58.2633171081543, + "loss": 0.4032, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9784584045410156, + "rewards/margins": -0.17184031009674072, + "rewards/rejected": 1.1502987146377563, + "step": 476 + }, + { + "epoch": 0.08, + "learning_rate": 9.998022586419788e-06, + "logits/chosen": -0.447589248418808, + "logits/rejected": -0.513126015663147, + "logps/chosen": -69.22532653808594, + "logps/rejected": -65.98654174804688, + "loss": 0.5419, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.49278724193573, + "rewards/margins": -0.193695068359375, + "rewards/rejected": 1.686482310295105, + "step": 477 + }, + { + "epoch": 0.08, + "learning_rate": 9.997985455197114e-06, + "logits/chosen": -0.5375664830207825, + "logits/rejected": -0.538677453994751, + "logps/chosen": -1.7046217918395996, + "logps/rejected": -2.0264480113983154, + "loss": 0.6748, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.27958089113235474, + "rewards/margins": -0.08183237910270691, + "rewards/rejected": 0.36141327023506165, + "step": 478 + }, + { + "epoch": 0.08, + "learning_rate": 9.997947978660934e-06, + "logits/chosen": -0.8726966977119446, + "logits/rejected": -1.1670366525650024, + "logps/chosen": -89.87974548339844, + "logps/rejected": -36.37171173095703, + "loss": 1.1337, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6633377075195312, + "rewards/margins": 1.2852509021759033, + "rewards/rejected": 0.3780868649482727, + "step": 479 + }, + { + "epoch": 0.08, + "learning_rate": 9.99791015681384e-06, + "logits/chosen": -0.41761359572410583, + "logits/rejected": -0.4654918611049652, + "logps/chosen": -51.192203521728516, + "logps/rejected": -56.108863830566406, + "loss": 1.7418, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.597921371459961, + "rewards/margins": -0.8240108489990234, + "rewards/rejected": 2.4219322204589844, + "step": 480 + }, + { + "epoch": 0.08, + "learning_rate": 9.997871989658446e-06, + "logits/chosen": -0.7162566184997559, + "logits/rejected": -0.7604923248291016, + "logps/chosen": -227.07461547851562, + "logps/rejected": -54.89755630493164, + "loss": 0.1955, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9083220958709717, + "rewards/margins": 1.118025541305542, + "rewards/rejected": 2.7902965545654297, + "step": 481 + }, + { + "epoch": 0.08, + "learning_rate": 9.997833477197386e-06, + "logits/chosen": -0.5926333665847778, + "logits/rejected": -0.5485033392906189, + "logps/chosen": -53.98997497558594, + "logps/rejected": -29.608821868896484, + "loss": 1.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3847579956054688, + "rewards/margins": 0.8740703463554382, + "rewards/rejected": 0.5106876492500305, + "step": 482 + }, + { + "epoch": 0.08, + "learning_rate": 9.997794619433324e-06, + "logits/chosen": -0.7581744194030762, + "logits/rejected": -0.9013305902481079, + "logps/chosen": -148.96783447265625, + "logps/rejected": -120.87214660644531, + "loss": 0.3054, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.6993408203125, + "rewards/margins": 1.4491851329803467, + "rewards/rejected": 3.2501556873321533, + "step": 483 + }, + { + "epoch": 0.08, + "learning_rate": 9.997755416368943e-06, + "logits/chosen": -0.7864773869514465, + "logits/rejected": -0.7101969122886658, + "logps/chosen": -163.59942626953125, + "logps/rejected": -35.79008483886719, + "loss": 0.1807, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.774142742156982, + "rewards/margins": 3.5113611221313477, + "rewards/rejected": 1.2627815008163452, + "step": 484 + }, + { + "epoch": 0.08, + "learning_rate": 9.997715868006952e-06, + "logits/chosen": -0.34894800186157227, + "logits/rejected": -0.34894800186157227, + "logps/chosen": -53.31202697753906, + "logps/rejected": -53.31202697753906, + "loss": 0.3518, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4713363647460938, + "rewards/margins": 0.0, + "rewards/rejected": 2.4713363647460938, + "step": 485 + }, + { + "epoch": 0.08, + "learning_rate": 9.997675974350082e-06, + "logits/chosen": -0.3539597988128662, + "logits/rejected": -0.24276025593280792, + "logps/chosen": -120.74189758300781, + "logps/rejected": -86.2724609375, + "loss": 0.1544, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6052017211914062, + "rewards/margins": 1.2256088256835938, + "rewards/rejected": 1.3795928955078125, + "step": 486 + }, + { + "epoch": 0.08, + "learning_rate": 9.997635735401092e-06, + "logits/chosen": -0.9335947632789612, + "logits/rejected": -0.9079059362411499, + "logps/chosen": -63.839012145996094, + "logps/rejected": -33.066993713378906, + "loss": 0.1951, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0333549976348877, + "rewards/margins": 0.8385940790176392, + "rewards/rejected": 0.19476090371608734, + "step": 487 + }, + { + "epoch": 0.08, + "learning_rate": 9.99759515116276e-06, + "logits/chosen": -0.9402384161949158, + "logits/rejected": -0.9282086491584778, + "logps/chosen": -86.59893798828125, + "logps/rejected": -87.25572204589844, + "loss": 0.3841, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3656333684921265, + "rewards/margins": -0.050018310546875, + "rewards/rejected": 1.4156516790390015, + "step": 488 + }, + { + "epoch": 0.08, + "learning_rate": 9.997554221637892e-06, + "logits/chosen": -0.21734513342380524, + "logits/rejected": -0.1774895340204239, + "logps/chosen": -91.66500854492188, + "logps/rejected": -51.254112243652344, + "loss": 0.7535, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2961013317108154, + "rewards/margins": -0.22857451438903809, + "rewards/rejected": 2.5246758460998535, + "step": 489 + }, + { + "epoch": 0.08, + "learning_rate": 9.997512946829314e-06, + "logits/chosen": -1.2656983137130737, + "logits/rejected": -1.2921823263168335, + "logps/chosen": -101.4007568359375, + "logps/rejected": -39.99856185913086, + "loss": 1.2462, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2704368829727173, + "rewards/margins": 0.9769264459609985, + "rewards/rejected": 0.29351043701171875, + "step": 490 + }, + { + "epoch": 0.08, + "learning_rate": 9.997471326739879e-06, + "logits/chosen": -0.33338290452957153, + "logits/rejected": -0.3300313949584961, + "logps/chosen": -4.233234405517578, + "logps/rejected": -9.014731407165527, + "loss": 1.43, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22115622460842133, + "rewards/margins": 0.3755546808242798, + "rewards/rejected": -0.15439844131469727, + "step": 491 + }, + { + "epoch": 0.08, + "learning_rate": 9.99742936137246e-06, + "logits/chosen": -0.33171501755714417, + "logits/rejected": -0.3261875510215759, + "logps/chosen": -30.282859802246094, + "logps/rejected": -30.973121643066406, + "loss": 0.4311, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13999329507350922, + "rewards/margins": 0.27117979526519775, + "rewards/rejected": -0.13118648529052734, + "step": 492 + }, + { + "epoch": 0.08, + "learning_rate": 9.997387050729958e-06, + "logits/chosen": -0.3738742768764496, + "logits/rejected": -0.37615397572517395, + "logps/chosen": -9.90479850769043, + "logps/rejected": -13.162745475769043, + "loss": 1.3852, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09546804428100586, + "rewards/margins": 0.0800238624215126, + "rewards/rejected": 0.015444183722138405, + "step": 493 + }, + { + "epoch": 0.08, + "learning_rate": 9.997344394815298e-06, + "logits/chosen": -0.7864428758621216, + "logits/rejected": -0.7769027352333069, + "logps/chosen": -123.5482406616211, + "logps/rejected": -112.27481079101562, + "loss": 1.5604, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8365745544433594, + "rewards/margins": 0.17548829317092896, + "rewards/rejected": 0.6610862612724304, + "step": 494 + }, + { + "epoch": 0.08, + "learning_rate": 9.997301393631426e-06, + "logits/chosen": -0.10598950833082199, + "logits/rejected": -0.11566110700368881, + "logps/chosen": -8.419697761535645, + "logps/rejected": -4.088217258453369, + "loss": 0.5717, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.036774542182683945, + "rewards/margins": -0.3016708791255951, + "rewards/rejected": 0.33844542503356934, + "step": 495 + }, + { + "epoch": 0.08, + "learning_rate": 9.997258047181312e-06, + "logits/chosen": -0.558407723903656, + "logits/rejected": -0.5250601172447205, + "logps/chosen": -41.2596435546875, + "logps/rejected": -24.213268280029297, + "loss": 0.18, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6545323133468628, + "rewards/margins": 0.9764883518218994, + "rewards/rejected": 0.6780439615249634, + "step": 496 + }, + { + "epoch": 0.08, + "learning_rate": 9.997214355467952e-06, + "logits/chosen": -0.6353939175605774, + "logits/rejected": -0.6507539749145508, + "logps/chosen": -138.7439727783203, + "logps/rejected": -95.10894775390625, + "loss": 1.7721, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.992419481277466, + "rewards/margins": -1.5179460048675537, + "rewards/rejected": 5.5103654861450195, + "step": 497 + }, + { + "epoch": 0.08, + "learning_rate": 9.997170318494362e-06, + "logits/chosen": -0.2553275525569916, + "logits/rejected": -0.2624662220478058, + "logps/chosen": -3.1815147399902344, + "logps/rejected": -32.21918487548828, + "loss": 0.7218, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3225906491279602, + "rewards/margins": 0.1808345913887024, + "rewards/rejected": 0.1417560577392578, + "step": 498 + }, + { + "epoch": 0.08, + "learning_rate": 9.99712593626359e-06, + "logits/chosen": -0.3773285448551178, + "logits/rejected": -0.3751294016838074, + "logps/chosen": -84.9136962890625, + "logps/rejected": -46.85962677001953, + "loss": 0.7793, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6742538809776306, + "rewards/margins": -0.9992080330848694, + "rewards/rejected": 1.6734619140625, + "step": 499 + }, + { + "epoch": 0.08, + "learning_rate": 9.997081208778696e-06, + "logits/chosen": -0.8974749445915222, + "logits/rejected": -0.6616013646125793, + "logps/chosen": -179.49095153808594, + "logps/rejected": -28.052043914794922, + "loss": 0.0798, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.651072978973389, + "rewards/margins": 5.177116394042969, + "rewards/rejected": 0.4739566743373871, + "step": 500 + }, + { + "epoch": 0.08, + "learning_rate": 9.997036136042774e-06, + "logits/chosen": -0.38259702920913696, + "logits/rejected": -0.42527684569358826, + "logps/chosen": -114.89398193359375, + "logps/rejected": -72.42953491210938, + "loss": 1.0265, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6642937064170837, + "rewards/margins": -0.53653484582901, + "rewards/rejected": 1.2008285522460938, + "step": 501 + }, + { + "epoch": 0.08, + "learning_rate": 9.996990718058939e-06, + "logits/chosen": -0.7701273560523987, + "logits/rejected": -0.7592924237251282, + "logps/chosen": -95.8281478881836, + "logps/rejected": -154.69903564453125, + "loss": 1.4147, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9986885786056519, + "rewards/margins": -1.7586768865585327, + "rewards/rejected": 3.7573654651641846, + "step": 502 + }, + { + "epoch": 0.08, + "learning_rate": 9.996944954830325e-06, + "logits/chosen": -0.7145405411720276, + "logits/rejected": -0.6740918755531311, + "logps/chosen": -64.47389221191406, + "logps/rejected": -57.93291473388672, + "loss": 0.7921, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.243584394454956, + "rewards/margins": -0.3312675952911377, + "rewards/rejected": 2.5748519897460938, + "step": 503 + }, + { + "epoch": 0.08, + "learning_rate": 9.996898846360098e-06, + "logits/chosen": -0.42298412322998047, + "logits/rejected": -0.28888261318206787, + "logps/chosen": -132.1811065673828, + "logps/rejected": -140.25965881347656, + "loss": 0.0827, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.516868591308594, + "rewards/margins": 2.1598448753356934, + "rewards/rejected": 4.3570237159729, + "step": 504 + }, + { + "epoch": 0.08, + "learning_rate": 9.996852392651441e-06, + "logits/chosen": -0.3281063735485077, + "logits/rejected": -0.25900140404701233, + "logps/chosen": -41.312660217285156, + "logps/rejected": -13.772229194641113, + "loss": 0.2562, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6226913928985596, + "rewards/margins": 0.4465625286102295, + "rewards/rejected": 1.17612886428833, + "step": 505 + }, + { + "epoch": 0.08, + "learning_rate": 9.996805593707566e-06, + "logits/chosen": -0.9733811616897583, + "logits/rejected": -0.9127776622772217, + "logps/chosen": -120.02118682861328, + "logps/rejected": -103.25138854980469, + "loss": 0.0244, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.147296905517578, + "rewards/margins": 4.235652923583984, + "rewards/rejected": 0.9116439819335938, + "step": 506 + }, + { + "epoch": 0.08, + "learning_rate": 9.996758449531702e-06, + "logits/chosen": -0.17805159091949463, + "logits/rejected": -0.19942805171012878, + "logps/chosen": -14.909889221191406, + "logps/rejected": -19.726301193237305, + "loss": 0.4558, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.21246032416820526, + "rewards/margins": -0.39419442415237427, + "rewards/rejected": 0.6066547632217407, + "step": 507 + }, + { + "epoch": 0.08, + "learning_rate": 9.99671096012711e-06, + "logits/chosen": -0.8446605205535889, + "logits/rejected": -0.5046588778495789, + "logps/chosen": -134.94277954101562, + "logps/rejected": -64.05032348632812, + "loss": 0.4876, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.698330879211426, + "rewards/margins": 1.406437873840332, + "rewards/rejected": 3.2918930053710938, + "step": 508 + }, + { + "epoch": 0.08, + "learning_rate": 9.99666312549707e-06, + "logits/chosen": 0.02460525929927826, + "logits/rejected": 0.02460525929927826, + "logps/chosen": -1.4481475353240967, + "logps/rejected": -1.4481475353240967, + "loss": 0.4204, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.32154643535614014, + "rewards/margins": 0.0, + "rewards/rejected": 0.32154643535614014, + "step": 509 + }, + { + "epoch": 0.08, + "learning_rate": 9.996614945644887e-06, + "logits/chosen": -0.8960552215576172, + "logits/rejected": -0.8113311529159546, + "logps/chosen": -57.734535217285156, + "logps/rejected": -131.189453125, + "loss": 0.2298, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.701812744140625, + "rewards/margins": 0.866455078125, + "rewards/rejected": 0.835357666015625, + "step": 510 + }, + { + "epoch": 0.08, + "learning_rate": 9.99656642057389e-06, + "logits/chosen": -0.7271076440811157, + "logits/rejected": -0.6289541125297546, + "logps/chosen": -90.6060562133789, + "logps/rejected": -121.31483459472656, + "loss": 0.1343, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0742623805999756, + "rewards/margins": 1.470282793045044, + "rewards/rejected": 0.6039795279502869, + "step": 511 + }, + { + "epoch": 0.08, + "learning_rate": 9.996517550287432e-06, + "logits/chosen": -0.35941746830940247, + "logits/rejected": -0.35941746830940247, + "logps/chosen": -67.99993896484375, + "logps/rejected": -67.99993896484375, + "loss": 0.6295, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.431307315826416, + "rewards/margins": 0.0, + "rewards/rejected": 2.431307315826416, + "step": 512 + }, + { + "epoch": 0.08, + "learning_rate": 9.996468334788887e-06, + "logits/chosen": -0.6779966354370117, + "logits/rejected": -0.609273374080658, + "logps/chosen": -118.990478515625, + "logps/rejected": -62.895843505859375, + "loss": 0.5926, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7017425298690796, + "rewards/margins": -0.27322542667388916, + "rewards/rejected": 1.9749679565429688, + "step": 513 + }, + { + "epoch": 0.08, + "learning_rate": 9.996418774081658e-06, + "logits/chosen": -0.8679193258285522, + "logits/rejected": -0.8707323670387268, + "logps/chosen": -47.101226806640625, + "logps/rejected": -125.5334701538086, + "loss": 0.3641, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9559379816055298, + "rewards/margins": 1.153588891029358, + "rewards/rejected": 0.8023490905761719, + "step": 514 + }, + { + "epoch": 0.08, + "learning_rate": 9.996368868169168e-06, + "logits/chosen": -0.36546966433525085, + "logits/rejected": -0.38682329654693604, + "logps/chosen": -23.83831787109375, + "logps/rejected": -43.977134704589844, + "loss": 0.4704, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7000147104263306, + "rewards/margins": 0.6031061410903931, + "rewards/rejected": 0.0969085693359375, + "step": 515 + }, + { + "epoch": 0.08, + "learning_rate": 9.996318617054866e-06, + "logits/chosen": -0.4264315962791443, + "logits/rejected": -0.38829678297042847, + "logps/chosen": -98.57950592041016, + "logps/rejected": -69.08602142333984, + "loss": 0.4574, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.38317185640335083, + "rewards/margins": -0.37213438749313354, + "rewards/rejected": 0.7553062438964844, + "step": 516 + }, + { + "epoch": 0.08, + "learning_rate": 9.996268020742221e-06, + "logits/chosen": -0.47504061460494995, + "logits/rejected": -0.32200708985328674, + "logps/chosen": -90.62333679199219, + "logps/rejected": -11.537632942199707, + "loss": 0.3235, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5871444940567017, + "rewards/margins": 0.29084378480911255, + "rewards/rejected": 0.2963007092475891, + "step": 517 + }, + { + "epoch": 0.08, + "learning_rate": 9.996217079234734e-06, + "logits/chosen": -0.8945376873016357, + "logits/rejected": -0.6866511106491089, + "logps/chosen": -231.2179412841797, + "logps/rejected": -36.85175704956055, + "loss": 0.3214, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.270878553390503, + "rewards/margins": 2.0715763568878174, + "rewards/rejected": 0.1993023008108139, + "step": 518 + }, + { + "epoch": 0.08, + "learning_rate": 9.996165792535918e-06, + "logits/chosen": -0.5598844289779663, + "logits/rejected": -0.5959324240684509, + "logps/chosen": -70.79824829101562, + "logps/rejected": -118.31336975097656, + "loss": 1.1606, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0594596862792969, + "rewards/margins": 0.16230469942092896, + "rewards/rejected": 0.8971549868583679, + "step": 519 + }, + { + "epoch": 0.08, + "learning_rate": 9.996114160649323e-06, + "logits/chosen": -0.6977422833442688, + "logits/rejected": -0.6671779751777649, + "logps/chosen": -60.121559143066406, + "logps/rejected": -55.98948287963867, + "loss": 0.4286, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7288612127304077, + "rewards/margins": -0.26705360412597656, + "rewards/rejected": 1.9959148168563843, + "step": 520 + }, + { + "epoch": 0.08, + "learning_rate": 9.996062183578511e-06, + "logits/chosen": -0.7951878309249878, + "logits/rejected": -0.8369818329811096, + "logps/chosen": -216.9296875, + "logps/rejected": -39.669044494628906, + "loss": 0.1392, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.062242269515991, + "rewards/margins": 1.3419400453567505, + "rewards/rejected": 1.7203022241592407, + "step": 521 + }, + { + "epoch": 0.08, + "learning_rate": 9.996009861327077e-06, + "logits/chosen": -0.4610086977481842, + "logits/rejected": -0.09387217462062836, + "logps/chosen": -32.12115478515625, + "logps/rejected": -110.50818634033203, + "loss": 1.6324, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0151852369308472, + "rewards/margins": -3.1410975456237793, + "rewards/rejected": 4.156282901763916, + "step": 522 + }, + { + "epoch": 0.08, + "learning_rate": 9.995957193898633e-06, + "logits/chosen": -0.7492886781692505, + "logits/rejected": -0.7582659125328064, + "logps/chosen": -86.86009979248047, + "logps/rejected": -80.36909484863281, + "loss": 1.9498, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4326965808868408, + "rewards/margins": -0.9398689270019531, + "rewards/rejected": 2.372565507888794, + "step": 523 + }, + { + "epoch": 0.09, + "learning_rate": 9.99590418129682e-06, + "logits/chosen": -0.4703103303909302, + "logits/rejected": -0.5339702367782593, + "logps/chosen": -59.34809494018555, + "logps/rejected": -105.21234893798828, + "loss": 1.3237, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8245663046836853, + "rewards/margins": 1.0331257581710815, + "rewards/rejected": -0.20855942368507385, + "step": 524 + }, + { + "epoch": 0.09, + "learning_rate": 9.9958508235253e-06, + "logits/chosen": -0.9125949144363403, + "logits/rejected": -0.9754081964492798, + "logps/chosen": -151.3309326171875, + "logps/rejected": -138.45611572265625, + "loss": 1.5953, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9020798206329346, + "rewards/margins": -2.2309248447418213, + "rewards/rejected": 5.133004665374756, + "step": 525 + }, + { + "epoch": 0.09, + "learning_rate": 9.995797120587758e-06, + "logits/chosen": -0.3690211772918701, + "logits/rejected": -0.3093186318874359, + "logps/chosen": -63.32816696166992, + "logps/rejected": -45.725379943847656, + "loss": 0.3609, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7874943017959595, + "rewards/margins": 0.011267483234405518, + "rewards/rejected": 0.776226818561554, + "step": 526 + }, + { + "epoch": 0.09, + "learning_rate": 9.995743072487906e-06, + "logits/chosen": -0.7628040909767151, + "logits/rejected": -0.7875310182571411, + "logps/chosen": -70.04345703125, + "logps/rejected": -67.32929992675781, + "loss": 0.5182, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5775284171104431, + "rewards/margins": -0.5770408511161804, + "rewards/rejected": 1.1545692682266235, + "step": 527 + }, + { + "epoch": 0.09, + "learning_rate": 9.995688679229478e-06, + "logits/chosen": -0.5632475018501282, + "logits/rejected": -0.4947984218597412, + "logps/chosen": -57.993473052978516, + "logps/rejected": -84.11079406738281, + "loss": 0.331, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.554980993270874, + "rewards/margins": 0.30413246154785156, + "rewards/rejected": 2.2508485317230225, + "step": 528 + }, + { + "epoch": 0.09, + "learning_rate": 9.995633940816233e-06, + "logits/chosen": -0.5810078978538513, + "logits/rejected": -0.5407519340515137, + "logps/chosen": -148.11572265625, + "logps/rejected": -86.51282501220703, + "loss": 1.6017, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1599273681640625, + "rewards/margins": -2.5806405544281006, + "rewards/rejected": 3.740567922592163, + "step": 529 + }, + { + "epoch": 0.09, + "learning_rate": 9.99557885725195e-06, + "logits/chosen": -0.9628244638442993, + "logits/rejected": -0.8286756277084351, + "logps/chosen": -131.77536010742188, + "logps/rejected": -93.9944839477539, + "loss": 0.1356, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.0437164306640625, + "rewards/margins": 4.006539344787598, + "rewards/rejected": 2.037177324295044, + "step": 530 + }, + { + "epoch": 0.09, + "learning_rate": 9.995523428540438e-06, + "logits/chosen": -0.6932198405265808, + "logits/rejected": -0.6937052607536316, + "logps/chosen": -45.81559371948242, + "logps/rejected": -136.73162841796875, + "loss": 1.3073, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.208719253540039, + "rewards/margins": -2.5333995819091797, + "rewards/rejected": 3.7421188354492188, + "step": 531 + }, + { + "epoch": 0.09, + "learning_rate": 9.995467654685525e-06, + "logits/chosen": -0.25431180000305176, + "logits/rejected": -0.24743704497814178, + "logps/chosen": -54.372920989990234, + "logps/rejected": -87.16967010498047, + "loss": 1.295, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7501095533370972, + "rewards/margins": -0.6372660398483276, + "rewards/rejected": 2.387375593185425, + "step": 532 + }, + { + "epoch": 0.09, + "learning_rate": 9.995411535691064e-06, + "logits/chosen": -0.7476645708084106, + "logits/rejected": -0.5974178910255432, + "logps/chosen": -150.00424194335938, + "logps/rejected": -12.414652824401855, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.30404806137085, + "rewards/margins": 4.427340507507324, + "rewards/rejected": 0.8767077326774597, + "step": 533 + }, + { + "epoch": 0.09, + "learning_rate": 9.995355071560933e-06, + "logits/chosen": -0.5358750820159912, + "logits/rejected": -0.4443216621875763, + "logps/chosen": -65.92866516113281, + "logps/rejected": -35.161529541015625, + "loss": 0.4154, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9811394214630127, + "rewards/margins": 0.6134846210479736, + "rewards/rejected": 1.367654800415039, + "step": 534 + }, + { + "epoch": 0.09, + "learning_rate": 9.995298262299033e-06, + "logits/chosen": -0.8153934478759766, + "logits/rejected": -0.7771687507629395, + "logps/chosen": -49.71141815185547, + "logps/rejected": -62.381568908691406, + "loss": 0.6956, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6339867115020752, + "rewards/margins": 0.3880356550216675, + "rewards/rejected": 1.2459510564804077, + "step": 535 + }, + { + "epoch": 0.09, + "learning_rate": 9.99524110790929e-06, + "logits/chosen": -0.6636674404144287, + "logits/rejected": -0.6268017888069153, + "logps/chosen": -110.36166381835938, + "logps/rejected": -65.15939331054688, + "loss": 1.0368, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.48795318603515625, + "rewards/margins": -0.867706298828125, + "rewards/rejected": 1.3556594848632812, + "step": 536 + }, + { + "epoch": 0.09, + "learning_rate": 9.99518360839565e-06, + "logits/chosen": -0.4037022888660431, + "logits/rejected": -0.3841680884361267, + "logps/chosen": -50.60184860229492, + "logps/rejected": -19.381675720214844, + "loss": 0.3777, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2234039306640625, + "rewards/margins": -0.07456475496292114, + "rewards/rejected": 0.29796868562698364, + "step": 537 + }, + { + "epoch": 0.09, + "learning_rate": 9.995125763762089e-06, + "logits/chosen": -0.633635938167572, + "logits/rejected": -0.633635938167572, + "logps/chosen": -19.99736785888672, + "logps/rejected": -19.99736785888672, + "loss": 0.3793, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.17528609931468964, + "rewards/margins": 0.0, + "rewards/rejected": 0.17528609931468964, + "step": 538 + }, + { + "epoch": 0.09, + "learning_rate": 9.995067574012602e-06, + "logits/chosen": -0.8980571627616882, + "logits/rejected": -0.7789928913116455, + "logps/chosen": -167.86473083496094, + "logps/rejected": -42.13158416748047, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.352299690246582, + "rewards/margins": 4.269140720367432, + "rewards/rejected": 1.0831588506698608, + "step": 539 + }, + { + "epoch": 0.09, + "learning_rate": 9.99500903915121e-06, + "logits/chosen": -0.5869081020355225, + "logits/rejected": -0.4788442850112915, + "logps/chosen": -72.9322280883789, + "logps/rejected": -16.57339096069336, + "loss": 0.2834, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6212326288223267, + "rewards/margins": 0.33199408650398254, + "rewards/rejected": 0.2892385423183441, + "step": 540 + }, + { + "epoch": 0.09, + "learning_rate": 9.994950159181955e-06, + "logits/chosen": -1.0367443561553955, + "logits/rejected": -0.9909754395484924, + "logps/chosen": -90.22994232177734, + "logps/rejected": -124.49449157714844, + "loss": 0.8323, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.093275547027588, + "rewards/margins": -0.06443166732788086, + "rewards/rejected": 2.1577072143554688, + "step": 541 + }, + { + "epoch": 0.09, + "learning_rate": 9.994890934108907e-06, + "logits/chosen": -0.6589646935462952, + "logits/rejected": -0.6634554862976074, + "logps/chosen": -126.39125061035156, + "logps/rejected": -112.97175598144531, + "loss": 0.6184, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8666335940361023, + "rewards/margins": -0.7752014994621277, + "rewards/rejected": 1.64183509349823, + "step": 542 + }, + { + "epoch": 0.09, + "learning_rate": 9.994831363936157e-06, + "logits/chosen": -0.8006525039672852, + "logits/rejected": -0.8061873316764832, + "logps/chosen": -80.4658203125, + "logps/rejected": -63.72724914550781, + "loss": 1.4863, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.481008917093277, + "rewards/margins": -0.9151932001113892, + "rewards/rejected": 1.3962020874023438, + "step": 543 + }, + { + "epoch": 0.09, + "learning_rate": 9.994771448667823e-06, + "logits/chosen": -0.6338704824447632, + "logits/rejected": -0.5906277894973755, + "logps/chosen": -182.84719848632812, + "logps/rejected": -74.98015594482422, + "loss": 0.1984, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.269771099090576, + "rewards/margins": 3.0413317680358887, + "rewards/rejected": 2.2284393310546875, + "step": 544 + }, + { + "epoch": 0.09, + "learning_rate": 9.994711188308041e-06, + "logits/chosen": -0.7499282956123352, + "logits/rejected": -0.8493741154670715, + "logps/chosen": -57.051944732666016, + "logps/rejected": -188.84307861328125, + "loss": 2.2636, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.76140558719635, + "rewards/margins": -2.026198387145996, + "rewards/rejected": 3.7876038551330566, + "step": 545 + }, + { + "epoch": 0.09, + "learning_rate": 9.994650582860978e-06, + "logits/chosen": -0.740382730960846, + "logits/rejected": -0.6341003179550171, + "logps/chosen": -142.47727966308594, + "logps/rejected": -15.941146850585938, + "loss": 0.0696, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6618881225585938, + "rewards/margins": 2.473339557647705, + "rewards/rejected": 1.1885484457015991, + "step": 546 + }, + { + "epoch": 0.09, + "learning_rate": 9.99458963233082e-06, + "logits/chosen": -0.3252037763595581, + "logits/rejected": -0.32657453417778015, + "logps/chosen": -8.53258228302002, + "logps/rejected": -20.871248245239258, + "loss": 0.4629, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.019965076819062233, + "rewards/margins": -0.26604166626930237, + "rewards/rejected": 0.28600674867630005, + "step": 547 + }, + { + "epoch": 0.09, + "learning_rate": 9.994528336721775e-06, + "logits/chosen": -0.49992093443870544, + "logits/rejected": -0.4955185651779175, + "logps/chosen": -64.36783599853516, + "logps/rejected": -18.615373611450195, + "loss": 0.2588, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9240486025810242, + "rewards/margins": 0.5397918224334717, + "rewards/rejected": 0.3842567503452301, + "step": 548 + }, + { + "epoch": 0.09, + "learning_rate": 9.994466696038084e-06, + "logits/chosen": -0.6952155232429504, + "logits/rejected": -0.606764554977417, + "logps/chosen": -52.26399230957031, + "logps/rejected": -67.19408416748047, + "loss": 0.3937, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7352294921875, + "rewards/margins": 0.1922461986541748, + "rewards/rejected": 1.5429832935333252, + "step": 549 + }, + { + "epoch": 0.09, + "learning_rate": 9.994404710283999e-06, + "logits/chosen": -0.6749190092086792, + "logits/rejected": -0.5042000412940979, + "logps/chosen": -86.73729705810547, + "logps/rejected": -58.319217681884766, + "loss": 0.6171, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9150238037109375, + "rewards/margins": 3.287583589553833, + "rewards/rejected": 0.6274402737617493, + "step": 550 + }, + { + "epoch": 0.09, + "learning_rate": 9.994342379463808e-06, + "logits/chosen": -0.8970142602920532, + "logits/rejected": -0.7685427069664001, + "logps/chosen": -77.56838989257812, + "logps/rejected": -62.47480010986328, + "loss": 0.9815, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0726754665374756, + "rewards/margins": 0.6609277725219727, + "rewards/rejected": 2.411747694015503, + "step": 551 + }, + { + "epoch": 0.09, + "learning_rate": 9.994279703581815e-06, + "logits/chosen": -0.30463120341300964, + "logits/rejected": -0.30463120341300964, + "logps/chosen": -66.02491760253906, + "logps/rejected": -66.02491760253906, + "loss": 0.6154, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7322860956192017, + "rewards/margins": 0.0, + "rewards/rejected": 1.7322860956192017, + "step": 552 + }, + { + "epoch": 0.09, + "learning_rate": 9.99421668264235e-06, + "logits/chosen": -0.8625001311302185, + "logits/rejected": -0.7955186367034912, + "logps/chosen": -82.51028442382812, + "logps/rejected": -65.58892822265625, + "loss": 0.0396, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.496013641357422, + "rewards/margins": 3.039508819580078, + "rewards/rejected": 1.4565048217773438, + "step": 553 + }, + { + "epoch": 0.09, + "learning_rate": 9.994153316649769e-06, + "logits/chosen": -0.9657637476921082, + "logits/rejected": -0.9727062582969666, + "logps/chosen": -133.34521484375, + "logps/rejected": -193.28321838378906, + "loss": 1.1923, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.906983852386475, + "rewards/margins": -2.202084541320801, + "rewards/rejected": 7.109068393707275, + "step": 554 + }, + { + "epoch": 0.09, + "learning_rate": 9.994089605608448e-06, + "logits/chosen": -0.40530508756637573, + "logits/rejected": -0.4190651774406433, + "logps/chosen": -100.59036254882812, + "logps/rejected": -49.469688415527344, + "loss": 0.4934, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8437241315841675, + "rewards/margins": -0.3887885808944702, + "rewards/rejected": 2.2325127124786377, + "step": 555 + }, + { + "epoch": 0.09, + "learning_rate": 9.994025549522792e-06, + "logits/chosen": -0.938129186630249, + "logits/rejected": -0.9298706650733948, + "logps/chosen": -74.99626159667969, + "logps/rejected": -79.64386749267578, + "loss": 0.1238, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7897323369979858, + "rewards/margins": 1.3891593217849731, + "rewards/rejected": 0.4005729854106903, + "step": 556 + }, + { + "epoch": 0.09, + "learning_rate": 9.993961148397222e-06, + "logits/chosen": -0.38303837180137634, + "logits/rejected": -0.3932729959487915, + "logps/chosen": -92.90130615234375, + "logps/rejected": -99.09101867675781, + "loss": 0.2959, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3343613147735596, + "rewards/margins": 0.6765434741973877, + "rewards/rejected": 1.6578178405761719, + "step": 557 + }, + { + "epoch": 0.09, + "learning_rate": 9.99389640223619e-06, + "logits/chosen": -0.6985897421836853, + "logits/rejected": -0.7521032691001892, + "logps/chosen": -88.09275817871094, + "logps/rejected": -125.0697021484375, + "loss": 1.7913, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3311822414398193, + "rewards/margins": -2.829505205154419, + "rewards/rejected": 5.160687446594238, + "step": 558 + }, + { + "epoch": 0.09, + "learning_rate": 9.993831311044172e-06, + "logits/chosen": -0.32791268825531006, + "logits/rejected": -0.3546072244644165, + "logps/chosen": -21.033527374267578, + "logps/rejected": -25.70724105834961, + "loss": 0.7518, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.27543947100639343, + "rewards/margins": -0.7948142290115356, + "rewards/rejected": 1.0702537298202515, + "step": 559 + }, + { + "epoch": 0.09, + "learning_rate": 9.99376587482566e-06, + "logits/chosen": -0.8672590851783752, + "logits/rejected": -0.7600011229515076, + "logps/chosen": -76.0909194946289, + "logps/rejected": -78.37866973876953, + "loss": 0.6984, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9649406671524048, + "rewards/margins": -0.851460337638855, + "rewards/rejected": 2.8164010047912598, + "step": 560 + }, + { + "epoch": 0.09, + "learning_rate": 9.993700093585178e-06, + "logits/chosen": -1.0164119005203247, + "logits/rejected": -0.9833223223686218, + "logps/chosen": -61.899173736572266, + "logps/rejected": -89.89266204833984, + "loss": 0.6808, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5284732580184937, + "rewards/margins": -1.0560094118118286, + "rewards/rejected": 2.5844826698303223, + "step": 561 + }, + { + "epoch": 0.09, + "learning_rate": 9.99363396732727e-06, + "logits/chosen": -0.6119078993797302, + "logits/rejected": -0.6482664346694946, + "logps/chosen": -78.38432312011719, + "logps/rejected": -73.91250610351562, + "loss": 0.6786, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7587524652481079, + "rewards/margins": -0.4996154308319092, + "rewards/rejected": 1.258367896080017, + "step": 562 + }, + { + "epoch": 0.09, + "learning_rate": 9.993567496056504e-06, + "logits/chosen": -0.866474986076355, + "logits/rejected": -0.8685253262519836, + "logps/chosen": -240.01922607421875, + "logps/rejected": -127.8979263305664, + "loss": 0.1665, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.256066799163818, + "rewards/margins": 1.3870275020599365, + "rewards/rejected": 3.869039297103882, + "step": 563 + }, + { + "epoch": 0.09, + "learning_rate": 9.993500679777478e-06, + "logits/chosen": -0.852167546749115, + "logits/rejected": -0.8916890621185303, + "logps/chosen": -218.22311401367188, + "logps/rejected": -116.01383209228516, + "loss": 0.7518, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.632089138031006, + "rewards/margins": -1.2366523742675781, + "rewards/rejected": 5.868741512298584, + "step": 564 + }, + { + "epoch": 0.09, + "learning_rate": 9.9934335184948e-06, + "logits/chosen": -0.4976544976234436, + "logits/rejected": -0.49823498725891113, + "logps/chosen": -96.60612487792969, + "logps/rejected": -64.86869812011719, + "loss": 0.9003, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.166235327720642, + "rewards/margins": -0.8065613508224487, + "rewards/rejected": 1.9727966785430908, + "step": 565 + }, + { + "epoch": 0.09, + "learning_rate": 9.993366012213114e-06, + "logits/chosen": -0.7547597289085388, + "logits/rejected": -0.7528627514839172, + "logps/chosen": -90.31648254394531, + "logps/rejected": -97.31695556640625, + "loss": 0.6269, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4355591535568237, + "rewards/margins": -0.8882406949996948, + "rewards/rejected": 2.3237998485565186, + "step": 566 + }, + { + "epoch": 0.09, + "learning_rate": 9.993298160937086e-06, + "logits/chosen": -0.4294118285179138, + "logits/rejected": -0.4294118285179138, + "logps/chosen": -7.471914768218994, + "logps/rejected": -7.471914768218994, + "loss": 0.988, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9098547101020813, + "rewards/margins": 0.0, + "rewards/rejected": 0.9098547101020813, + "step": 567 + }, + { + "epoch": 0.09, + "learning_rate": 9.9932299646714e-06, + "logits/chosen": -0.6332605481147766, + "logits/rejected": -0.5598294734954834, + "logps/chosen": -67.27120971679688, + "logps/rejected": -17.95563316345215, + "loss": 0.1158, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9331817626953125, + "rewards/margins": 1.4038441181182861, + "rewards/rejected": 0.5293377041816711, + "step": 568 + }, + { + "epoch": 0.09, + "learning_rate": 9.993161423420774e-06, + "logits/chosen": -0.863921046257019, + "logits/rejected": -0.6706574559211731, + "logps/chosen": -84.13998413085938, + "logps/rejected": -29.68218994140625, + "loss": 0.2184, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.43133544921875, + "rewards/margins": 0.6901454925537109, + "rewards/rejected": 0.7411899566650391, + "step": 569 + }, + { + "epoch": 0.09, + "learning_rate": 9.993092537189936e-06, + "logits/chosen": -0.708072304725647, + "logits/rejected": -0.6133208274841309, + "logps/chosen": -170.14297485351562, + "logps/rejected": -153.9881591796875, + "loss": 0.7974, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.026904582977295, + "rewards/margins": -0.66302490234375, + "rewards/rejected": 4.689929485321045, + "step": 570 + }, + { + "epoch": 0.09, + "learning_rate": 9.99302330598365e-06, + "logits/chosen": -0.7737190127372742, + "logits/rejected": -0.6659673452377319, + "logps/chosen": -80.20640563964844, + "logps/rejected": -15.013071060180664, + "loss": 0.2828, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.064182996749878, + "rewards/margins": 1.328578233718872, + "rewards/rejected": 0.7356047034263611, + "step": 571 + }, + { + "epoch": 0.09, + "learning_rate": 9.992953729806696e-06, + "logits/chosen": -1.0001243352890015, + "logits/rejected": -0.8783328533172607, + "logps/chosen": -149.056640625, + "logps/rejected": -61.95871353149414, + "loss": 0.5822, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.013089179992676, + "rewards/margins": 2.110629081726074, + "rewards/rejected": 2.9024600982666016, + "step": 572 + }, + { + "epoch": 0.09, + "learning_rate": 9.992883808663885e-06, + "logits/chosen": -0.7400110960006714, + "logits/rejected": -0.8470534086227417, + "logps/chosen": -50.15863800048828, + "logps/rejected": -75.12637329101562, + "loss": 1.0175, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4246010780334473, + "rewards/margins": -1.3644981384277344, + "rewards/rejected": 3.7890992164611816, + "step": 573 + }, + { + "epoch": 0.09, + "learning_rate": 9.992813542560045e-06, + "logits/chosen": -0.5720760822296143, + "logits/rejected": -0.4723474383354187, + "logps/chosen": -109.11819458007812, + "logps/rejected": -16.68526268005371, + "loss": 0.4668, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5706207752227783, + "rewards/margins": 1.2886377573013306, + "rewards/rejected": 0.28198298811912537, + "step": 574 + }, + { + "epoch": 0.09, + "learning_rate": 9.992742931500032e-06, + "logits/chosen": -0.7008370757102966, + "logits/rejected": -0.5206031799316406, + "logps/chosen": -120.05878448486328, + "logps/rejected": -36.342308044433594, + "loss": 0.4231, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.999396562576294, + "rewards/margins": 2.6992433071136475, + "rewards/rejected": 0.30015334486961365, + "step": 575 + }, + { + "epoch": 0.09, + "learning_rate": 9.992671975488725e-06, + "logits/chosen": -1.0137356519699097, + "logits/rejected": -0.957088828086853, + "logps/chosen": -45.91450500488281, + "logps/rejected": -53.47895431518555, + "loss": 0.8165, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0132042169570923, + "rewards/margins": -0.9955638647079468, + "rewards/rejected": 2.008768081665039, + "step": 576 + }, + { + "epoch": 0.09, + "learning_rate": 9.992600674531025e-06, + "logits/chosen": -0.9062435030937195, + "logits/rejected": -0.8482679128646851, + "logps/chosen": -61.2097053527832, + "logps/rejected": -59.259437561035156, + "loss": 0.2861, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.357543706893921, + "rewards/margins": 0.34011268615722656, + "rewards/rejected": 2.0174310207366943, + "step": 577 + }, + { + "epoch": 0.09, + "learning_rate": 9.992529028631859e-06, + "logits/chosen": -0.995043933391571, + "logits/rejected": -0.9308229088783264, + "logps/chosen": -72.85414123535156, + "logps/rejected": -39.48682403564453, + "loss": 0.1316, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4845634698867798, + "rewards/margins": 1.2484840154647827, + "rewards/rejected": 0.2360794097185135, + "step": 578 + }, + { + "epoch": 0.09, + "learning_rate": 9.992457037796177e-06, + "logits/chosen": -0.774732768535614, + "logits/rejected": -0.49529844522476196, + "logps/chosen": -144.1862030029297, + "logps/rejected": -36.92414855957031, + "loss": 1.2769, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.476574659347534, + "rewards/margins": 2.7618274688720703, + "rewards/rejected": 0.7147472500801086, + "step": 579 + }, + { + "epoch": 0.09, + "learning_rate": 9.992384702028952e-06, + "logits/chosen": -0.8092189431190491, + "logits/rejected": -0.8092189431190491, + "logps/chosen": -40.35988998413086, + "logps/rejected": -40.35988998413086, + "loss": 0.5382, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.43326568603515625, + "rewards/margins": 0.0, + "rewards/rejected": 0.43326568603515625, + "step": 580 + }, + { + "epoch": 0.09, + "learning_rate": 9.992312021335181e-06, + "logits/chosen": -0.18342413008213043, + "logits/rejected": -0.3291226327419281, + "logps/chosen": -57.33086013793945, + "logps/rejected": -91.20169830322266, + "loss": 1.3398, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2785526514053345, + "rewards/margins": -1.7348865270614624, + "rewards/rejected": 3.013439178466797, + "step": 581 + }, + { + "epoch": 0.09, + "learning_rate": 9.99223899571989e-06, + "logits/chosen": -0.9422094225883484, + "logits/rejected": -0.9671311974525452, + "logps/chosen": -59.821983337402344, + "logps/rejected": -71.91675567626953, + "loss": 0.6561, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2994873523712158, + "rewards/margins": -0.960028886795044, + "rewards/rejected": 2.2595162391662598, + "step": 582 + }, + { + "epoch": 0.09, + "learning_rate": 9.99216562518812e-06, + "logits/chosen": -0.38553622364997864, + "logits/rejected": -0.3306349813938141, + "logps/chosen": -48.604774475097656, + "logps/rejected": -54.55311584472656, + "loss": 0.7427, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5585678219795227, + "rewards/margins": -1.0901985168457031, + "rewards/rejected": 1.6487663984298706, + "step": 583 + }, + { + "epoch": 0.09, + "learning_rate": 9.992091909744943e-06, + "logits/chosen": -1.1783405542373657, + "logits/rejected": -1.0574631690979004, + "logps/chosen": -112.96754455566406, + "logps/rejected": -20.64132308959961, + "loss": 0.3023, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.648840427398682, + "rewards/margins": 5.147433280944824, + "rewards/rejected": 0.5014070868492126, + "step": 584 + }, + { + "epoch": 0.09, + "learning_rate": 9.99201784939545e-06, + "logits/chosen": -0.4394107162952423, + "logits/rejected": -0.43055424094200134, + "logps/chosen": -104.90371704101562, + "logps/rejected": -90.7194595336914, + "loss": 0.751, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.290697455406189, + "rewards/margins": -0.44903266429901123, + "rewards/rejected": 1.7397301197052002, + "step": 585 + }, + { + "epoch": 0.1, + "learning_rate": 9.991943444144758e-06, + "logits/chosen": -0.5018532872200012, + "logits/rejected": -0.5219767093658447, + "logps/chosen": -36.84790802001953, + "logps/rejected": -80.57701110839844, + "loss": 0.4699, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7371082305908203, + "rewards/margins": -0.3756871223449707, + "rewards/rejected": 2.112795352935791, + "step": 586 + }, + { + "epoch": 0.1, + "learning_rate": 9.991868693998008e-06, + "logits/chosen": -0.6108866930007935, + "logits/rejected": -0.6571621894836426, + "logps/chosen": -87.5583724975586, + "logps/rejected": -120.98045349121094, + "loss": 0.8266, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4824912548065186, + "rewards/margins": -0.9992027282714844, + "rewards/rejected": 3.481693983078003, + "step": 587 + }, + { + "epoch": 0.1, + "learning_rate": 9.991793598960364e-06, + "logits/chosen": -0.7124766707420349, + "logits/rejected": -0.6182643175125122, + "logps/chosen": -67.7507553100586, + "logps/rejected": -41.45924758911133, + "loss": 0.9298, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.865003228187561, + "rewards/margins": -0.5268089771270752, + "rewards/rejected": 1.3918122053146362, + "step": 588 + }, + { + "epoch": 0.1, + "learning_rate": 9.991718159037016e-06, + "logits/chosen": -0.8042739033699036, + "logits/rejected": -0.7087945342063904, + "logps/chosen": -80.48338317871094, + "logps/rejected": -78.08380126953125, + "loss": 1.1314, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.767797827720642, + "rewards/margins": -0.22195208072662354, + "rewards/rejected": 1.9897499084472656, + "step": 589 + }, + { + "epoch": 0.1, + "learning_rate": 9.991642374233175e-06, + "logits/chosen": -0.8174198865890503, + "logits/rejected": -0.809450089931488, + "logps/chosen": -45.783302307128906, + "logps/rejected": -31.243759155273438, + "loss": 0.4655, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4436806440353394, + "rewards/margins": -0.38652610778808594, + "rewards/rejected": 1.8302067518234253, + "step": 590 + }, + { + "epoch": 0.1, + "learning_rate": 9.991566244554078e-06, + "logits/chosen": -0.6378777027130127, + "logits/rejected": -0.6378777027130127, + "logps/chosen": -115.64938354492188, + "logps/rejected": -115.64938354492188, + "loss": 0.3518, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7094085216522217, + "rewards/margins": 0.0, + "rewards/rejected": 2.7094085216522217, + "step": 591 + }, + { + "epoch": 0.1, + "learning_rate": 9.991489770004985e-06, + "logits/chosen": -0.779199481010437, + "logits/rejected": -0.5872191786766052, + "logps/chosen": -62.46147155761719, + "logps/rejected": -20.61522674560547, + "loss": 0.1832, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.645160675048828, + "rewards/margins": 2.2592549324035645, + "rewards/rejected": 0.38590583205223083, + "step": 592 + }, + { + "epoch": 0.1, + "learning_rate": 9.991412950591177e-06, + "logits/chosen": -0.4164581894874573, + "logits/rejected": -0.39311879873275757, + "logps/chosen": -118.14533233642578, + "logps/rejected": -84.6113510131836, + "loss": 0.38, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5621651411056519, + "rewards/margins": -0.09546422958374023, + "rewards/rejected": 1.657629370689392, + "step": 593 + }, + { + "epoch": 0.1, + "learning_rate": 9.991335786317964e-06, + "logits/chosen": -0.7694527506828308, + "logits/rejected": -0.6981874108314514, + "logps/chosen": -46.52619552612305, + "logps/rejected": -100.66731262207031, + "loss": 0.447, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.263036012649536, + "rewards/margins": 0.9699169397354126, + "rewards/rejected": 1.2931190729141235, + "step": 594 + }, + { + "epoch": 0.1, + "learning_rate": 9.991258277190677e-06, + "logits/chosen": -0.3200262784957886, + "logits/rejected": -0.2721765637397766, + "logps/chosen": -66.65724182128906, + "logps/rejected": -43.63938522338867, + "loss": 0.591, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0997787714004517, + "rewards/margins": -0.7165629863739014, + "rewards/rejected": 1.816341757774353, + "step": 595 + }, + { + "epoch": 0.1, + "learning_rate": 9.99118042321467e-06, + "logits/chosen": -0.8684045672416687, + "logits/rejected": -0.7801951766014099, + "logps/chosen": -61.6823616027832, + "logps/rejected": -107.16596221923828, + "loss": 0.6871, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9096142053604126, + "rewards/margins": -0.7622309923171997, + "rewards/rejected": 2.6718451976776123, + "step": 596 + }, + { + "epoch": 0.1, + "learning_rate": 9.991102224395323e-06, + "logits/chosen": -0.9068311452865601, + "logits/rejected": -0.9409644603729248, + "logps/chosen": -169.3260040283203, + "logps/rejected": -27.968833923339844, + "loss": 0.2867, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.763336181640625, + "rewards/margins": 0.7181192636489868, + "rewards/rejected": 0.04521694406867027, + "step": 597 + }, + { + "epoch": 0.1, + "learning_rate": 9.99102368073804e-06, + "logits/chosen": -0.4947035312652588, + "logits/rejected": -0.5098088979721069, + "logps/chosen": -40.701351165771484, + "logps/rejected": -43.55152893066406, + "loss": 0.389, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9842979311943054, + "rewards/margins": -0.10049062967300415, + "rewards/rejected": 1.0847885608673096, + "step": 598 + }, + { + "epoch": 0.1, + "learning_rate": 9.990944792248244e-06, + "logits/chosen": -0.32265815138816833, + "logits/rejected": -0.306347519159317, + "logps/chosen": -3.083674669265747, + "logps/rejected": -2.5094995498657227, + "loss": 0.3813, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3373183012008667, + "rewards/margins": -0.006379455327987671, + "rewards/rejected": 0.34369775652885437, + "step": 599 + }, + { + "epoch": 0.1, + "learning_rate": 9.990865558931387e-06, + "logits/chosen": -0.9972313642501831, + "logits/rejected": -0.942057728767395, + "logps/chosen": -39.45288848876953, + "logps/rejected": -95.04830932617188, + "loss": 0.3545, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3031902313232422, + "rewards/margins": 0.00022852420806884766, + "rewards/rejected": 1.3029617071151733, + "step": 600 + }, + { + "epoch": 0.1, + "learning_rate": 9.990785980792944e-06, + "logits/chosen": -0.8079357147216797, + "logits/rejected": -0.8141458630561829, + "logps/chosen": -70.84452819824219, + "logps/rejected": -68.18826293945312, + "loss": 1.1157, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8188629150390625, + "rewards/margins": -0.9279115200042725, + "rewards/rejected": 2.746774435043335, + "step": 601 + }, + { + "epoch": 0.1, + "learning_rate": 9.990706057838417e-06, + "logits/chosen": -0.9201887845993042, + "logits/rejected": -0.9928283095359802, + "logps/chosen": -133.2013397216797, + "logps/rejected": -117.105712890625, + "loss": 1.0198, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3394882678985596, + "rewards/margins": -1.8451387882232666, + "rewards/rejected": 5.184627056121826, + "step": 602 + }, + { + "epoch": 0.1, + "learning_rate": 9.990625790073321e-06, + "logits/chosen": -0.5521355271339417, + "logits/rejected": -0.626850962638855, + "logps/chosen": -59.217315673828125, + "logps/rejected": -100.58346557617188, + "loss": 0.6054, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7804436087608337, + "rewards/margins": -0.09155040979385376, + "rewards/rejected": 0.8719940185546875, + "step": 603 + }, + { + "epoch": 0.1, + "learning_rate": 9.990545177503203e-06, + "logits/chosen": -0.5285443663597107, + "logits/rejected": -0.5696931481361389, + "logps/chosen": -59.93455123901367, + "logps/rejected": -83.10730743408203, + "loss": 1.1438, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7216129302978516, + "rewards/margins": -0.320590615272522, + "rewards/rejected": 1.0422035455703735, + "step": 604 + }, + { + "epoch": 0.1, + "learning_rate": 9.990464220133638e-06, + "logits/chosen": -0.7496558427810669, + "logits/rejected": -0.7682238817214966, + "logps/chosen": -65.83203125, + "logps/rejected": -109.82377624511719, + "loss": 1.1756, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.838952660560608, + "rewards/margins": -0.4170318841934204, + "rewards/rejected": 2.2559845447540283, + "step": 605 + }, + { + "epoch": 0.1, + "learning_rate": 9.990382917970213e-06, + "logits/chosen": -0.6094529628753662, + "logits/rejected": -0.6066014170646667, + "logps/chosen": -44.20207214355469, + "logps/rejected": -105.51985168457031, + "loss": 1.3842, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3443565368652344, + "rewards/margins": -1.435675859451294, + "rewards/rejected": 2.7800323963165283, + "step": 606 + }, + { + "epoch": 0.1, + "learning_rate": 9.990301271018548e-06, + "logits/chosen": -0.2751193642616272, + "logits/rejected": -0.2894216477870941, + "logps/chosen": -52.34449005126953, + "logps/rejected": -81.29832458496094, + "loss": 0.8275, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9695663452148438, + "rewards/margins": 1.257147192955017, + "rewards/rejected": 0.7124191522598267, + "step": 607 + }, + { + "epoch": 0.1, + "learning_rate": 9.990219279284284e-06, + "logits/chosen": -0.9029151797294617, + "logits/rejected": -0.961002767086029, + "logps/chosen": -74.40491485595703, + "logps/rejected": -150.57781982421875, + "loss": 2.1168, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4227348566055298, + "rewards/margins": -2.6120309829711914, + "rewards/rejected": 4.034765720367432, + "step": 608 + }, + { + "epoch": 0.1, + "learning_rate": 9.990136942773086e-06, + "logits/chosen": -0.9935181736946106, + "logits/rejected": -1.0034496784210205, + "logps/chosen": -93.84173583984375, + "logps/rejected": -168.73907470703125, + "loss": 1.5689, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.868359565734863, + "rewards/margins": -0.6837234497070312, + "rewards/rejected": 5.5520830154418945, + "step": 609 + }, + { + "epoch": 0.1, + "learning_rate": 9.990054261490643e-06, + "logits/chosen": -0.6327803730964661, + "logits/rejected": -0.5373698472976685, + "logps/chosen": -96.58204650878906, + "logps/rejected": -115.63294982910156, + "loss": 1.6528, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4951553344726562, + "rewards/margins": -3.1073198318481445, + "rewards/rejected": 5.602475166320801, + "step": 610 + }, + { + "epoch": 0.1, + "learning_rate": 9.989971235442665e-06, + "logits/chosen": -0.9753302335739136, + "logits/rejected": -0.9228381514549255, + "logps/chosen": -90.47549438476562, + "logps/rejected": -68.99510955810547, + "loss": 1.6681, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.807574450969696, + "rewards/margins": 0.45736387372016907, + "rewards/rejected": 0.350210577249527, + "step": 611 + }, + { + "epoch": 0.1, + "learning_rate": 9.989887864634893e-06, + "logits/chosen": -1.0006333589553833, + "logits/rejected": -1.1057641506195068, + "logps/chosen": -198.7904052734375, + "logps/rejected": -175.9415740966797, + "loss": 1.5599, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.424696445465088, + "rewards/margins": -2.3341522216796875, + "rewards/rejected": 5.758848667144775, + "step": 612 + }, + { + "epoch": 0.1, + "learning_rate": 9.98980414907308e-06, + "logits/chosen": -0.9057644009590149, + "logits/rejected": -0.7303020358085632, + "logps/chosen": -203.33197021484375, + "logps/rejected": -39.27217483520508, + "loss": 0.2362, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.269402980804443, + "rewards/margins": 4.942075729370117, + "rewards/rejected": 0.32732734084129333, + "step": 613 + }, + { + "epoch": 0.1, + "learning_rate": 9.98972008876302e-06, + "logits/chosen": -0.9034970998764038, + "logits/rejected": -0.8731964230537415, + "logps/chosen": -88.34452819824219, + "logps/rejected": -61.20329284667969, + "loss": 0.4489, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7298583984375, + "rewards/margins": -0.36316990852355957, + "rewards/rejected": 3.0930283069610596, + "step": 614 + }, + { + "epoch": 0.1, + "learning_rate": 9.98963568371051e-06, + "logits/chosen": -0.8007368445396423, + "logits/rejected": -0.6718770861625671, + "logps/chosen": -115.87725067138672, + "logps/rejected": -85.49740600585938, + "loss": 0.1194, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.403666019439697, + "rewards/margins": 2.5611817836761475, + "rewards/rejected": 2.84248423576355, + "step": 615 + }, + { + "epoch": 0.1, + "learning_rate": 9.98955093392139e-06, + "logits/chosen": -0.6477586030960083, + "logits/rejected": -0.6200270056724548, + "logps/chosen": -73.58465576171875, + "logps/rejected": -107.95306396484375, + "loss": 1.238, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8040565848350525, + "rewards/margins": -1.5423393249511719, + "rewards/rejected": 2.346395969390869, + "step": 616 + }, + { + "epoch": 0.1, + "learning_rate": 9.989465839401511e-06, + "logits/chosen": -0.6740242838859558, + "logits/rejected": -0.5714662075042725, + "logps/chosen": -106.33731842041016, + "logps/rejected": -62.4372673034668, + "loss": 0.0715, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.045316219329834, + "rewards/margins": 2.085479974746704, + "rewards/rejected": 1.9598362445831299, + "step": 617 + }, + { + "epoch": 0.1, + "learning_rate": 9.989380400156752e-06, + "logits/chosen": -0.9594624638557434, + "logits/rejected": -0.9249041080474854, + "logps/chosen": -130.89614868164062, + "logps/rejected": -64.55281066894531, + "loss": 0.5614, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5755233764648438, + "rewards/margins": -0.5640809535980225, + "rewards/rejected": 2.139604330062866, + "step": 618 + }, + { + "epoch": 0.1, + "learning_rate": 9.989294616193018e-06, + "logits/chosen": -0.6479396820068359, + "logits/rejected": -0.6559250354766846, + "logps/chosen": -33.72765350341797, + "logps/rejected": -36.97720718383789, + "loss": 0.9304, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7054939270019531, + "rewards/margins": 0.16130709648132324, + "rewards/rejected": 1.5441868305206299, + "step": 619 + }, + { + "epoch": 0.1, + "learning_rate": 9.989208487516236e-06, + "logits/chosen": -0.6856578588485718, + "logits/rejected": -0.6499707698822021, + "logps/chosen": -101.27466583251953, + "logps/rejected": -96.44940185546875, + "loss": 1.4346, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5953575372695923, + "rewards/margins": -1.633231282234192, + "rewards/rejected": 2.228588819503784, + "step": 620 + }, + { + "epoch": 0.1, + "learning_rate": 9.989122014132356e-06, + "logits/chosen": -0.6919955611228943, + "logits/rejected": -0.6613302826881409, + "logps/chosen": -33.145904541015625, + "logps/rejected": -25.889503479003906, + "loss": 1.0872, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7613876461982727, + "rewards/margins": 0.20481300354003906, + "rewards/rejected": 0.5565746426582336, + "step": 621 + }, + { + "epoch": 0.1, + "learning_rate": 9.989035196047349e-06, + "logits/chosen": -0.5577796101570129, + "logits/rejected": -0.6226856708526611, + "logps/chosen": -148.78787231445312, + "logps/rejected": -104.54103088378906, + "loss": 1.1221, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.74180006980896, + "rewards/margins": -2.1034133434295654, + "rewards/rejected": 5.845213413238525, + "step": 622 + }, + { + "epoch": 0.1, + "learning_rate": 9.98894803326722e-06, + "logits/chosen": -0.15309657156467438, + "logits/rejected": -0.18869276344776154, + "logps/chosen": -64.69493103027344, + "logps/rejected": -48.58404541015625, + "loss": 0.5231, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2335205078125, + "rewards/margins": -0.5504082441329956, + "rewards/rejected": 1.7839287519454956, + "step": 623 + }, + { + "epoch": 0.1, + "learning_rate": 9.988860525797988e-06, + "logits/chosen": -0.3137136399745941, + "logits/rejected": -0.3450758755207062, + "logps/chosen": -74.84001159667969, + "logps/rejected": -93.51246643066406, + "loss": 0.593, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4720124006271362, + "rewards/margins": -0.4886031150817871, + "rewards/rejected": 1.9606155157089233, + "step": 624 + }, + { + "epoch": 0.1, + "learning_rate": 9.988772673645698e-06, + "logits/chosen": -0.45332077145576477, + "logits/rejected": -0.40045228600502014, + "logps/chosen": -117.0152816772461, + "logps/rejected": -50.254783630371094, + "loss": 0.2903, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5291390419006348, + "rewards/margins": 0.7601243257522583, + "rewards/rejected": 1.7690147161483765, + "step": 625 + }, + { + "epoch": 0.1, + "learning_rate": 9.98868447681642e-06, + "logits/chosen": -0.6403884291648865, + "logits/rejected": -0.5079770684242249, + "logps/chosen": -106.66502380371094, + "logps/rejected": -38.02003479003906, + "loss": 0.2217, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.13519287109375, + "rewards/margins": 1.427711844444275, + "rewards/rejected": 1.707481026649475, + "step": 626 + }, + { + "epoch": 0.1, + "learning_rate": 9.988595935316248e-06, + "logits/chosen": -0.39981791377067566, + "logits/rejected": -0.37650126218795776, + "logps/chosen": -61.55650329589844, + "logps/rejected": -41.04154968261719, + "loss": 1.0388, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.27530837059021, + "rewards/margins": 0.7381341457366943, + "rewards/rejected": 1.5371742248535156, + "step": 627 + }, + { + "epoch": 0.1, + "learning_rate": 9.9885070491513e-06, + "logits/chosen": -0.856731116771698, + "logits/rejected": -0.8571852445602417, + "logps/chosen": -71.07108306884766, + "logps/rejected": -69.46580505371094, + "loss": 0.7193, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0737937688827515, + "rewards/margins": -0.9637855291366577, + "rewards/rejected": 2.037579298019409, + "step": 628 + }, + { + "epoch": 0.1, + "learning_rate": 9.988417818327716e-06, + "logits/chosen": -0.6539154648780823, + "logits/rejected": -0.7042697668075562, + "logps/chosen": -90.8909912109375, + "logps/rejected": -98.13505554199219, + "loss": 2.6967, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.268652319908142, + "rewards/margins": -5.379286289215088, + "rewards/rejected": 6.6479387283325195, + "step": 629 + }, + { + "epoch": 0.1, + "learning_rate": 9.988328242851661e-06, + "logits/chosen": -0.8189685344696045, + "logits/rejected": -0.785676121711731, + "logps/chosen": -71.08846282958984, + "logps/rejected": -39.13788604736328, + "loss": 0.2224, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7571083307266235, + "rewards/margins": 1.4287440776824951, + "rewards/rejected": 0.32836419343948364, + "step": 630 + }, + { + "epoch": 0.1, + "learning_rate": 9.988238322729325e-06, + "logits/chosen": -0.9448052048683167, + "logits/rejected": -1.0144925117492676, + "logps/chosen": -147.89451599121094, + "logps/rejected": -165.24220275878906, + "loss": 0.7224, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.474679708480835, + "rewards/margins": -1.167262315750122, + "rewards/rejected": 4.641942024230957, + "step": 631 + }, + { + "epoch": 0.1, + "learning_rate": 9.98814805796692e-06, + "logits/chosen": -0.7101930975914001, + "logits/rejected": -0.742830753326416, + "logps/chosen": -42.318668365478516, + "logps/rejected": -42.81855773925781, + "loss": 0.8876, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7608398795127869, + "rewards/margins": -0.45974498987197876, + "rewards/rejected": 1.2205848693847656, + "step": 632 + }, + { + "epoch": 0.1, + "learning_rate": 9.988057448570682e-06, + "logits/chosen": -0.6250888109207153, + "logits/rejected": -0.5680675506591797, + "logps/chosen": -66.46504211425781, + "logps/rejected": -44.130821228027344, + "loss": 0.8387, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1119720935821533, + "rewards/margins": -0.44869160652160645, + "rewards/rejected": 2.5606637001037598, + "step": 633 + }, + { + "epoch": 0.1, + "learning_rate": 9.987966494546873e-06, + "logits/chosen": -0.6141804456710815, + "logits/rejected": -0.5908210873603821, + "logps/chosen": -59.402252197265625, + "logps/rejected": -48.2764892578125, + "loss": 0.8065, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45776137709617615, + "rewards/margins": 0.45350876450538635, + "rewards/rejected": 0.004252624697983265, + "step": 634 + }, + { + "epoch": 0.1, + "learning_rate": 9.987875195901776e-06, + "logits/chosen": -0.7467438578605652, + "logits/rejected": -0.6667758226394653, + "logps/chosen": -130.7857208251953, + "logps/rejected": -72.0806884765625, + "loss": 0.0999, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.1623735427856445, + "rewards/margins": 1.8987550735473633, + "rewards/rejected": 2.2636184692382812, + "step": 635 + }, + { + "epoch": 0.1, + "learning_rate": 9.987783552641698e-06, + "logits/chosen": -0.8594380021095276, + "logits/rejected": -0.7966352105140686, + "logps/chosen": -70.06678009033203, + "logps/rejected": -23.16340446472168, + "loss": 0.4809, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7419151663780212, + "rewards/margins": -0.2722840905189514, + "rewards/rejected": 1.0141992568969727, + "step": 636 + }, + { + "epoch": 0.1, + "learning_rate": 9.987691564772971e-06, + "logits/chosen": -0.8423760533332825, + "logits/rejected": -0.7700189352035522, + "logps/chosen": -122.92497253417969, + "logps/rejected": -91.70896911621094, + "loss": 0.1356, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.993945598602295, + "rewards/margins": 2.94803786277771, + "rewards/rejected": 3.045907735824585, + "step": 637 + }, + { + "epoch": 0.1, + "learning_rate": 9.987599232301952e-06, + "logits/chosen": -0.3650670051574707, + "logits/rejected": -0.3910315930843353, + "logps/chosen": -75.89303588867188, + "logps/rejected": -86.87428283691406, + "loss": 0.6138, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.253657579421997, + "rewards/margins": -0.15039825439453125, + "rewards/rejected": 1.4040558338165283, + "step": 638 + }, + { + "epoch": 0.1, + "learning_rate": 9.987506555235018e-06, + "logits/chosen": -1.0000598430633545, + "logits/rejected": -1.0087051391601562, + "logps/chosen": -50.51163864135742, + "logps/rejected": -122.92802429199219, + "loss": 0.6741, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.108742117881775, + "rewards/margins": -0.6688747406005859, + "rewards/rejected": 1.7776168584823608, + "step": 639 + }, + { + "epoch": 0.1, + "learning_rate": 9.987413533578574e-06, + "logits/chosen": -0.584686279296875, + "logits/rejected": -0.6213027834892273, + "logps/chosen": -98.49008178710938, + "logps/rejected": -102.89219665527344, + "loss": 1.8385, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0758652687072754, + "rewards/margins": -2.1738176345825195, + "rewards/rejected": 4.249682903289795, + "step": 640 + }, + { + "epoch": 0.1, + "learning_rate": 9.987320167339044e-06, + "logits/chosen": -0.6398187279701233, + "logits/rejected": -0.6046695709228516, + "logps/chosen": -77.07492065429688, + "logps/rejected": -60.48857498168945, + "loss": 0.2695, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4641236066818237, + "rewards/margins": 0.40347182750701904, + "rewards/rejected": 1.0606517791748047, + "step": 641 + }, + { + "epoch": 0.1, + "learning_rate": 9.987226456522884e-06, + "logits/chosen": -0.9370374083518982, + "logits/rejected": -0.8821579217910767, + "logps/chosen": -67.84274291992188, + "logps/rejected": -88.51644134521484, + "loss": 1.1746, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0950669050216675, + "rewards/margins": -1.7886139154434204, + "rewards/rejected": 2.883680820465088, + "step": 642 + }, + { + "epoch": 0.1, + "learning_rate": 9.987132401136563e-06, + "logits/chosen": -0.5127289891242981, + "logits/rejected": -0.3742266893386841, + "logps/chosen": -73.47715759277344, + "logps/rejected": -142.08192443847656, + "loss": 1.1539, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8567207455635071, + "rewards/margins": -1.754300832748413, + "rewards/rejected": 2.6110215187072754, + "step": 643 + }, + { + "epoch": 0.1, + "learning_rate": 9.987038001186585e-06, + "logits/chosen": -0.14558377861976624, + "logits/rejected": -0.14558377861976624, + "logps/chosen": -20.45069122314453, + "logps/rejected": -20.45069122314453, + "loss": 0.3783, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.10688533633947372, + "rewards/margins": 0.0, + "rewards/rejected": 0.10688533633947372, + "step": 644 + }, + { + "epoch": 0.1, + "learning_rate": 9.986943256679464e-06, + "logits/chosen": -0.2180308848619461, + "logits/rejected": -0.2180308848619461, + "logps/chosen": -43.94667053222656, + "logps/rejected": -43.94667053222656, + "loss": 0.6496, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3604744076728821, + "rewards/margins": 0.0, + "rewards/rejected": 0.3604744076728821, + "step": 645 + }, + { + "epoch": 0.1, + "learning_rate": 9.986848167621754e-06, + "logits/chosen": -1.045362114906311, + "logits/rejected": -1.0161367654800415, + "logps/chosen": -196.10842895507812, + "logps/rejected": -134.11703491210938, + "loss": 0.4593, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.2908172607421875, + "rewards/margins": 3.1480133533477783, + "rewards/rejected": 3.142803907394409, + "step": 646 + }, + { + "epoch": 0.11, + "learning_rate": 9.986752734020022e-06, + "logits/chosen": -0.564973771572113, + "logits/rejected": -0.3843550384044647, + "logps/chosen": -87.99601745605469, + "logps/rejected": -81.25376892089844, + "loss": 0.5668, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5288467407226562, + "rewards/margins": -0.5653665065765381, + "rewards/rejected": 2.0942132472991943, + "step": 647 + }, + { + "epoch": 0.11, + "learning_rate": 9.98665695588086e-06, + "logits/chosen": -0.6258626580238342, + "logits/rejected": -0.6302114129066467, + "logps/chosen": -50.70026397705078, + "logps/rejected": -136.0522003173828, + "loss": 1.5829, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7034409046173096, + "rewards/margins": -2.533796548843384, + "rewards/rejected": 5.237237453460693, + "step": 648 + }, + { + "epoch": 0.11, + "learning_rate": 9.986560833210888e-06, + "logits/chosen": -0.6908676624298096, + "logits/rejected": -0.808332085609436, + "logps/chosen": -93.14515686035156, + "logps/rejected": -119.59625244140625, + "loss": 1.0704, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6508628726005554, + "rewards/margins": -0.8763893246650696, + "rewards/rejected": 1.527252197265625, + "step": 649 + }, + { + "epoch": 0.11, + "learning_rate": 9.986464366016743e-06, + "logits/chosen": -0.813631534576416, + "logits/rejected": -0.8129985928535461, + "logps/chosen": -176.49917602539062, + "logps/rejected": -118.55587768554688, + "loss": 0.0583, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.015042304992676, + "rewards/margins": 2.8164262771606445, + "rewards/rejected": 3.1986160278320312, + "step": 650 + }, + { + "epoch": 0.11, + "learning_rate": 9.986367554305096e-06, + "logits/chosen": -0.45887017250061035, + "logits/rejected": -0.45887017250061035, + "logps/chosen": -9.427844047546387, + "logps/rejected": -9.427844047546387, + "loss": 0.9127, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.04704542085528374, + "rewards/margins": 0.0, + "rewards/rejected": 0.04704542085528374, + "step": 651 + }, + { + "epoch": 0.11, + "learning_rate": 9.98627039808263e-06, + "logits/chosen": -1.0168206691741943, + "logits/rejected": -0.9673035144805908, + "logps/chosen": -139.7681427001953, + "logps/rejected": -62.604827880859375, + "loss": 0.0878, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.543544292449951, + "rewards/margins": 2.453122854232788, + "rewards/rejected": 2.090421438217163, + "step": 652 + }, + { + "epoch": 0.11, + "learning_rate": 9.986172897356062e-06, + "logits/chosen": -0.704340934753418, + "logits/rejected": -0.719369649887085, + "logps/chosen": -84.87184143066406, + "logps/rejected": -44.0024299621582, + "loss": 2.4204, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2258087396621704, + "rewards/margins": -0.9033900499343872, + "rewards/rejected": 2.1291987895965576, + "step": 653 + }, + { + "epoch": 0.11, + "learning_rate": 9.986075052132124e-06, + "logits/chosen": -0.2950226962566376, + "logits/rejected": -0.32692834734916687, + "logps/chosen": -68.18617248535156, + "logps/rejected": -40.88701248168945, + "loss": 0.4448, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5196990966796875, + "rewards/margins": 0.6149883270263672, + "rewards/rejected": 0.9047107696533203, + "step": 654 + }, + { + "epoch": 0.11, + "learning_rate": 9.98597686241758e-06, + "logits/chosen": -1.0147255659103394, + "logits/rejected": -1.0262459516525269, + "logps/chosen": -85.23682403564453, + "logps/rejected": -95.66676330566406, + "loss": 1.3644, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7349166870117188, + "rewards/margins": -2.5006957054138184, + "rewards/rejected": 4.235612392425537, + "step": 655 + }, + { + "epoch": 0.11, + "learning_rate": 9.985878328219211e-06, + "logits/chosen": -0.7921135425567627, + "logits/rejected": -0.7176812887191772, + "logps/chosen": -138.6485137939453, + "logps/rejected": -151.19888305664062, + "loss": 2.2653, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8354218006134033, + "rewards/margins": -2.5980165004730225, + "rewards/rejected": 5.433438301086426, + "step": 656 + }, + { + "epoch": 0.11, + "learning_rate": 9.985779449543829e-06, + "logits/chosen": -0.7985129952430725, + "logits/rejected": -0.7935410141944885, + "logps/chosen": -253.089111328125, + "logps/rejected": -48.855247497558594, + "loss": 0.0548, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4942595958709717, + "rewards/margins": 2.289167642593384, + "rewards/rejected": 0.20509186387062073, + "step": 657 + }, + { + "epoch": 0.11, + "learning_rate": 9.985680226398261e-06, + "logits/chosen": -0.8076140284538269, + "logits/rejected": -0.8076140284538269, + "logps/chosen": -94.68931579589844, + "logps/rejected": -94.68931579589844, + "loss": 0.3751, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7465102672576904, + "rewards/margins": 0.0, + "rewards/rejected": 2.7465102672576904, + "step": 658 + }, + { + "epoch": 0.11, + "learning_rate": 9.985580658789365e-06, + "logits/chosen": -0.725974440574646, + "logits/rejected": -0.6181625127792358, + "logps/chosen": -285.51806640625, + "logps/rejected": -68.14271545410156, + "loss": 0.8676, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5254700183868408, + "rewards/margins": -0.6185410022735596, + "rewards/rejected": 2.1440110206604004, + "step": 659 + }, + { + "epoch": 0.11, + "learning_rate": 9.985480746724019e-06, + "logits/chosen": -0.17776572704315186, + "logits/rejected": -0.16934533417224884, + "logps/chosen": -52.8661994934082, + "logps/rejected": -42.09904479980469, + "loss": 1.0319, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3771137297153473, + "rewards/margins": 0.15217971801757812, + "rewards/rejected": 0.22493401169776917, + "step": 660 + }, + { + "epoch": 0.11, + "learning_rate": 9.985380490209127e-06, + "logits/chosen": -0.549851655960083, + "logits/rejected": -0.5734078884124756, + "logps/chosen": -62.10917663574219, + "logps/rejected": -137.49710083007812, + "loss": 0.649, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8695572018623352, + "rewards/margins": 0.37025147676467896, + "rewards/rejected": 0.49930572509765625, + "step": 661 + }, + { + "epoch": 0.11, + "learning_rate": 9.985279889251616e-06, + "logits/chosen": -0.46842318773269653, + "logits/rejected": -0.5283705592155457, + "logps/chosen": -62.90419006347656, + "logps/rejected": -72.02106475830078, + "loss": 0.8015, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2052139043807983, + "rewards/margins": -0.39468157291412354, + "rewards/rejected": 1.5998954772949219, + "step": 662 + }, + { + "epoch": 0.11, + "learning_rate": 9.985178943858434e-06, + "logits/chosen": -0.71484375, + "logits/rejected": -0.9001825451850891, + "logps/chosen": -398.7375183105469, + "logps/rejected": -162.88333129882812, + "loss": 0.3857, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.569723606109619, + "rewards/margins": -0.02287006378173828, + "rewards/rejected": 6.592593669891357, + "step": 663 + }, + { + "epoch": 0.11, + "learning_rate": 9.985077654036559e-06, + "logits/chosen": -0.9574751853942871, + "logits/rejected": -0.9687936305999756, + "logps/chosen": -125.76007843017578, + "logps/rejected": -172.48109436035156, + "loss": 1.5127, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8466240167617798, + "rewards/margins": -0.596642255783081, + "rewards/rejected": 1.4432662725448608, + "step": 664 + }, + { + "epoch": 0.11, + "learning_rate": 9.984976019792984e-06, + "logits/chosen": -0.9702019095420837, + "logits/rejected": -0.9650341272354126, + "logps/chosen": -92.13566589355469, + "logps/rejected": -77.30703735351562, + "loss": 0.9327, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4910614490509033, + "rewards/margins": 0.0026520490646362305, + "rewards/rejected": 1.488409399986267, + "step": 665 + }, + { + "epoch": 0.11, + "learning_rate": 9.984874041134738e-06, + "logits/chosen": -0.8387097716331482, + "logits/rejected": -0.8638694286346436, + "logps/chosen": -59.51205825805664, + "logps/rejected": -136.78802490234375, + "loss": 1.5992, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2802531719207764, + "rewards/margins": -3.0291125774383545, + "rewards/rejected": 5.309365749359131, + "step": 666 + }, + { + "epoch": 0.11, + "learning_rate": 9.984771718068863e-06, + "logits/chosen": -0.49927303194999695, + "logits/rejected": -0.4909573495388031, + "logps/chosen": -65.36701202392578, + "logps/rejected": -100.27182006835938, + "loss": 0.513, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4457405805587769, + "rewards/margins": -0.5475432872772217, + "rewards/rejected": 1.9932838678359985, + "step": 667 + }, + { + "epoch": 0.11, + "learning_rate": 9.984669050602426e-06, + "logits/chosen": -0.5216274261474609, + "logits/rejected": -0.543450117111206, + "logps/chosen": -71.85025024414062, + "logps/rejected": -84.34861755371094, + "loss": 0.3317, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2983719110488892, + "rewards/margins": 0.16286396980285645, + "rewards/rejected": 1.1355079412460327, + "step": 668 + }, + { + "epoch": 0.11, + "learning_rate": 9.984566038742524e-06, + "logits/chosen": -0.5951728224754333, + "logits/rejected": -0.5364352464675903, + "logps/chosen": -76.56573486328125, + "logps/rejected": -71.78386688232422, + "loss": 1.6892, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0313050746917725, + "rewards/margins": -1.4186835289001465, + "rewards/rejected": 3.449988603591919, + "step": 669 + }, + { + "epoch": 0.11, + "learning_rate": 9.984462682496274e-06, + "logits/chosen": -0.41992297768592834, + "logits/rejected": -0.3754485845565796, + "logps/chosen": -68.16712951660156, + "logps/rejected": -128.62290954589844, + "loss": 1.1752, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5581512451171875, + "rewards/margins": -2.1635377407073975, + "rewards/rejected": 3.721688985824585, + "step": 670 + }, + { + "epoch": 0.11, + "learning_rate": 9.984358981870815e-06, + "logits/chosen": -0.6687353849411011, + "logits/rejected": -0.6875959634780884, + "logps/chosen": -38.18956756591797, + "logps/rejected": -74.7028579711914, + "loss": 0.8586, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.631098210811615, + "rewards/margins": -1.388763427734375, + "rewards/rejected": 2.0198616981506348, + "step": 671 + }, + { + "epoch": 0.11, + "learning_rate": 9.984254936873315e-06, + "logits/chosen": -0.7489427328109741, + "logits/rejected": -0.6989707946777344, + "logps/chosen": -100.41250610351562, + "logps/rejected": -117.50641632080078, + "loss": 0.1125, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6974761486053467, + "rewards/margins": 1.437502145767212, + "rewards/rejected": 2.2599740028381348, + "step": 672 + }, + { + "epoch": 0.11, + "learning_rate": 9.984150547510959e-06, + "logits/chosen": -0.5111665725708008, + "logits/rejected": -0.5512990951538086, + "logps/chosen": -4.6757402420043945, + "logps/rejected": -34.67827606201172, + "loss": 0.4209, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.25917983055114746, + "rewards/margins": -0.17344766855239868, + "rewards/rejected": 0.43262749910354614, + "step": 673 + }, + { + "epoch": 0.11, + "learning_rate": 9.984045813790959e-06, + "logits/chosen": -1.3778038024902344, + "logits/rejected": -1.4580962657928467, + "logps/chosen": -148.58041381835938, + "logps/rejected": -171.2393035888672, + "loss": 1.2217, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.007718086242676, + "rewards/margins": 0.11389303207397461, + "rewards/rejected": 5.893825054168701, + "step": 674 + }, + { + "epoch": 0.11, + "learning_rate": 9.983940735720553e-06, + "logits/chosen": -0.8209161162376404, + "logits/rejected": -0.8003697991371155, + "logps/chosen": -71.48531341552734, + "logps/rejected": -50.83747482299805, + "loss": 1.1157, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2456505298614502, + "rewards/margins": -0.2978907823562622, + "rewards/rejected": 1.5435413122177124, + "step": 675 + }, + { + "epoch": 0.11, + "learning_rate": 9.983835313307003e-06, + "logits/chosen": -0.4640403985977173, + "logits/rejected": -0.447530597448349, + "logps/chosen": -32.83407211303711, + "logps/rejected": -24.87900161743164, + "loss": 0.297, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07208748161792755, + "rewards/margins": 0.2945674657821655, + "rewards/rejected": -0.3666549623012543, + "step": 676 + }, + { + "epoch": 0.11, + "learning_rate": 9.983729546557587e-06, + "logits/chosen": -0.8395902514457703, + "logits/rejected": -0.7950320839881897, + "logps/chosen": -57.89530944824219, + "logps/rejected": -53.7486572265625, + "loss": 0.8276, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4872452020645142, + "rewards/margins": -0.6974509954452515, + "rewards/rejected": 2.1846961975097656, + "step": 677 + }, + { + "epoch": 0.11, + "learning_rate": 9.98362343547962e-06, + "logits/chosen": -0.8199606537818909, + "logits/rejected": -0.813235878944397, + "logps/chosen": -68.75814819335938, + "logps/rejected": -24.485570907592773, + "loss": 0.2787, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.065072774887085, + "rewards/margins": 0.38597941398620605, + "rewards/rejected": 1.679093360900879, + "step": 678 + }, + { + "epoch": 0.11, + "learning_rate": 9.983516980080426e-06, + "logits/chosen": -0.6064648628234863, + "logits/rejected": -0.5256586670875549, + "logps/chosen": -89.66605377197266, + "logps/rejected": -83.82986450195312, + "loss": 0.1091, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.882650136947632, + "rewards/margins": 2.7715203762054443, + "rewards/rejected": 1.1111297607421875, + "step": 679 + }, + { + "epoch": 0.11, + "learning_rate": 9.983410180367364e-06, + "logits/chosen": -0.5197454690933228, + "logits/rejected": -0.5118657350540161, + "logps/chosen": -162.50341796875, + "logps/rejected": -148.8353271484375, + "loss": 0.8822, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.572762966156006, + "rewards/margins": 4.787075519561768, + "rewards/rejected": 0.785687267780304, + "step": 680 + }, + { + "epoch": 0.11, + "learning_rate": 9.983303036347813e-06, + "logits/chosen": -0.7551991939544678, + "logits/rejected": -0.731501042842865, + "logps/chosen": -69.3597412109375, + "logps/rejected": -98.59440612792969, + "loss": 1.0804, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1989433765411377, + "rewards/margins": -2.0315589904785156, + "rewards/rejected": 3.2305023670196533, + "step": 681 + }, + { + "epoch": 0.11, + "learning_rate": 9.983195548029173e-06, + "logits/chosen": -0.5160420536994934, + "logits/rejected": -0.5173275470733643, + "logps/chosen": -13.774977684020996, + "logps/rejected": -17.939359664916992, + "loss": 0.3644, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5966975092887878, + "rewards/margins": 0.097502201795578, + "rewards/rejected": 0.49919530749320984, + "step": 682 + }, + { + "epoch": 0.11, + "learning_rate": 9.983087715418873e-06, + "logits/chosen": -0.771435022354126, + "logits/rejected": -0.7641526460647583, + "logps/chosen": -88.5159912109375, + "logps/rejected": -59.309967041015625, + "loss": 1.9085, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.532696545124054, + "rewards/margins": -1.023850917816162, + "rewards/rejected": 1.5565475225448608, + "step": 683 + }, + { + "epoch": 0.11, + "learning_rate": 9.982979538524363e-06, + "logits/chosen": -0.8566082715988159, + "logits/rejected": -0.6943625211715698, + "logps/chosen": -206.08523559570312, + "logps/rejected": -73.67176818847656, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.866906642913818, + "rewards/margins": 4.8726019859313965, + "rewards/rejected": 0.9943046569824219, + "step": 684 + }, + { + "epoch": 0.11, + "learning_rate": 9.982871017353114e-06, + "logits/chosen": -0.628173828125, + "logits/rejected": -0.6249974966049194, + "logps/chosen": -55.04767608642578, + "logps/rejected": -21.694473266601562, + "loss": 0.2115, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7369186878204346, + "rewards/margins": 0.7814610004425049, + "rewards/rejected": 0.9554576873779297, + "step": 685 + }, + { + "epoch": 0.11, + "learning_rate": 9.982762151912628e-06, + "logits/chosen": -0.8918871283531189, + "logits/rejected": -0.8324743509292603, + "logps/chosen": -91.66557312011719, + "logps/rejected": -32.19429016113281, + "loss": 0.3047, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1295150518417358, + "rewards/margins": 0.9376128911972046, + "rewards/rejected": 0.19190216064453125, + "step": 686 + }, + { + "epoch": 0.11, + "learning_rate": 9.982652942210425e-06, + "logits/chosen": -0.7589506506919861, + "logits/rejected": -0.8069684505462646, + "logps/chosen": -132.6678009033203, + "logps/rejected": -101.17501831054688, + "loss": 0.7127, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.0789031982421875, + "rewards/margins": 1.219355821609497, + "rewards/rejected": 3.8595473766326904, + "step": 687 + }, + { + "epoch": 0.11, + "learning_rate": 9.982543388254047e-06, + "logits/chosen": -0.8278665542602539, + "logits/rejected": -0.7859882712364197, + "logps/chosen": -108.24244689941406, + "logps/rejected": -72.0189208984375, + "loss": 0.7691, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0158042907714844, + "rewards/margins": -0.07942426204681396, + "rewards/rejected": 1.0952285528182983, + "step": 688 + }, + { + "epoch": 0.11, + "learning_rate": 9.982433490051069e-06, + "logits/chosen": -0.5733004212379456, + "logits/rejected": -0.5664175152778625, + "logps/chosen": -58.918434143066406, + "logps/rejected": -77.808837890625, + "loss": 0.6831, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.663011908531189, + "rewards/margins": -0.2615257501602173, + "rewards/rejected": 1.9245376586914062, + "step": 689 + }, + { + "epoch": 0.11, + "learning_rate": 9.98232324760908e-06, + "logits/chosen": -0.5516432523727417, + "logits/rejected": -0.5597972869873047, + "logps/chosen": -56.268455505371094, + "logps/rejected": -70.02266693115234, + "loss": 1.1404, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3540626764297485, + "rewards/margins": 0.15822601318359375, + "rewards/rejected": 1.1958366632461548, + "step": 690 + }, + { + "epoch": 0.11, + "learning_rate": 9.9822126609357e-06, + "logits/chosen": -0.6134743690490723, + "logits/rejected": -0.6388095617294312, + "logps/chosen": -104.99285125732422, + "logps/rejected": -171.84967041015625, + "loss": 2.3428, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5437018871307373, + "rewards/margins": -4.661138534545898, + "rewards/rejected": 8.204840660095215, + "step": 691 + }, + { + "epoch": 0.11, + "learning_rate": 9.982101730038564e-06, + "logits/chosen": -1.2596445083618164, + "logits/rejected": -1.257311463356018, + "logps/chosen": -51.919044494628906, + "logps/rejected": -39.35533905029297, + "loss": 0.9258, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8664238452911377, + "rewards/margins": 1.538130283355713, + "rewards/rejected": 0.3282936215400696, + "step": 692 + }, + { + "epoch": 0.11, + "learning_rate": 9.981990454925341e-06, + "logits/chosen": -0.6464782953262329, + "logits/rejected": -0.591360330581665, + "logps/chosen": -48.789669036865234, + "logps/rejected": -68.56466674804688, + "loss": 1.3705, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7773135900497437, + "rewards/margins": -0.4301656484603882, + "rewards/rejected": 2.207479238510132, + "step": 693 + }, + { + "epoch": 0.11, + "learning_rate": 9.981878835603718e-06, + "logits/chosen": -0.6871005296707153, + "logits/rejected": -0.7052403688430786, + "logps/chosen": -87.34759521484375, + "logps/rejected": -116.1009750366211, + "loss": 0.5623, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.7611520290374756, + "rewards/margins": -0.70235276222229, + "rewards/rejected": 4.463504791259766, + "step": 694 + }, + { + "epoch": 0.11, + "learning_rate": 9.981766872081404e-06, + "logits/chosen": -0.6069501042366028, + "logits/rejected": -0.6069501042366028, + "logps/chosen": -24.99150848388672, + "logps/rejected": -24.99150848388672, + "loss": 0.4067, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8381744623184204, + "rewards/margins": 0.0, + "rewards/rejected": 1.8381744623184204, + "step": 695 + }, + { + "epoch": 0.11, + "learning_rate": 9.98165456436614e-06, + "logits/chosen": -0.7236467599868774, + "logits/rejected": -0.7748644948005676, + "logps/chosen": -41.43053436279297, + "logps/rejected": -105.48233032226562, + "loss": 1.2469, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.340439558029175, + "rewards/margins": -1.9600822925567627, + "rewards/rejected": 5.3005218505859375, + "step": 696 + }, + { + "epoch": 0.11, + "learning_rate": 9.981541912465681e-06, + "logits/chosen": -0.7225365042686462, + "logits/rejected": -0.6595582365989685, + "logps/chosen": -66.67213439941406, + "logps/rejected": -68.30500030517578, + "loss": 0.453, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8325432538986206, + "rewards/margins": 0.2237091064453125, + "rewards/rejected": 1.608834147453308, + "step": 697 + }, + { + "epoch": 0.11, + "learning_rate": 9.981428916387812e-06, + "logits/chosen": -0.7968806028366089, + "logits/rejected": -0.8357089161872864, + "logps/chosen": -111.84918212890625, + "logps/rejected": -257.4880676269531, + "loss": 2.5464, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.087730407714844, + "rewards/margins": -5.0584516525268555, + "rewards/rejected": 10.1461820602417, + "step": 698 + }, + { + "epoch": 0.11, + "learning_rate": 9.98131557614034e-06, + "logits/chosen": -0.8389581441879272, + "logits/rejected": -0.8516178727149963, + "logps/chosen": -110.79090881347656, + "logps/rejected": -56.402706146240234, + "loss": 0.6169, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8531387448310852, + "rewards/margins": -0.6694889664649963, + "rewards/rejected": 1.5226277112960815, + "step": 699 + }, + { + "epoch": 0.11, + "learning_rate": 9.981201891731094e-06, + "logits/chosen": -1.2078579664230347, + "logits/rejected": -1.1840815544128418, + "logps/chosen": -51.682464599609375, + "logps/rejected": -82.22409057617188, + "loss": 0.2516, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.099522352218628, + "rewards/margins": 0.44359421730041504, + "rewards/rejected": 2.655928134918213, + "step": 700 + }, + { + "epoch": 0.11, + "learning_rate": 9.981087863167932e-06, + "logits/chosen": -0.4789288341999054, + "logits/rejected": -0.4789288341999054, + "logps/chosen": -65.87260437011719, + "logps/rejected": -65.87260437011719, + "loss": 0.3816, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.401265859603882, + "rewards/margins": 0.0, + "rewards/rejected": 2.401265859603882, + "step": 701 + }, + { + "epoch": 0.11, + "learning_rate": 9.980973490458728e-06, + "logits/chosen": -0.6903930902481079, + "logits/rejected": -0.5959444046020508, + "logps/chosen": -42.16409683227539, + "logps/rejected": -31.46219253540039, + "loss": 0.3795, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4393504858016968, + "rewards/margins": 0.0812220573425293, + "rewards/rejected": 1.3581284284591675, + "step": 702 + }, + { + "epoch": 0.11, + "learning_rate": 9.980858773611387e-06, + "logits/chosen": -0.3320261538028717, + "logits/rejected": -0.3320261538028717, + "logps/chosen": -35.938270568847656, + "logps/rejected": -35.938270568847656, + "loss": 1.2857, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.048766326159238815, + "rewards/margins": 0.0, + "rewards/rejected": 0.048766326159238815, + "step": 703 + }, + { + "epoch": 0.11, + "learning_rate": 9.980743712633835e-06, + "logits/chosen": -0.8363443613052368, + "logits/rejected": -0.7180845141410828, + "logps/chosen": -90.62977600097656, + "logps/rejected": -36.146785736083984, + "loss": 2.1641, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.518277883529663, + "rewards/margins": 2.1766960620880127, + "rewards/rejected": 0.3415817320346832, + "step": 704 + }, + { + "epoch": 0.11, + "learning_rate": 9.98062830753402e-06, + "logits/chosen": -0.725279688835144, + "logits/rejected": -0.5752593874931335, + "logps/chosen": -95.86398315429688, + "logps/rejected": -104.23431396484375, + "loss": 0.2446, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8849105834960938, + "rewards/margins": 0.6918089389801025, + "rewards/rejected": 2.193101644515991, + "step": 705 + }, + { + "epoch": 0.11, + "learning_rate": 9.980512558319915e-06, + "logits/chosen": -0.6538907289505005, + "logits/rejected": -0.6675034165382385, + "logps/chosen": -75.74072265625, + "logps/rejected": -70.90702819824219, + "loss": 0.4636, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.810927629470825, + "rewards/margins": 1.3719696998596191, + "rewards/rejected": 2.438957929611206, + "step": 706 + }, + { + "epoch": 0.11, + "learning_rate": 9.98039646499952e-06, + "logits/chosen": -0.4990323781967163, + "logits/rejected": -0.4972372055053711, + "logps/chosen": -9.28506088256836, + "logps/rejected": -2.883862018585205, + "loss": 0.6524, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.02469453774392605, + "rewards/margins": -0.29198989272117615, + "rewards/rejected": 0.26729536056518555, + "step": 707 + }, + { + "epoch": 0.11, + "learning_rate": 9.980280027580853e-06, + "logits/chosen": -0.5909301042556763, + "logits/rejected": -0.5809358954429626, + "logps/chosen": -64.16731262207031, + "logps/rejected": -54.79951477050781, + "loss": 0.9913, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5725166201591492, + "rewards/margins": -0.7030236124992371, + "rewards/rejected": 1.2755402326583862, + "step": 708 + }, + { + "epoch": 0.12, + "learning_rate": 9.980163246071962e-06, + "logits/chosen": -0.9997628331184387, + "logits/rejected": -0.9594684839248657, + "logps/chosen": -85.4676742553711, + "logps/rejected": -34.50508117675781, + "loss": 0.2459, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8503258228302, + "rewards/margins": 2.4721245765686035, + "rewards/rejected": 0.37820130586624146, + "step": 709 + }, + { + "epoch": 0.12, + "learning_rate": 9.980046120480911e-06, + "logits/chosen": -0.8633772134780884, + "logits/rejected": -0.8950476050376892, + "logps/chosen": -99.70642852783203, + "logps/rejected": -96.63680267333984, + "loss": 0.891, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8967941999435425, + "rewards/margins": -1.394457221031189, + "rewards/rejected": 3.2912514209747314, + "step": 710 + }, + { + "epoch": 0.12, + "learning_rate": 9.979928650815796e-06, + "logits/chosen": -0.8101614713668823, + "logits/rejected": -0.8318270444869995, + "logps/chosen": -108.68179321289062, + "logps/rejected": -62.80309295654297, + "loss": 0.7555, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8103469610214233, + "rewards/margins": -0.33868634700775146, + "rewards/rejected": 2.149033308029175, + "step": 711 + }, + { + "epoch": 0.12, + "learning_rate": 9.979810837084731e-06, + "logits/chosen": -0.7607553005218506, + "logits/rejected": -0.6503674387931824, + "logps/chosen": -44.083396911621094, + "logps/rejected": -43.78749465942383, + "loss": 2.0718, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5731128454208374, + "rewards/margins": 0.034777045249938965, + "rewards/rejected": 1.5383358001708984, + "step": 712 + }, + { + "epoch": 0.12, + "learning_rate": 9.979692679295856e-06, + "logits/chosen": -0.7119683027267456, + "logits/rejected": -0.651546061038971, + "logps/chosen": -50.968055725097656, + "logps/rejected": -51.50328063964844, + "loss": 0.2821, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.971583604812622, + "rewards/margins": 0.35766148567199707, + "rewards/rejected": 1.613922119140625, + "step": 713 + }, + { + "epoch": 0.12, + "learning_rate": 9.979574177457337e-06, + "logits/chosen": -0.7169824242591858, + "logits/rejected": -0.6661929488182068, + "logps/chosen": -43.15239715576172, + "logps/rejected": -58.29057312011719, + "loss": 1.301, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9822078943252563, + "rewards/margins": -0.1890476942062378, + "rewards/rejected": 2.171255588531494, + "step": 714 + }, + { + "epoch": 0.12, + "learning_rate": 9.97945533157736e-06, + "logits/chosen": -0.5118880271911621, + "logits/rejected": -0.49669408798217773, + "logps/chosen": -63.827049255371094, + "logps/rejected": -112.16144561767578, + "loss": 0.1909, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4813125133514404, + "rewards/margins": 0.8720420598983765, + "rewards/rejected": 1.609270453453064, + "step": 715 + }, + { + "epoch": 0.12, + "learning_rate": 9.979336141664134e-06, + "logits/chosen": -0.34294894337654114, + "logits/rejected": -0.27955588698387146, + "logps/chosen": -68.05892944335938, + "logps/rejected": -74.95826721191406, + "loss": 0.7538, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8595108389854431, + "rewards/margins": -0.8711349368095398, + "rewards/rejected": 1.730645775794983, + "step": 716 + }, + { + "epoch": 0.12, + "learning_rate": 9.979216607725895e-06, + "logits/chosen": -1.0044941902160645, + "logits/rejected": -0.9510228633880615, + "logps/chosen": -50.68760681152344, + "logps/rejected": -58.18498992919922, + "loss": 0.6735, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7061538696289062, + "rewards/margins": -0.15066981315612793, + "rewards/rejected": 2.856823682785034, + "step": 717 + }, + { + "epoch": 0.12, + "learning_rate": 9.979096729770902e-06, + "logits/chosen": -0.7732868790626526, + "logits/rejected": -0.7027267813682556, + "logps/chosen": -127.08058166503906, + "logps/rejected": -88.22625732421875, + "loss": 0.3727, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.535016059875488, + "rewards/margins": 3.119197368621826, + "rewards/rejected": 4.415818691253662, + "step": 718 + }, + { + "epoch": 0.12, + "learning_rate": 9.97897650780744e-06, + "logits/chosen": -0.3341648578643799, + "logits/rejected": -0.3280927836894989, + "logps/chosen": -48.374141693115234, + "logps/rejected": -19.0304012298584, + "loss": 0.4454, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.21959877014160156, + "rewards/margins": -0.11476212739944458, + "rewards/rejected": 0.33436089754104614, + "step": 719 + }, + { + "epoch": 0.12, + "learning_rate": 9.97885594184381e-06, + "logits/chosen": -1.0811896324157715, + "logits/rejected": -0.8823473453521729, + "logps/chosen": -108.189697265625, + "logps/rejected": -27.684436798095703, + "loss": 0.0733, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.861900329589844, + "rewards/margins": 6.461861610412598, + "rewards/rejected": 0.40003854036331177, + "step": 720 + }, + { + "epoch": 0.12, + "learning_rate": 9.978735031888347e-06, + "logits/chosen": -0.6438223123550415, + "logits/rejected": -0.5720660090446472, + "logps/chosen": -192.59848022460938, + "logps/rejected": -79.13803100585938, + "loss": 0.5952, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4182159900665283, + "rewards/margins": -0.8116354942321777, + "rewards/rejected": 3.229851484298706, + "step": 721 + }, + { + "epoch": 0.12, + "learning_rate": 9.978613777949401e-06, + "logits/chosen": -1.0370086431503296, + "logits/rejected": -0.9818555116653442, + "logps/chosen": -121.3179931640625, + "logps/rejected": -106.36182403564453, + "loss": 2.3261, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1821823120117188, + "rewards/margins": -3.1626200675964355, + "rewards/rejected": 4.344802379608154, + "step": 722 + }, + { + "epoch": 0.12, + "learning_rate": 9.978492180035351e-06, + "logits/chosen": -0.7498911023139954, + "logits/rejected": -0.6494044065475464, + "logps/chosen": -109.53337097167969, + "logps/rejected": -66.04582214355469, + "loss": 0.0741, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.4956817626953125, + "rewards/margins": 2.1690521240234375, + "rewards/rejected": 2.326629638671875, + "step": 723 + }, + { + "epoch": 0.12, + "learning_rate": 9.9783702381546e-06, + "logits/chosen": -0.6095611453056335, + "logits/rejected": -0.6297899484634399, + "logps/chosen": -56.590572357177734, + "logps/rejected": -67.90437316894531, + "loss": 0.5401, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3443844318389893, + "rewards/margins": -0.1413288116455078, + "rewards/rejected": 1.485713243484497, + "step": 724 + }, + { + "epoch": 0.12, + "learning_rate": 9.978247952315569e-06, + "logits/chosen": -0.8449879884719849, + "logits/rejected": -0.8651789426803589, + "logps/chosen": -97.14952087402344, + "logps/rejected": -104.9227066040039, + "loss": 0.5252, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0902503728866577, + "rewards/margins": -0.36321496963500977, + "rewards/rejected": 1.4534653425216675, + "step": 725 + }, + { + "epoch": 0.12, + "learning_rate": 9.978125322526711e-06, + "logits/chosen": -0.15690600872039795, + "logits/rejected": -0.15690600872039795, + "logps/chosen": -1.8873647451400757, + "logps/rejected": -1.8873647451400757, + "loss": 1.3135, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.308817058801651, + "rewards/margins": 0.0, + "rewards/rejected": 0.308817058801651, + "step": 726 + }, + { + "epoch": 0.12, + "learning_rate": 9.978002348796496e-06, + "logits/chosen": -0.8101252317428589, + "logits/rejected": -0.8576610088348389, + "logps/chosen": -45.248924255371094, + "logps/rejected": -99.50193786621094, + "loss": 2.4075, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3577873706817627, + "rewards/margins": -2.944154977798462, + "rewards/rejected": 5.301942348480225, + "step": 727 + }, + { + "epoch": 0.12, + "learning_rate": 9.977879031133422e-06, + "logits/chosen": -1.123915433883667, + "logits/rejected": -0.8997319340705872, + "logps/chosen": -106.5888442993164, + "logps/rejected": -26.28521156311035, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.219552040100098, + "rewards/margins": 4.559628963470459, + "rewards/rejected": 0.6599230170249939, + "step": 728 + }, + { + "epoch": 0.12, + "learning_rate": 9.977755369546007e-06, + "logits/chosen": -0.6727814674377441, + "logits/rejected": -0.602232038974762, + "logps/chosen": -100.19525146484375, + "logps/rejected": -50.14093780517578, + "loss": 0.2848, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8759812116622925, + "rewards/margins": 0.8217365741729736, + "rewards/rejected": 1.0542446374893188, + "step": 729 + }, + { + "epoch": 0.12, + "learning_rate": 9.977631364042796e-06, + "logits/chosen": -1.0758191347122192, + "logits/rejected": -0.9245602488517761, + "logps/chosen": -102.80347442626953, + "logps/rejected": -20.636653900146484, + "loss": 0.2359, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.974524974822998, + "rewards/margins": 5.602066993713379, + "rewards/rejected": 0.3724580705165863, + "step": 730 + }, + { + "epoch": 0.12, + "learning_rate": 9.977507014632357e-06, + "logits/chosen": -0.4112186133861542, + "logits/rejected": -0.40448683500289917, + "logps/chosen": -37.22536087036133, + "logps/rejected": -48.572654724121094, + "loss": 0.3319, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9165577292442322, + "rewards/margins": 0.1457080841064453, + "rewards/rejected": 0.7708496451377869, + "step": 731 + }, + { + "epoch": 0.12, + "learning_rate": 9.97738232132328e-06, + "logits/chosen": -0.3487436771392822, + "logits/rejected": -0.47225236892700195, + "logps/chosen": -153.87911987304688, + "logps/rejected": -110.6546630859375, + "loss": 0.2262, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.854107618331909, + "rewards/margins": 0.7287368774414062, + "rewards/rejected": 3.125370740890503, + "step": 732 + }, + { + "epoch": 0.12, + "learning_rate": 9.97725728412418e-06, + "logits/chosen": -0.5203855037689209, + "logits/rejected": -0.5761988759040833, + "logps/chosen": -69.71732330322266, + "logps/rejected": -58.812992095947266, + "loss": 1.278, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2526527643203735, + "rewards/margins": 0.04611015319824219, + "rewards/rejected": 1.2065426111221313, + "step": 733 + }, + { + "epoch": 0.12, + "learning_rate": 9.9771319030437e-06, + "logits/chosen": -0.3401697874069214, + "logits/rejected": -0.399186909198761, + "logps/chosen": -55.07588577270508, + "logps/rejected": -138.22872924804688, + "loss": 0.2617, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.955025851726532, + "rewards/margins": 1.0069301128387451, + "rewards/rejected": -0.05190429836511612, + "step": 734 + }, + { + "epoch": 0.12, + "learning_rate": 9.977006178090498e-06, + "logits/chosen": -0.5961336493492126, + "logits/rejected": -0.69386225938797, + "logps/chosen": -68.68577575683594, + "logps/rejected": -102.98311614990234, + "loss": 1.8305, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.487834930419922, + "rewards/margins": -2.608555793762207, + "rewards/rejected": 5.096390724182129, + "step": 735 + }, + { + "epoch": 0.12, + "learning_rate": 9.976880109273262e-06, + "logits/chosen": -0.8270601630210876, + "logits/rejected": -0.8441038131713867, + "logps/chosen": -98.46324920654297, + "logps/rejected": -122.89474487304688, + "loss": 0.9854, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.673324108123779, + "rewards/margins": 0.5040798187255859, + "rewards/rejected": 4.169244289398193, + "step": 736 + }, + { + "epoch": 0.12, + "learning_rate": 9.976753696600701e-06, + "logits/chosen": -0.7772899866104126, + "logits/rejected": -0.6128539443016052, + "logps/chosen": -110.50951385498047, + "logps/rejected": -72.44187927246094, + "loss": 0.4947, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.038515567779541, + "rewards/margins": 0.48912739753723145, + "rewards/rejected": 2.5493881702423096, + "step": 737 + }, + { + "epoch": 0.12, + "learning_rate": 9.976626940081553e-06, + "logits/chosen": -1.7684682607650757, + "logits/rejected": -1.8079043626785278, + "logps/chosen": -56.303653717041016, + "logps/rejected": -22.793790817260742, + "loss": 0.1926, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2418384552001953, + "rewards/margins": 0.9318687319755554, + "rewards/rejected": 0.3099697232246399, + "step": 738 + }, + { + "epoch": 0.12, + "learning_rate": 9.97649983972457e-06, + "logits/chosen": -0.6856249570846558, + "logits/rejected": -0.7009227275848389, + "logps/chosen": -241.09239196777344, + "logps/rejected": -240.5465087890625, + "loss": 0.794, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.339575171470642, + "rewards/margins": -1.351873755455017, + "rewards/rejected": 2.691448926925659, + "step": 739 + }, + { + "epoch": 0.12, + "learning_rate": 9.976372395538537e-06, + "logits/chosen": -0.8247346878051758, + "logits/rejected": -0.786593496799469, + "logps/chosen": -90.73246765136719, + "logps/rejected": -94.75305938720703, + "loss": 0.9327, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.24838104844093323, + "rewards/margins": -1.6751670837402344, + "rewards/rejected": 1.9235481023788452, + "step": 740 + }, + { + "epoch": 0.12, + "learning_rate": 9.976244607532259e-06, + "logits/chosen": -0.7864727973937988, + "logits/rejected": -0.8422004580497742, + "logps/chosen": -88.15180969238281, + "logps/rejected": -85.49164581298828, + "loss": 2.0455, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7596748471260071, + "rewards/margins": -1.1889586448669434, + "rewards/rejected": 1.9486335515975952, + "step": 741 + }, + { + "epoch": 0.12, + "learning_rate": 9.976116475714564e-06, + "logits/chosen": -0.6893150806427002, + "logits/rejected": -0.6389269828796387, + "logps/chosen": -77.76048278808594, + "logps/rejected": -84.30455780029297, + "loss": 0.4988, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.67331862449646, + "rewards/margins": -0.06494283676147461, + "rewards/rejected": 2.7382614612579346, + "step": 742 + }, + { + "epoch": 0.12, + "learning_rate": 9.975988000094303e-06, + "logits/chosen": -0.8817047476768494, + "logits/rejected": -0.8812268972396851, + "logps/chosen": -232.9547576904297, + "logps/rejected": -157.13296508789062, + "loss": 0.3204, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7204055786132812, + "rewards/margins": 0.8243544101715088, + "rewards/rejected": 2.8960511684417725, + "step": 743 + }, + { + "epoch": 0.12, + "learning_rate": 9.975859180680356e-06, + "logits/chosen": -0.6516534090042114, + "logits/rejected": -0.6321042776107788, + "logps/chosen": -170.09402465820312, + "logps/rejected": -51.59013748168945, + "loss": 0.0628, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.069369792938232, + "rewards/margins": 2.0452589988708496, + "rewards/rejected": 2.024110794067383, + "step": 744 + }, + { + "epoch": 0.12, + "learning_rate": 9.975730017481622e-06, + "logits/chosen": -0.7604137063026428, + "logits/rejected": -0.7604137063026428, + "logps/chosen": -95.62124633789062, + "logps/rejected": -95.62124633789062, + "loss": 0.9425, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.43867564201355, + "rewards/margins": 0.0, + "rewards/rejected": 2.43867564201355, + "step": 745 + }, + { + "epoch": 0.12, + "learning_rate": 9.975600510507025e-06, + "logits/chosen": -0.9037715196609497, + "logits/rejected": -0.6899211406707764, + "logps/chosen": -95.40794372558594, + "logps/rejected": -29.205137252807617, + "loss": 0.8386, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.363513231277466, + "rewards/margins": 3.1853904724121094, + "rewards/rejected": 0.17812271416187286, + "step": 746 + }, + { + "epoch": 0.12, + "learning_rate": 9.975470659765512e-06, + "logits/chosen": -0.7996866703033447, + "logits/rejected": -0.7478358149528503, + "logps/chosen": -77.67544555664062, + "logps/rejected": -84.69586944580078, + "loss": 0.3673, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.630548119544983, + "rewards/margins": 0.12226486206054688, + "rewards/rejected": 1.508283257484436, + "step": 747 + }, + { + "epoch": 0.12, + "learning_rate": 9.975340465266054e-06, + "logits/chosen": -0.6691012978553772, + "logits/rejected": -0.6127352118492126, + "logps/chosen": -149.63461303710938, + "logps/rejected": -76.19654846191406, + "loss": 0.0502, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.64216947555542, + "rewards/margins": 2.318351984024048, + "rewards/rejected": 2.323817491531372, + "step": 748 + }, + { + "epoch": 0.12, + "learning_rate": 9.975209927017645e-06, + "logits/chosen": -0.7360288500785828, + "logits/rejected": -0.6519693732261658, + "logps/chosen": -71.50254821777344, + "logps/rejected": -21.51930809020996, + "loss": 0.1462, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8660354614257812, + "rewards/margins": 1.6881967782974243, + "rewards/rejected": 0.17783871293067932, + "step": 749 + }, + { + "epoch": 0.12, + "learning_rate": 9.97507904502931e-06, + "logits/chosen": -0.055586330592632294, + "logits/rejected": -0.055586330592632294, + "logps/chosen": -78.05609893798828, + "logps/rejected": -78.05609893798828, + "loss": 1.2891, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.050849150866270065, + "rewards/margins": 0.0, + "rewards/rejected": -0.050849150866270065, + "step": 750 + }, + { + "epoch": 0.12, + "learning_rate": 9.974947819310086e-06, + "logits/chosen": -0.8824347853660583, + "logits/rejected": -0.720232367515564, + "logps/chosen": -122.73779296875, + "logps/rejected": -109.64802551269531, + "loss": 0.2067, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.627227783203125, + "rewards/margins": 4.204503059387207, + "rewards/rejected": 1.422724962234497, + "step": 751 + }, + { + "epoch": 0.12, + "learning_rate": 9.974816249869041e-06, + "logits/chosen": -0.6821689605712891, + "logits/rejected": -0.6879473924636841, + "logps/chosen": -52.905548095703125, + "logps/rejected": -147.01983642578125, + "loss": 0.5426, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6408478021621704, + "rewards/margins": -0.18738102912902832, + "rewards/rejected": 1.8282288312911987, + "step": 752 + }, + { + "epoch": 0.12, + "learning_rate": 9.974684336715264e-06, + "logits/chosen": -0.9630001187324524, + "logits/rejected": -0.8853737115859985, + "logps/chosen": -272.4930419921875, + "logps/rejected": -117.11151123046875, + "loss": 0.4273, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.277740478515625, + "rewards/margins": 0.3814406394958496, + "rewards/rejected": 5.896299839019775, + "step": 753 + }, + { + "epoch": 0.12, + "learning_rate": 9.974552079857873e-06, + "logits/chosen": -0.9850178360939026, + "logits/rejected": -0.921867847442627, + "logps/chosen": -143.8564910888672, + "logps/rejected": -149.27053833007812, + "loss": 0.4673, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.284923076629639, + "rewards/margins": -0.41149139404296875, + "rewards/rejected": 4.696414470672607, + "step": 754 + }, + { + "epoch": 0.12, + "learning_rate": 9.974419479306001e-06, + "logits/chosen": -0.8730838298797607, + "logits/rejected": -0.9178008437156677, + "logps/chosen": -69.26423645019531, + "logps/rejected": -114.66316223144531, + "loss": 1.2644, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6860610842704773, + "rewards/margins": -2.3982512950897217, + "rewards/rejected": 3.0843124389648438, + "step": 755 + }, + { + "epoch": 0.12, + "learning_rate": 9.97428653506881e-06, + "logits/chosen": -0.75432288646698, + "logits/rejected": -0.773086667060852, + "logps/chosen": -62.75062561035156, + "logps/rejected": -50.838417053222656, + "loss": 0.2739, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.160695791244507, + "rewards/margins": 0.6283951997756958, + "rewards/rejected": 1.532300591468811, + "step": 756 + }, + { + "epoch": 0.12, + "learning_rate": 9.97415324715549e-06, + "logits/chosen": -0.8135929107666016, + "logits/rejected": -0.7644964456558228, + "logps/chosen": -110.28153228759766, + "logps/rejected": -11.57184886932373, + "loss": 1.2269, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3233985900878906, + "rewards/margins": 0.558360755443573, + "rewards/rejected": 0.7650378346443176, + "step": 757 + }, + { + "epoch": 0.12, + "learning_rate": 9.974019615575245e-06, + "logits/chosen": -0.8932351469993591, + "logits/rejected": -0.8329649567604065, + "logps/chosen": -72.3548812866211, + "logps/rejected": -81.20051574707031, + "loss": 1.2248, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.467836856842041, + "rewards/margins": -0.8971602916717529, + "rewards/rejected": 3.364997148513794, + "step": 758 + }, + { + "epoch": 0.12, + "learning_rate": 9.973885640337308e-06, + "logits/chosen": -0.8336666226387024, + "logits/rejected": -0.6620884537696838, + "logps/chosen": -132.40765380859375, + "logps/rejected": -48.66038513183594, + "loss": 1.3414, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.854016125202179, + "rewards/margins": -0.8546509146690369, + "rewards/rejected": 1.7086670398712158, + "step": 759 + }, + { + "epoch": 0.12, + "learning_rate": 9.973751321450937e-06, + "logits/chosen": -0.9625693559646606, + "logits/rejected": -0.9681020975112915, + "logps/chosen": -88.091064453125, + "logps/rejected": -92.56453704833984, + "loss": 2.0433, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.408644199371338, + "rewards/margins": -2.132587432861328, + "rewards/rejected": 5.541231632232666, + "step": 760 + }, + { + "epoch": 0.12, + "learning_rate": 9.973616658925413e-06, + "logits/chosen": -0.8045743107795715, + "logits/rejected": -0.8045743107795715, + "logps/chosen": -71.41626739501953, + "logps/rejected": -71.41626739501953, + "loss": 0.7443, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7911186218261719, + "rewards/margins": 0.0, + "rewards/rejected": 1.7911186218261719, + "step": 761 + }, + { + "epoch": 0.12, + "learning_rate": 9.973481652770039e-06, + "logits/chosen": -0.9526354670524597, + "logits/rejected": -0.9526354670524597, + "logps/chosen": -88.35436248779297, + "logps/rejected": -88.35436248779297, + "loss": 0.6311, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6373062133789062, + "rewards/margins": 0.0, + "rewards/rejected": 1.6373062133789062, + "step": 762 + }, + { + "epoch": 0.12, + "learning_rate": 9.97334630299414e-06, + "logits/chosen": -0.5592164397239685, + "logits/rejected": -0.5592164397239685, + "logps/chosen": -2.8131256103515625, + "logps/rejected": -2.8131256103515625, + "loss": 0.3908, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5831772089004517, + "rewards/margins": 0.0, + "rewards/rejected": 0.5831772089004517, + "step": 763 + }, + { + "epoch": 0.12, + "learning_rate": 9.973210609607071e-06, + "logits/chosen": -0.5706939697265625, + "logits/rejected": -0.36234453320503235, + "logps/chosen": -62.86003875732422, + "logps/rejected": -15.108999252319336, + "loss": 0.2441, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8553078174591064, + "rewards/margins": 2.302654981613159, + "rewards/rejected": 0.5526527762413025, + "step": 764 + }, + { + "epoch": 0.12, + "learning_rate": 9.973074572618206e-06, + "logits/chosen": -0.3059973418712616, + "logits/rejected": -0.3312360644340515, + "logps/chosen": -100.76028442382812, + "logps/rejected": -69.91593933105469, + "loss": 0.7243, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3585060834884644, + "rewards/margins": -0.009485602378845215, + "rewards/rejected": 1.3679916858673096, + "step": 765 + }, + { + "epoch": 0.12, + "learning_rate": 9.972938192036945e-06, + "logits/chosen": -0.7418822050094604, + "logits/rejected": -0.7372663617134094, + "logps/chosen": -97.48906707763672, + "logps/rejected": -78.61988830566406, + "loss": 0.7143, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.47004929184913635, + "rewards/margins": -1.1130523681640625, + "rewards/rejected": 1.5831016302108765, + "step": 766 + }, + { + "epoch": 0.12, + "learning_rate": 9.972801467872706e-06, + "logits/chosen": -0.5112615823745728, + "logits/rejected": -0.40545010566711426, + "logps/chosen": -53.01274871826172, + "logps/rejected": -71.49287414550781, + "loss": 0.6491, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.382311224937439, + "rewards/margins": 0.0305023193359375, + "rewards/rejected": 1.3518089056015015, + "step": 767 + }, + { + "epoch": 0.12, + "learning_rate": 9.972664400134942e-06, + "logits/chosen": -0.45061638951301575, + "logits/rejected": -0.45061638951301575, + "logps/chosen": -46.64482879638672, + "logps/rejected": -46.64482879638672, + "loss": 1.1271, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5338203310966492, + "rewards/margins": 0.0, + "rewards/rejected": 0.5338203310966492, + "step": 768 + }, + { + "epoch": 0.12, + "learning_rate": 9.972526988833118e-06, + "logits/chosen": -0.5560670495033264, + "logits/rejected": -0.5560670495033264, + "logps/chosen": -34.687744140625, + "logps/rejected": -34.687744140625, + "loss": 0.5056, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6313194036483765, + "rewards/margins": 0.0, + "rewards/rejected": 1.6313194036483765, + "step": 769 + }, + { + "epoch": 0.12, + "learning_rate": 9.972389233976729e-06, + "logits/chosen": -1.2343717813491821, + "logits/rejected": -1.1682144403457642, + "logps/chosen": -74.24700164794922, + "logps/rejected": -60.446510314941406, + "loss": 1.2534, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2197625637054443, + "rewards/margins": 1.922255516052246, + "rewards/rejected": 0.297507107257843, + "step": 770 + }, + { + "epoch": 0.13, + "learning_rate": 9.972251135575294e-06, + "logits/chosen": -0.45120301842689514, + "logits/rejected": -0.4608212113380432, + "logps/chosen": -8.031767845153809, + "logps/rejected": -1.9054582118988037, + "loss": 0.8767, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.12226714938879013, + "rewards/margins": -0.1863585114479065, + "rewards/rejected": 0.3086256682872772, + "step": 771 + }, + { + "epoch": 0.13, + "learning_rate": 9.972112693638354e-06, + "logits/chosen": -0.3847830593585968, + "logits/rejected": -0.343871146440506, + "logps/chosen": -6.367544174194336, + "logps/rejected": -18.24795150756836, + "loss": 0.9623, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.805946946144104, + "rewards/margins": 0.4610058069229126, + "rewards/rejected": 0.3449411392211914, + "step": 772 + }, + { + "epoch": 0.13, + "learning_rate": 9.971973908175472e-06, + "logits/chosen": -0.599999189376831, + "logits/rejected": -0.6163357496261597, + "logps/chosen": -72.651611328125, + "logps/rejected": -112.73307800292969, + "loss": 1.3305, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6075286865234375, + "rewards/margins": 1.2280197143554688, + "rewards/rejected": 2.3795089721679688, + "step": 773 + }, + { + "epoch": 0.13, + "learning_rate": 9.971834779196238e-06, + "logits/chosen": -0.9626277685165405, + "logits/rejected": -0.9347630143165588, + "logps/chosen": -97.42230224609375, + "logps/rejected": -87.0807113647461, + "loss": 0.2495, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.118707299232483, + "rewards/margins": 0.4367508292198181, + "rewards/rejected": 0.6819564700126648, + "step": 774 + }, + { + "epoch": 0.13, + "learning_rate": 9.971695306710267e-06, + "logits/chosen": -0.3675936162471771, + "logits/rejected": -0.2716514766216278, + "logps/chosen": -51.53466796875, + "logps/rejected": -42.7199821472168, + "loss": 0.8584, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8103821277618408, + "rewards/margins": 0.5764843225479126, + "rewards/rejected": 1.2338978052139282, + "step": 775 + }, + { + "epoch": 0.13, + "learning_rate": 9.971555490727191e-06, + "logits/chosen": -1.074575424194336, + "logits/rejected": -0.8516793847084045, + "logps/chosen": -131.7430877685547, + "logps/rejected": -113.00077819824219, + "loss": 0.1304, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.720719814300537, + "rewards/margins": 1.4920485019683838, + "rewards/rejected": 3.2286713123321533, + "step": 776 + }, + { + "epoch": 0.13, + "learning_rate": 9.971415331256673e-06, + "logits/chosen": -0.39264604449272156, + "logits/rejected": -0.4058440029621124, + "logps/chosen": -82.56461334228516, + "logps/rejected": -100.67041015625, + "loss": 1.659, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4810523986816406, + "rewards/margins": 2.456127166748047, + "rewards/rejected": 1.0249252319335938, + "step": 777 + }, + { + "epoch": 0.13, + "learning_rate": 9.971274828308396e-06, + "logits/chosen": -0.6165314316749573, + "logits/rejected": -0.6165314316749573, + "logps/chosen": -66.97494506835938, + "logps/rejected": -66.97494506835938, + "loss": 0.3795, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.26601943373680115, + "rewards/margins": 0.0, + "rewards/rejected": 0.26601943373680115, + "step": 778 + }, + { + "epoch": 0.13, + "learning_rate": 9.971133981892065e-06, + "logits/chosen": -0.6411539316177368, + "logits/rejected": -0.6849163174629211, + "logps/chosen": -52.87580871582031, + "logps/rejected": -114.76968383789062, + "loss": 1.7628, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3973671197891235, + "rewards/margins": -3.48848819732666, + "rewards/rejected": 4.885855197906494, + "step": 779 + }, + { + "epoch": 0.13, + "learning_rate": 9.970992792017413e-06, + "logits/chosen": -0.7167688608169556, + "logits/rejected": -0.6449213027954102, + "logps/chosen": -78.53016662597656, + "logps/rejected": -53.62242126464844, + "loss": 0.4677, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.826342761516571, + "rewards/margins": -0.3910446763038635, + "rewards/rejected": 1.2173874378204346, + "step": 780 + }, + { + "epoch": 0.13, + "learning_rate": 9.970851258694198e-06, + "logits/chosen": -1.0351130962371826, + "logits/rejected": -0.9118568897247314, + "logps/chosen": -94.45150756835938, + "logps/rejected": -22.35158920288086, + "loss": 0.3038, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.793694972991943, + "rewards/margins": 5.598422527313232, + "rewards/rejected": 0.19527225196361542, + "step": 781 + }, + { + "epoch": 0.13, + "learning_rate": 9.970709381932193e-06, + "logits/chosen": -1.0120006799697876, + "logits/rejected": -1.0883475542068481, + "logps/chosen": -264.22589111328125, + "logps/rejected": -79.28524780273438, + "loss": 0.3032, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.073568820953369, + "rewards/margins": 3.226318359375, + "rewards/rejected": 2.847250461578369, + "step": 782 + }, + { + "epoch": 0.13, + "learning_rate": 9.970567161741204e-06, + "logits/chosen": -0.7255720496177673, + "logits/rejected": -0.6922754049301147, + "logps/chosen": -180.78131103515625, + "logps/rejected": -97.31100463867188, + "loss": 0.646, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.4408416748046875, + "rewards/margins": -0.8972716331481934, + "rewards/rejected": 5.338113307952881, + "step": 783 + }, + { + "epoch": 0.13, + "learning_rate": 9.970424598131056e-06, + "logits/chosen": -0.8775354623794556, + "logits/rejected": -0.9428940415382385, + "logps/chosen": -105.85780334472656, + "logps/rejected": -301.455078125, + "loss": 0.9886, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5154389142990112, + "rewards/margins": 1.1984772682189941, + "rewards/rejected": 0.3169616758823395, + "step": 784 + }, + { + "epoch": 0.13, + "learning_rate": 9.970281691111598e-06, + "logits/chosen": -0.5950841903686523, + "logits/rejected": -0.5950841903686523, + "logps/chosen": -103.02847290039062, + "logps/rejected": -103.02847290039062, + "loss": 0.7171, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9780441522598267, + "rewards/margins": 0.0, + "rewards/rejected": 1.9780441522598267, + "step": 785 + }, + { + "epoch": 0.13, + "learning_rate": 9.970138440692706e-06, + "logits/chosen": -0.6145147085189819, + "logits/rejected": -0.4754642844200134, + "logps/chosen": -58.220558166503906, + "logps/rejected": -99.05358123779297, + "loss": 0.9036, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.471303701400757, + "rewards/margins": -0.09644603729248047, + "rewards/rejected": 2.5677497386932373, + "step": 786 + }, + { + "epoch": 0.13, + "learning_rate": 9.969994846884274e-06, + "logits/chosen": -0.5990195870399475, + "logits/rejected": -0.5993683934211731, + "logps/chosen": -70.73823547363281, + "logps/rejected": -44.76927185058594, + "loss": 0.483, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.083827257156372, + "rewards/margins": -0.34385788440704346, + "rewards/rejected": 1.4276851415634155, + "step": 787 + }, + { + "epoch": 0.13, + "learning_rate": 9.969850909696225e-06, + "logits/chosen": -0.5432019829750061, + "logits/rejected": -0.5271276831626892, + "logps/chosen": -23.007436752319336, + "logps/rejected": -4.950711250305176, + "loss": 1.2481, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.05531749874353409, + "rewards/margins": -0.1566048264503479, + "rewards/rejected": 0.2119223177433014, + "step": 788 + }, + { + "epoch": 0.13, + "learning_rate": 9.969706629138504e-06, + "logits/chosen": -0.6482958197593689, + "logits/rejected": -0.7182241082191467, + "logps/chosen": -50.929779052734375, + "logps/rejected": -90.01316833496094, + "loss": 1.3225, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8949050903320312, + "rewards/margins": -2.116629123687744, + "rewards/rejected": 4.011534214019775, + "step": 789 + }, + { + "epoch": 0.13, + "learning_rate": 9.96956200522108e-06, + "logits/chosen": -0.6221119165420532, + "logits/rejected": -0.7127993702888489, + "logps/chosen": -79.39128112792969, + "logps/rejected": -101.17176055908203, + "loss": 1.3572, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9952422976493835, + "rewards/margins": -1.4899132251739502, + "rewards/rejected": 2.4851555824279785, + "step": 790 + }, + { + "epoch": 0.13, + "learning_rate": 9.969417037953942e-06, + "logits/chosen": -0.8954205513000488, + "logits/rejected": -0.9869446754455566, + "logps/chosen": -129.93470764160156, + "logps/rejected": -158.93734741210938, + "loss": 2.0845, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3802597522735596, + "rewards/margins": -4.091468811035156, + "rewards/rejected": 5.471728801727295, + "step": 791 + }, + { + "epoch": 0.13, + "learning_rate": 9.969271727347107e-06, + "logits/chosen": -0.7131254076957703, + "logits/rejected": -0.8279193043708801, + "logps/chosen": -226.70132446289062, + "logps/rejected": -65.21820068359375, + "loss": 0.4397, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3709168434143066, + "rewards/margins": -0.34090352058410645, + "rewards/rejected": 3.711820363998413, + "step": 792 + }, + { + "epoch": 0.13, + "learning_rate": 9.969126073410618e-06, + "logits/chosen": -1.0324817895889282, + "logits/rejected": -0.8575195074081421, + "logps/chosen": -128.83868408203125, + "logps/rejected": -65.94709014892578, + "loss": 0.0715, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.404040813446045, + "rewards/margins": 2.538865804672241, + "rewards/rejected": 2.8651750087738037, + "step": 793 + }, + { + "epoch": 0.13, + "learning_rate": 9.968980076154533e-06, + "logits/chosen": -0.8110722303390503, + "logits/rejected": -0.7057043313980103, + "logps/chosen": -85.26331329345703, + "logps/rejected": -37.06777572631836, + "loss": 0.3042, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0150543451309204, + "rewards/margins": 1.0734065771102905, + "rewards/rejected": -0.058352280408144, + "step": 794 + }, + { + "epoch": 0.13, + "learning_rate": 9.968833735588943e-06, + "logits/chosen": -1.0163277387619019, + "logits/rejected": -0.8826862573623657, + "logps/chosen": -96.05807495117188, + "logps/rejected": -22.06345558166504, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.909027099609375, + "rewards/margins": 5.562383651733398, + "rewards/rejected": 0.34664344787597656, + "step": 795 + }, + { + "epoch": 0.13, + "learning_rate": 9.968687051723958e-06, + "logits/chosen": -0.4647754728794098, + "logits/rejected": -0.4261152446269989, + "logps/chosen": -31.309627532958984, + "logps/rejected": -84.06110382080078, + "loss": 0.8638, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8205150365829468, + "rewards/margins": -0.5273679494857788, + "rewards/rejected": 2.3478829860687256, + "step": 796 + }, + { + "epoch": 0.13, + "learning_rate": 9.96854002456971e-06, + "logits/chosen": -0.7942611575126648, + "logits/rejected": -0.7545064687728882, + "logps/chosen": -95.14471435546875, + "logps/rejected": -90.38497161865234, + "loss": 0.4201, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5928688049316406, + "rewards/margins": 0.36849522590637207, + "rewards/rejected": 2.2243735790252686, + "step": 797 + }, + { + "epoch": 0.13, + "learning_rate": 9.968392654136361e-06, + "logits/chosen": -1.0992114543914795, + "logits/rejected": -0.874001681804657, + "logps/chosen": -124.51718139648438, + "logps/rejected": -152.4547576904297, + "loss": 0.4037, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.696954250335693, + "rewards/margins": 3.386420965194702, + "rewards/rejected": 2.310533285140991, + "step": 798 + }, + { + "epoch": 0.13, + "learning_rate": 9.96824494043409e-06, + "logits/chosen": -1.0222059488296509, + "logits/rejected": -1.046176791191101, + "logps/chosen": -170.411865234375, + "logps/rejected": -131.1260986328125, + "loss": 0.3235, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.165935039520264, + "rewards/margins": 0.17305946350097656, + "rewards/rejected": 5.992875576019287, + "step": 799 + }, + { + "epoch": 0.13, + "learning_rate": 9.968096883473104e-06, + "logits/chosen": -0.818676769733429, + "logits/rejected": -1.0629231929779053, + "logps/chosen": -54.72864532470703, + "logps/rejected": -33.74652099609375, + "loss": 0.4985, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4079430103302, + "rewards/margins": 0.5415439605712891, + "rewards/rejected": 1.8663990497589111, + "step": 800 + }, + { + "epoch": 0.13, + "learning_rate": 9.967948483263631e-06, + "logits/chosen": -0.7774196267127991, + "logits/rejected": -0.6108137369155884, + "logps/chosen": -80.90684509277344, + "logps/rejected": -58.868167877197266, + "loss": 0.3532, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9707550406455994, + "rewards/margins": 0.7959918975830078, + "rewards/rejected": 0.17476311326026917, + "step": 801 + }, + { + "epoch": 0.13, + "learning_rate": 9.967799739815925e-06, + "logits/chosen": -0.6439557075500488, + "logits/rejected": -0.6608946919441223, + "logps/chosen": -69.0820083618164, + "logps/rejected": -61.57183837890625, + "loss": 0.7632, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9369430541992188, + "rewards/margins": -1.2247207164764404, + "rewards/rejected": 2.161663770675659, + "step": 802 + }, + { + "epoch": 0.13, + "learning_rate": 9.967650653140264e-06, + "logits/chosen": -0.5420755743980408, + "logits/rejected": -0.5187241435050964, + "logps/chosen": -161.85147094726562, + "logps/rejected": -120.42912292480469, + "loss": 0.6785, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.079092502593994, + "rewards/margins": 2.1343369483947754, + "rewards/rejected": 0.9447555541992188, + "step": 803 + }, + { + "epoch": 0.13, + "learning_rate": 9.967501223246946e-06, + "logits/chosen": -0.5978953242301941, + "logits/rejected": -0.5713873505592346, + "logps/chosen": -68.08973693847656, + "logps/rejected": -104.41279602050781, + "loss": 0.3625, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5291519165039062, + "rewards/margins": 0.04696810245513916, + "rewards/rejected": 1.482183814048767, + "step": 804 + }, + { + "epoch": 0.13, + "learning_rate": 9.967351450146296e-06, + "logits/chosen": -0.36014413833618164, + "logits/rejected": -0.34646227955818176, + "logps/chosen": -83.37818908691406, + "logps/rejected": -46.619712829589844, + "loss": 0.3885, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3267059326171875, + "rewards/margins": 0.02783811092376709, + "rewards/rejected": 1.2988678216934204, + "step": 805 + }, + { + "epoch": 0.13, + "learning_rate": 9.967201333848664e-06, + "logits/chosen": -0.4029107093811035, + "logits/rejected": -0.4776621162891388, + "logps/chosen": -61.05686950683594, + "logps/rejected": -42.87542724609375, + "loss": 0.65, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0076080560684204, + "rewards/margins": -0.8983886241912842, + "rewards/rejected": 1.9059966802597046, + "step": 806 + }, + { + "epoch": 0.13, + "learning_rate": 9.967050874364418e-06, + "logits/chosen": -0.9138996005058289, + "logits/rejected": -0.8308408260345459, + "logps/chosen": -102.12508392333984, + "logps/rejected": -56.0352783203125, + "loss": 0.0809, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.101655006408691, + "rewards/margins": 3.478893756866455, + "rewards/rejected": 0.6227611899375916, + "step": 807 + }, + { + "epoch": 0.13, + "learning_rate": 9.966900071703957e-06, + "logits/chosen": -0.34294798970222473, + "logits/rejected": -0.3429379463195801, + "logps/chosen": -0.6460649371147156, + "logps/rejected": -30.493106842041016, + "loss": 0.5305, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3277274966239929, + "rewards/margins": 0.2578786611557007, + "rewards/rejected": 0.06984882801771164, + "step": 808 + }, + { + "epoch": 0.13, + "learning_rate": 9.966748925877698e-06, + "logits/chosen": -0.2789071798324585, + "logits/rejected": -0.2849973440170288, + "logps/chosen": -8.961060523986816, + "logps/rejected": -6.524529933929443, + "loss": 0.747, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.11702489852905273, + "rewards/margins": -0.1957971602678299, + "rewards/rejected": 0.07877226173877716, + "step": 809 + }, + { + "epoch": 0.13, + "learning_rate": 9.966597436896085e-06, + "logits/chosen": -1.1979141235351562, + "logits/rejected": -1.243281602859497, + "logps/chosen": -38.992881774902344, + "logps/rejected": -147.27468872070312, + "loss": 1.3541, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8018802404403687, + "rewards/margins": -2.637777328491211, + "rewards/rejected": 4.439657688140869, + "step": 810 + }, + { + "epoch": 0.13, + "learning_rate": 9.966445604769581e-06, + "logits/chosen": -0.8043023347854614, + "logits/rejected": -0.6802719831466675, + "logps/chosen": -58.29941940307617, + "logps/rejected": -16.585535049438477, + "loss": 0.0986, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5222721099853516, + "rewards/margins": 2.24397611618042, + "rewards/rejected": 0.2782960832118988, + "step": 811 + }, + { + "epoch": 0.13, + "learning_rate": 9.96629342950868e-06, + "logits/chosen": -0.5489221811294556, + "logits/rejected": -0.548527181148529, + "logps/chosen": -49.52958679199219, + "logps/rejected": -54.085975646972656, + "loss": 1.8648, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2942352294921875, + "rewards/margins": 0.2718474864959717, + "rewards/rejected": 1.0223877429962158, + "step": 812 + }, + { + "epoch": 0.13, + "learning_rate": 9.966140911123894e-06, + "logits/chosen": -1.2001445293426514, + "logits/rejected": -1.1553045511245728, + "logps/chosen": -113.11138916015625, + "logps/rejected": -93.42486572265625, + "loss": 1.2165, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5736328363418579, + "rewards/margins": -1.9924567937850952, + "rewards/rejected": 2.566089630126953, + "step": 813 + }, + { + "epoch": 0.13, + "learning_rate": 9.965988049625763e-06, + "logits/chosen": -0.5525190234184265, + "logits/rejected": -0.5244165658950806, + "logps/chosen": -73.65011596679688, + "logps/rejected": -77.1724853515625, + "loss": 1.0381, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5987838506698608, + "rewards/margins": 0.3196532726287842, + "rewards/rejected": 1.2791305780410767, + "step": 814 + }, + { + "epoch": 0.13, + "learning_rate": 9.965834845024844e-06, + "logits/chosen": -0.3797621428966522, + "logits/rejected": -0.35854461789131165, + "logps/chosen": -70.70515441894531, + "logps/rejected": -41.583553314208984, + "loss": 0.7924, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.985589623451233, + "rewards/margins": 0.4606907367706299, + "rewards/rejected": 1.524898886680603, + "step": 815 + }, + { + "epoch": 0.13, + "learning_rate": 9.965681297331726e-06, + "logits/chosen": -0.6548789143562317, + "logits/rejected": -0.6585872769355774, + "logps/chosen": -56.438087463378906, + "logps/rejected": -34.47900390625, + "loss": 1.9776, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7247841358184814, + "rewards/margins": 0.48959851264953613, + "rewards/rejected": 1.2351856231689453, + "step": 816 + }, + { + "epoch": 0.13, + "learning_rate": 9.965527406557014e-06, + "logits/chosen": -0.28892260789871216, + "logits/rejected": -0.32653293013572693, + "logps/chosen": -108.88966369628906, + "logps/rejected": -79.98117065429688, + "loss": 0.6653, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2841026782989502, + "rewards/margins": 0.11472547054290771, + "rewards/rejected": 1.1693772077560425, + "step": 817 + }, + { + "epoch": 0.13, + "learning_rate": 9.965373172711343e-06, + "logits/chosen": -0.6956498622894287, + "logits/rejected": -0.6376895904541016, + "logps/chosen": -56.42533874511719, + "logps/rejected": -72.95999145507812, + "loss": 0.3984, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.359708547592163, + "rewards/margins": -0.11994171142578125, + "rewards/rejected": 2.4796502590179443, + "step": 818 + }, + { + "epoch": 0.13, + "learning_rate": 9.96521859580537e-06, + "logits/chosen": -0.8669096827507019, + "logits/rejected": -0.8085384964942932, + "logps/chosen": -130.49392700195312, + "logps/rejected": -127.71807861328125, + "loss": 0.5378, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.699533224105835, + "rewards/margins": 2.443417549133301, + "rewards/rejected": 1.2561157941818237, + "step": 819 + }, + { + "epoch": 0.13, + "learning_rate": 9.965063675849773e-06, + "logits/chosen": -0.7510656118392944, + "logits/rejected": -0.7516551613807678, + "logps/chosen": -70.32103729248047, + "logps/rejected": -40.753929138183594, + "loss": 0.5101, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.781805396080017, + "rewards/margins": -0.48661959171295166, + "rewards/rejected": 2.2684249877929688, + "step": 820 + }, + { + "epoch": 0.13, + "learning_rate": 9.964908412855256e-06, + "logits/chosen": -0.847352921962738, + "logits/rejected": -0.8651353716850281, + "logps/chosen": -68.3536605834961, + "logps/rejected": -90.67646789550781, + "loss": 1.3518, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.585752248764038, + "rewards/margins": -2.2336418628692627, + "rewards/rejected": 4.819394111633301, + "step": 821 + }, + { + "epoch": 0.13, + "learning_rate": 9.964752806832545e-06, + "logits/chosen": -0.5339243412017822, + "logits/rejected": -0.4639873802661896, + "logps/chosen": -50.84976577758789, + "logps/rejected": -79.72331237792969, + "loss": 1.1559, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.961141586303711, + "rewards/margins": 0.6212283372879028, + "rewards/rejected": 1.339913249015808, + "step": 822 + }, + { + "epoch": 0.13, + "learning_rate": 9.964596857792392e-06, + "logits/chosen": -0.8144024014472961, + "logits/rejected": -0.8633828163146973, + "logps/chosen": -87.77348327636719, + "logps/rejected": -71.18363952636719, + "loss": 1.8845, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3017241954803467, + "rewards/margins": -0.3501420021057129, + "rewards/rejected": 2.6518661975860596, + "step": 823 + }, + { + "epoch": 0.13, + "learning_rate": 9.964440565745574e-06, + "logits/chosen": -1.078415870666504, + "logits/rejected": -1.0093873739242554, + "logps/chosen": -34.81879425048828, + "logps/rejected": -38.79279327392578, + "loss": 0.678, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9962440729141235, + "rewards/margins": -0.9319640398025513, + "rewards/rejected": 2.928208112716675, + "step": 824 + }, + { + "epoch": 0.13, + "learning_rate": 9.964283930702884e-06, + "logits/chosen": -1.2351871728897095, + "logits/rejected": -1.2056275606155396, + "logps/chosen": -97.76533508300781, + "logps/rejected": -65.99532318115234, + "loss": 0.1864, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.173886299133301, + "rewards/margins": 0.8467867374420166, + "rewards/rejected": 3.327099561691284, + "step": 825 + }, + { + "epoch": 0.13, + "learning_rate": 9.964126952675148e-06, + "logits/chosen": -1.1093857288360596, + "logits/rejected": -0.945509672164917, + "logps/chosen": -125.12197875976562, + "logps/rejected": -86.18827819824219, + "loss": 0.0611, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.704814434051514, + "rewards/margins": 2.8571977615356445, + "rewards/rejected": 1.8476165533065796, + "step": 826 + }, + { + "epoch": 0.13, + "learning_rate": 9.963969631673211e-06, + "logits/chosen": -0.7503805160522461, + "logits/rejected": -0.6900191307067871, + "logps/chosen": -124.9650650024414, + "logps/rejected": -203.818359375, + "loss": 0.5494, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.412545204162598, + "rewards/margins": -0.661008358001709, + "rewards/rejected": 7.073553562164307, + "step": 827 + }, + { + "epoch": 0.13, + "learning_rate": 9.963811967707942e-06, + "logits/chosen": -0.8934618830680847, + "logits/rejected": -0.9335923194885254, + "logps/chosen": -157.01943969726562, + "logps/rejected": -78.62699890136719, + "loss": 0.1292, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.50956130027771, + "rewards/margins": 1.8893067836761475, + "rewards/rejected": 1.6202545166015625, + "step": 828 + }, + { + "epoch": 0.13, + "learning_rate": 9.963653960790233e-06, + "logits/chosen": -0.8036782145500183, + "logits/rejected": -0.8222174644470215, + "logps/chosen": -74.63716125488281, + "logps/rejected": -50.81040954589844, + "loss": 0.5256, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6952400207519531, + "rewards/margins": -0.21567916870117188, + "rewards/rejected": 1.910919189453125, + "step": 829 + }, + { + "epoch": 0.13, + "learning_rate": 9.963495610931002e-06, + "logits/chosen": -1.0659695863723755, + "logits/rejected": -0.9739896059036255, + "logps/chosen": -136.5816650390625, + "logps/rejected": -41.65625762939453, + "loss": 0.2915, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3954910039901733, + "rewards/margins": 1.2090251445770264, + "rewards/rejected": 0.18646584451198578, + "step": 830 + }, + { + "epoch": 0.13, + "learning_rate": 9.96333691814119e-06, + "logits/chosen": -0.9067656397819519, + "logits/rejected": -0.9239444136619568, + "logps/chosen": -96.82476043701172, + "logps/rejected": -79.28581237792969, + "loss": 0.9991, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5902595520019531, + "rewards/margins": -0.5658501386642456, + "rewards/rejected": 1.1561096906661987, + "step": 831 + }, + { + "epoch": 0.14, + "learning_rate": 9.96317788243176e-06, + "logits/chosen": -0.7080153822898865, + "logits/rejected": -0.74676513671875, + "logps/chosen": -70.809814453125, + "logps/rejected": -107.74513244628906, + "loss": 1.0582, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8236923217773438, + "rewards/margins": -1.9180970191955566, + "rewards/rejected": 3.7417893409729004, + "step": 832 + }, + { + "epoch": 0.14, + "learning_rate": 9.963018503813701e-06, + "logits/chosen": -1.0330488681793213, + "logits/rejected": -0.8570883870124817, + "logps/chosen": -143.6897430419922, + "logps/rejected": -164.29063415527344, + "loss": 0.1345, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.893003940582275, + "rewards/margins": 1.3562102317810059, + "rewards/rejected": 5.5367937088012695, + "step": 833 + }, + { + "epoch": 0.14, + "learning_rate": 9.962858782298023e-06, + "logits/chosen": -0.7120612263679504, + "logits/rejected": -0.6566850543022156, + "logps/chosen": -96.37162780761719, + "logps/rejected": -68.60818481445312, + "loss": 0.1218, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4088776111602783, + "rewards/margins": 1.8180961608886719, + "rewards/rejected": 1.5907814502716064, + "step": 834 + }, + { + "epoch": 0.14, + "learning_rate": 9.962698717895763e-06, + "logits/chosen": -0.6961269378662109, + "logits/rejected": -0.5904051661491394, + "logps/chosen": -244.08790588378906, + "logps/rejected": -57.78496170043945, + "loss": 0.0195, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.534797668457031, + "rewards/margins": 3.334838390350342, + "rewards/rejected": 1.1999591588974, + "step": 835 + }, + { + "epoch": 0.14, + "learning_rate": 9.962538310617978e-06, + "logits/chosen": -0.5563033819198608, + "logits/rejected": -0.5890169739723206, + "logps/chosen": -4.39316463470459, + "logps/rejected": -19.63733673095703, + "loss": 0.562, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.46594658493995667, + "rewards/margins": -0.2295459806919098, + "rewards/rejected": 0.6954925656318665, + "step": 836 + }, + { + "epoch": 0.14, + "learning_rate": 9.962377560475753e-06, + "logits/chosen": -0.5650064945220947, + "logits/rejected": -0.5650064945220947, + "logps/chosen": -63.35743713378906, + "logps/rejected": -63.35743713378906, + "loss": 0.4365, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1702423095703125, + "rewards/margins": 0.0, + "rewards/rejected": 1.1702423095703125, + "step": 837 + }, + { + "epoch": 0.14, + "learning_rate": 9.962216467480192e-06, + "logits/chosen": -0.9071982502937317, + "logits/rejected": -0.8868032097816467, + "logps/chosen": -154.41551208496094, + "logps/rejected": -73.87255859375, + "loss": 0.8503, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6985092163085938, + "rewards/margins": -1.140695333480835, + "rewards/rejected": 2.8392045497894287, + "step": 838 + }, + { + "epoch": 0.14, + "learning_rate": 9.962055031642426e-06, + "logits/chosen": -0.31314024329185486, + "logits/rejected": -0.253339022397995, + "logps/chosen": -56.337547302246094, + "logps/rejected": -39.21654510498047, + "loss": 0.8814, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9552696347236633, + "rewards/margins": -1.3714799880981445, + "rewards/rejected": 2.326749563217163, + "step": 839 + }, + { + "epoch": 0.14, + "learning_rate": 9.96189325297361e-06, + "logits/chosen": -0.7622537612915039, + "logits/rejected": -0.7488892078399658, + "logps/chosen": -121.53439331054688, + "logps/rejected": -141.6708221435547, + "loss": 0.7077, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.141351342201233, + "rewards/margins": 0.22339630126953125, + "rewards/rejected": 0.9179550409317017, + "step": 840 + }, + { + "epoch": 0.14, + "learning_rate": 9.961731131484919e-06, + "logits/chosen": -0.8999153971672058, + "logits/rejected": -0.8949922919273376, + "logps/chosen": -50.41140365600586, + "logps/rejected": -55.11891174316406, + "loss": 0.8552, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6200626492500305, + "rewards/margins": -1.492816686630249, + "rewards/rejected": 2.1128792762756348, + "step": 841 + }, + { + "epoch": 0.14, + "learning_rate": 9.961568667187556e-06, + "logits/chosen": -0.6672860980033875, + "logits/rejected": -0.6143062710762024, + "logps/chosen": -66.46762084960938, + "logps/rejected": -74.6962890625, + "loss": 1.2548, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.233287811279297, + "rewards/margins": 0.16717147827148438, + "rewards/rejected": 2.0661163330078125, + "step": 842 + }, + { + "epoch": 0.14, + "learning_rate": 9.961405860092743e-06, + "logits/chosen": -0.9080029129981995, + "logits/rejected": -0.8659986257553101, + "logps/chosen": -95.04833221435547, + "logps/rejected": -61.06764221191406, + "loss": 1.6645, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.287403106689453, + "rewards/margins": 1.464490532875061, + "rewards/rejected": 1.822912573814392, + "step": 843 + }, + { + "epoch": 0.14, + "learning_rate": 9.961242710211733e-06, + "logits/chosen": -1.2191822528839111, + "logits/rejected": -1.2355529069900513, + "logps/chosen": -85.02940368652344, + "logps/rejected": -80.37759399414062, + "loss": 0.6276, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7737915515899658, + "rewards/margins": -0.5799803733825684, + "rewards/rejected": 2.353771924972534, + "step": 844 + }, + { + "epoch": 0.14, + "learning_rate": 9.961079217555794e-06, + "logits/chosen": -0.6236562728881836, + "logits/rejected": -0.557502269744873, + "logps/chosen": -57.955501556396484, + "logps/rejected": -47.81119918823242, + "loss": 1.0105, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0313968658447266, + "rewards/margins": -0.22073900699615479, + "rewards/rejected": 1.2521358728408813, + "step": 845 + }, + { + "epoch": 0.14, + "learning_rate": 9.960915382136223e-06, + "logits/chosen": -0.6157532930374146, + "logits/rejected": -0.6137509942054749, + "logps/chosen": -103.65718841552734, + "logps/rejected": -7.054934024810791, + "loss": 1.0019, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.254241943359375, + "rewards/margins": -0.12133261561393738, + "rewards/rejected": 0.3755745589733124, + "step": 846 + }, + { + "epoch": 0.14, + "learning_rate": 9.96075120396434e-06, + "logits/chosen": -0.8040477633476257, + "logits/rejected": -0.5825665593147278, + "logps/chosen": -132.96487426757812, + "logps/rejected": -55.61254119873047, + "loss": 0.1278, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.509225368499756, + "rewards/margins": 4.088388442993164, + "rewards/rejected": 1.4208366870880127, + "step": 847 + }, + { + "epoch": 0.14, + "learning_rate": 9.960586683051488e-06, + "logits/chosen": -0.9189759492874146, + "logits/rejected": -0.8065856099128723, + "logps/chosen": -107.10844421386719, + "logps/rejected": -178.23971557617188, + "loss": 0.2312, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.167027473449707, + "rewards/margins": 0.5907425880432129, + "rewards/rejected": 4.576284885406494, + "step": 848 + }, + { + "epoch": 0.14, + "learning_rate": 9.960421819409034e-06, + "logits/chosen": -0.7930181622505188, + "logits/rejected": -0.8035860657691956, + "logps/chosen": -55.89019775390625, + "logps/rejected": -79.22804260253906, + "loss": 0.5122, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0963424444198608, + "rewards/margins": 0.045827388763427734, + "rewards/rejected": 1.050515055656433, + "step": 849 + }, + { + "epoch": 0.14, + "learning_rate": 9.960256613048367e-06, + "logits/chosen": -0.7085078358650208, + "logits/rejected": -0.5562200546264648, + "logps/chosen": -47.21018981933594, + "logps/rejected": -8.902487754821777, + "loss": 0.5597, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5196815729141235, + "rewards/margins": 0.8233516812324524, + "rewards/rejected": 0.6963298916816711, + "step": 850 + }, + { + "epoch": 0.14, + "learning_rate": 9.960091063980903e-06, + "logits/chosen": -0.5919296741485596, + "logits/rejected": -0.5914081335067749, + "logps/chosen": -2.123589515686035, + "logps/rejected": -2.7510063648223877, + "loss": 0.8515, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.32784605026245117, + "rewards/margins": -0.008284658193588257, + "rewards/rejected": 0.33613070845603943, + "step": 851 + }, + { + "epoch": 0.14, + "learning_rate": 9.959925172218081e-06, + "logits/chosen": -0.5863509178161621, + "logits/rejected": -0.37005460262298584, + "logps/chosen": -82.92234802246094, + "logps/rejected": -29.853933334350586, + "loss": 0.441, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5745391845703125, + "rewards/margins": 0.6262460947036743, + "rewards/rejected": -0.051706887781620026, + "step": 852 + }, + { + "epoch": 0.14, + "learning_rate": 9.959758937771358e-06, + "logits/chosen": -0.8867890238761902, + "logits/rejected": -0.8326206803321838, + "logps/chosen": -89.41848754882812, + "logps/rejected": -104.55224609375, + "loss": 0.12, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6795685291290283, + "rewards/margins": 1.3604798316955566, + "rewards/rejected": 0.31908875703811646, + "step": 853 + }, + { + "epoch": 0.14, + "learning_rate": 9.959592360652224e-06, + "logits/chosen": -0.9301928877830505, + "logits/rejected": -0.7643798589706421, + "logps/chosen": -104.00531005859375, + "logps/rejected": -31.197603225708008, + "loss": 0.1231, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3830947875976562, + "rewards/margins": 1.2819592952728271, + "rewards/rejected": 0.10113544762134552, + "step": 854 + }, + { + "epoch": 0.14, + "learning_rate": 9.959425440872185e-06, + "logits/chosen": -0.583276093006134, + "logits/rejected": -0.5810020565986633, + "logps/chosen": -87.15623474121094, + "logps/rejected": -126.50165557861328, + "loss": 0.9172, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.743524074554443, + "rewards/margins": -0.7609157562255859, + "rewards/rejected": 5.504439830780029, + "step": 855 + }, + { + "epoch": 0.14, + "learning_rate": 9.959258178442774e-06, + "logits/chosen": -0.5416884422302246, + "logits/rejected": -0.5462693572044373, + "logps/chosen": -2.7123544216156006, + "logps/rejected": -0.9318504929542542, + "loss": 0.4122, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.10645952075719833, + "rewards/margins": -0.12916803359985352, + "rewards/rejected": 0.23562756180763245, + "step": 856 + }, + { + "epoch": 0.14, + "learning_rate": 9.959090573375549e-06, + "logits/chosen": -0.6615266799926758, + "logits/rejected": -0.698823869228363, + "logps/chosen": -36.34294891357422, + "logps/rejected": -25.700267791748047, + "loss": 1.837, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5913597345352173, + "rewards/margins": -0.4044628143310547, + "rewards/rejected": 1.995822548866272, + "step": 857 + }, + { + "epoch": 0.14, + "learning_rate": 9.958922625682088e-06, + "logits/chosen": -0.4955286383628845, + "logits/rejected": -0.5136004090309143, + "logps/chosen": -193.806884765625, + "logps/rejected": -67.37837219238281, + "loss": 0.1568, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.009400844573975, + "rewards/margins": 1.710294246673584, + "rewards/rejected": 2.2991065979003906, + "step": 858 + }, + { + "epoch": 0.14, + "learning_rate": 9.958754335373998e-06, + "logits/chosen": -0.3897514343261719, + "logits/rejected": -0.39692580699920654, + "logps/chosen": -7.6616973876953125, + "logps/rejected": -11.262872695922852, + "loss": 0.9467, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6103169322013855, + "rewards/margins": -0.3486495018005371, + "rewards/rejected": 0.9589664340019226, + "step": 859 + }, + { + "epoch": 0.14, + "learning_rate": 9.9585857024629e-06, + "logits/chosen": -1.0768637657165527, + "logits/rejected": -0.8066959977149963, + "logps/chosen": -224.80609130859375, + "logps/rejected": -82.1808090209961, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.714111328125, + "rewards/margins": 3.9597465991973877, + "rewards/rejected": 3.7543647289276123, + "step": 860 + }, + { + "epoch": 0.14, + "learning_rate": 9.958416726960452e-06, + "logits/chosen": -1.0118465423583984, + "logits/rejected": -1.1343159675598145, + "logps/chosen": -195.80398559570312, + "logps/rejected": -109.18955993652344, + "loss": 0.8243, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.444858074188232, + "rewards/margins": -1.3997116088867188, + "rewards/rejected": 5.844569683074951, + "step": 861 + }, + { + "epoch": 0.14, + "learning_rate": 9.958247408878322e-06, + "logits/chosen": -0.8846782445907593, + "logits/rejected": -0.927800714969635, + "logps/chosen": -254.7894287109375, + "logps/rejected": -104.35159301757812, + "loss": 0.0416, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.349173069000244, + "rewards/margins": 3.301149845123291, + "rewards/rejected": 2.048023223876953, + "step": 862 + }, + { + "epoch": 0.14, + "learning_rate": 9.958077748228212e-06, + "logits/chosen": -0.7791095972061157, + "logits/rejected": -0.6775401830673218, + "logps/chosen": -136.39840698242188, + "logps/rejected": -68.9216537475586, + "loss": 1.1093, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.751887798309326, + "rewards/margins": 2.67018985748291, + "rewards/rejected": 2.081697940826416, + "step": 863 + }, + { + "epoch": 0.14, + "learning_rate": 9.957907745021845e-06, + "logits/chosen": -0.16826975345611572, + "logits/rejected": -0.09976411610841751, + "logps/chosen": -55.258705139160156, + "logps/rejected": -1.3737547397613525, + "loss": 0.4596, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3151352107524872, + "rewards/margins": -0.10111436247825623, + "rewards/rejected": 0.4162495732307434, + "step": 864 + }, + { + "epoch": 0.14, + "learning_rate": 9.957737399270964e-06, + "logits/chosen": -0.4860479235649109, + "logits/rejected": -0.6426355242729187, + "logps/chosen": -154.3215789794922, + "logps/rejected": -124.8685302734375, + "loss": 1.0751, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.18154764175415, + "rewards/margins": -0.4965543746948242, + "rewards/rejected": 5.678102016448975, + "step": 865 + }, + { + "epoch": 0.14, + "learning_rate": 9.957566710987338e-06, + "logits/chosen": -0.7797292470932007, + "logits/rejected": -0.800934374332428, + "logps/chosen": -65.35485076904297, + "logps/rejected": -117.0864028930664, + "loss": 0.3257, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.562427520751953, + "rewards/margins": 1.7080093622207642, + "rewards/rejected": 1.854418158531189, + "step": 866 + }, + { + "epoch": 0.14, + "learning_rate": 9.957395680182763e-06, + "logits/chosen": -0.6156048774719238, + "logits/rejected": -0.5417841672897339, + "logps/chosen": -61.24073028564453, + "logps/rejected": -58.26276779174805, + "loss": 0.221, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0547897815704346, + "rewards/margins": 0.8316265344619751, + "rewards/rejected": 1.2231632471084595, + "step": 867 + }, + { + "epoch": 0.14, + "learning_rate": 9.957224306869053e-06, + "logits/chosen": -0.35973772406578064, + "logits/rejected": -0.35973772406578064, + "logps/chosen": -94.89783477783203, + "logps/rejected": -94.89783477783203, + "loss": 0.3841, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.445899248123169, + "rewards/margins": 0.0, + "rewards/rejected": 1.445899248123169, + "step": 868 + }, + { + "epoch": 0.14, + "learning_rate": 9.95705259105805e-06, + "logits/chosen": -0.5655567646026611, + "logits/rejected": -0.49678748846054077, + "logps/chosen": -35.95808410644531, + "logps/rejected": -29.35657501220703, + "loss": 0.155, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.250126600265503, + "rewards/margins": 1.1739585399627686, + "rewards/rejected": 1.0761680603027344, + "step": 869 + }, + { + "epoch": 0.14, + "learning_rate": 9.956880532761614e-06, + "logits/chosen": -0.8642375469207764, + "logits/rejected": -0.7170228362083435, + "logps/chosen": -78.94806671142578, + "logps/rejected": -23.96567726135254, + "loss": 1.5155, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7571457028388977, + "rewards/margins": 0.5547367334365845, + "rewards/rejected": 0.20240898430347443, + "step": 870 + }, + { + "epoch": 0.14, + "learning_rate": 9.956708131991639e-06, + "logits/chosen": -0.27765730023384094, + "logits/rejected": -0.14511941373348236, + "logps/chosen": -35.24100112915039, + "logps/rejected": -5.716228008270264, + "loss": 0.2619, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1609982252120972, + "rewards/margins": 0.6576058268547058, + "rewards/rejected": 0.5033923983573914, + "step": 871 + }, + { + "epoch": 0.14, + "learning_rate": 9.956535388760031e-06, + "logits/chosen": -0.9974261522293091, + "logits/rejected": -0.8377315402030945, + "logps/chosen": -144.61758422851562, + "logps/rejected": -83.04499816894531, + "loss": 0.3979, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8440918922424316, + "rewards/margins": 0.6938445568084717, + "rewards/rejected": 3.15024733543396, + "step": 872 + }, + { + "epoch": 0.14, + "learning_rate": 9.956362303078729e-06, + "logits/chosen": -0.832694411277771, + "logits/rejected": -0.9094406962394714, + "logps/chosen": -127.05677032470703, + "logps/rejected": -90.37149047851562, + "loss": 1.7377, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.801746368408203, + "rewards/margins": -1.3975701332092285, + "rewards/rejected": 4.199316501617432, + "step": 873 + }, + { + "epoch": 0.14, + "learning_rate": 9.956188874959686e-06, + "logits/chosen": -0.8741188645362854, + "logits/rejected": -0.7462032437324524, + "logps/chosen": -136.1294708251953, + "logps/rejected": -61.25395965576172, + "loss": 0.3233, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1454102993011475, + "rewards/margins": 0.19679951667785645, + "rewards/rejected": 2.948610782623291, + "step": 874 + }, + { + "epoch": 0.14, + "learning_rate": 9.956015104414892e-06, + "logits/chosen": -0.22160489857196808, + "logits/rejected": -0.24909912049770355, + "logps/chosen": -3.871452808380127, + "logps/rejected": -27.316556930541992, + "loss": 0.5878, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34963956475257874, + "rewards/margins": 0.03769803047180176, + "rewards/rejected": 0.311941534280777, + "step": 875 + }, + { + "epoch": 0.14, + "learning_rate": 9.955840991456346e-06, + "logits/chosen": -0.4595761001110077, + "logits/rejected": -0.40932992100715637, + "logps/chosen": -39.17061996459961, + "logps/rejected": -24.07264518737793, + "loss": 2.5783, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3751003742218018, + "rewards/margins": -0.739372730255127, + "rewards/rejected": 2.1144731044769287, + "step": 876 + }, + { + "epoch": 0.14, + "learning_rate": 9.95566653609608e-06, + "logits/chosen": -0.7661556601524353, + "logits/rejected": -0.6541758179664612, + "logps/chosen": -122.10482025146484, + "logps/rejected": -43.5731201171875, + "loss": 0.1661, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8054558038711548, + "rewards/margins": 1.7436760663986206, + "rewards/rejected": 0.06177978590130806, + "step": 877 + }, + { + "epoch": 0.14, + "learning_rate": 9.955491738346149e-06, + "logits/chosen": -0.414723664522171, + "logits/rejected": -0.38509848713874817, + "logps/chosen": -87.2088623046875, + "logps/rejected": -72.3746337890625, + "loss": 0.8081, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3961594104766846, + "rewards/margins": 0.28307199478149414, + "rewards/rejected": 2.1130874156951904, + "step": 878 + }, + { + "epoch": 0.14, + "learning_rate": 9.955316598218625e-06, + "logits/chosen": -1.039377212524414, + "logits/rejected": -0.9439689517021179, + "logps/chosen": -150.94595336914062, + "logps/rejected": -183.63162231445312, + "loss": 0.2961, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.756430149078369, + "rewards/margins": 0.2430267333984375, + "rewards/rejected": 5.513403415679932, + "step": 879 + }, + { + "epoch": 0.14, + "learning_rate": 9.955141115725613e-06, + "logits/chosen": -0.8580850958824158, + "logits/rejected": -0.8805297017097473, + "logps/chosen": -103.67552185058594, + "logps/rejected": -83.93937683105469, + "loss": 1.0921, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.16656188666820526, + "rewards/margins": -2.016253709793091, + "rewards/rejected": 2.1828155517578125, + "step": 880 + }, + { + "epoch": 0.14, + "learning_rate": 9.954965290879237e-06, + "logits/chosen": -0.6152099370956421, + "logits/rejected": -0.6331115365028381, + "logps/chosen": -47.686248779296875, + "logps/rejected": -102.932861328125, + "loss": 0.4431, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8587784171104431, + "rewards/margins": 1.5277862548828125, + "rewards/rejected": -0.6690078973770142, + "step": 881 + }, + { + "epoch": 0.14, + "learning_rate": 9.954789123691643e-06, + "logits/chosen": -0.8925887942314148, + "logits/rejected": -0.7747175097465515, + "logps/chosen": -101.14376068115234, + "logps/rejected": -110.9441909790039, + "loss": 0.1824, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.105149269104004, + "rewards/margins": 0.9305100440979004, + "rewards/rejected": 4.1746392250061035, + "step": 882 + }, + { + "epoch": 0.14, + "learning_rate": 9.954612614175004e-06, + "logits/chosen": -0.5189657807350159, + "logits/rejected": -0.5189657807350159, + "logps/chosen": -23.988636016845703, + "logps/rejected": -23.988636016845703, + "loss": 0.3477, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6362335681915283, + "rewards/margins": 0.0, + "rewards/rejected": 1.6362335681915283, + "step": 883 + }, + { + "epoch": 0.14, + "learning_rate": 9.954435762341513e-06, + "logits/chosen": -0.6909632682800293, + "logits/rejected": -0.6874885559082031, + "logps/chosen": -84.03840637207031, + "logps/rejected": -82.29304504394531, + "loss": 1.6461, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.200053408741951, + "rewards/margins": -2.465118408203125, + "rewards/rejected": 2.2650649547576904, + "step": 884 + }, + { + "epoch": 0.14, + "learning_rate": 9.95425856820339e-06, + "logits/chosen": -0.8388775587081909, + "logits/rejected": -0.8377255797386169, + "logps/chosen": -32.59942626953125, + "logps/rejected": -73.87833404541016, + "loss": 0.3895, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2196273803710938, + "rewards/margins": 0.12416458129882812, + "rewards/rejected": 2.0954627990722656, + "step": 885 + }, + { + "epoch": 0.14, + "learning_rate": 9.954081031772878e-06, + "logits/chosen": -0.7982849478721619, + "logits/rejected": -0.7794848680496216, + "logps/chosen": -60.24786376953125, + "logps/rejected": -57.35966873168945, + "loss": 0.4846, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6961601972579956, + "rewards/margins": 0.5769100189208984, + "rewards/rejected": 1.1192501783370972, + "step": 886 + }, + { + "epoch": 0.14, + "learning_rate": 9.953903153062243e-06, + "logits/chosen": -0.691796600818634, + "logits/rejected": -0.6969780921936035, + "logps/chosen": -85.14141082763672, + "logps/rejected": -125.50424194335938, + "loss": 0.9742, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6179161071777344, + "rewards/margins": 1.167701005935669, + "rewards/rejected": 0.4502151608467102, + "step": 887 + }, + { + "epoch": 0.14, + "learning_rate": 9.953724932083774e-06, + "logits/chosen": -1.1231589317321777, + "logits/rejected": -1.0005567073822021, + "logps/chosen": -67.43274688720703, + "logps/rejected": -31.893526077270508, + "loss": 0.1632, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6179924011230469, + "rewards/margins": 1.5338220596313477, + "rewards/rejected": 0.08417034149169922, + "step": 888 + }, + { + "epoch": 0.14, + "learning_rate": 9.953546368849787e-06, + "logits/chosen": -0.6705487966537476, + "logits/rejected": -0.7570803761482239, + "logps/chosen": -88.5545425415039, + "logps/rejected": -182.2411651611328, + "loss": 2.7341, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5992698669433594, + "rewards/margins": -3.847506046295166, + "rewards/rejected": 5.446775913238525, + "step": 889 + }, + { + "epoch": 0.14, + "learning_rate": 9.953367463372615e-06, + "logits/chosen": -0.8605883717536926, + "logits/rejected": -0.7988565564155579, + "logps/chosen": -58.936641693115234, + "logps/rejected": -88.01897430419922, + "loss": 0.8727, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7698261737823486, + "rewards/margins": -0.10361063480377197, + "rewards/rejected": 1.8734368085861206, + "step": 890 + }, + { + "epoch": 0.14, + "learning_rate": 9.95318821566462e-06, + "logits/chosen": -0.9198125004768372, + "logits/rejected": -0.754386305809021, + "logps/chosen": -62.738121032714844, + "logps/rejected": -18.93805503845215, + "loss": 0.6522, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6824440956115723, + "rewards/margins": 3.3096556663513184, + "rewards/rejected": 0.3727884292602539, + "step": 891 + }, + { + "epoch": 0.14, + "learning_rate": 9.953008625738186e-06, + "logits/chosen": -0.9761381149291992, + "logits/rejected": -0.8520554304122925, + "logps/chosen": -114.71075439453125, + "logps/rejected": -62.53143310546875, + "loss": 0.1914, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.996575832366943, + "rewards/margins": 3.639224052429199, + "rewards/rejected": 2.357351779937744, + "step": 892 + }, + { + "epoch": 0.14, + "learning_rate": 9.952828693605723e-06, + "logits/chosen": -0.6781055331230164, + "logits/rejected": -0.6798438429832458, + "logps/chosen": -57.661434173583984, + "logps/rejected": -79.46864318847656, + "loss": 0.1953, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.842362642288208, + "rewards/margins": 0.9719234704971313, + "rewards/rejected": 0.8704391717910767, + "step": 893 + }, + { + "epoch": 0.15, + "learning_rate": 9.952648419279662e-06, + "logits/chosen": -0.5441323518753052, + "logits/rejected": -0.42731866240501404, + "logps/chosen": -49.76336669921875, + "logps/rejected": -6.125643730163574, + "loss": 1.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7611786127090454, + "rewards/margins": 1.056830644607544, + "rewards/rejected": 0.7043480277061462, + "step": 894 + }, + { + "epoch": 0.15, + "learning_rate": 9.952467802772457e-06, + "logits/chosen": -0.7299266457557678, + "logits/rejected": -0.7324961423873901, + "logps/chosen": -92.64158630371094, + "logps/rejected": -114.99368286132812, + "loss": 0.3696, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7391495108604431, + "rewards/margins": -0.07592082023620605, + "rewards/rejected": 0.8150703310966492, + "step": 895 + }, + { + "epoch": 0.15, + "learning_rate": 9.952286844096589e-06, + "logits/chosen": -1.0134602785110474, + "logits/rejected": -1.2219548225402832, + "logps/chosen": -83.44332885742188, + "logps/rejected": -34.77271270751953, + "loss": 0.1716, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0883712768554688, + "rewards/margins": 1.764013648033142, + "rewards/rejected": 0.3243575990200043, + "step": 896 + }, + { + "epoch": 0.15, + "learning_rate": 9.952105543264557e-06, + "logits/chosen": -0.35143351554870605, + "logits/rejected": -0.27773913741111755, + "logps/chosen": -45.42818832397461, + "logps/rejected": -25.669391632080078, + "loss": 0.4057, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3571858406066895, + "rewards/margins": 1.3473610877990723, + "rewards/rejected": 1.0098247528076172, + "step": 897 + }, + { + "epoch": 0.15, + "learning_rate": 9.951923900288888e-06, + "logits/chosen": -1.221051573753357, + "logits/rejected": -1.1748794317245483, + "logps/chosen": -57.41825485229492, + "logps/rejected": -68.28103637695312, + "loss": 0.5639, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8635785579681396, + "rewards/margins": -0.7086887359619141, + "rewards/rejected": 3.5722672939300537, + "step": 898 + }, + { + "epoch": 0.15, + "learning_rate": 9.951741915182135e-06, + "logits/chosen": -0.7705074548721313, + "logits/rejected": -0.7144997119903564, + "logps/chosen": -73.07305908203125, + "logps/rejected": -56.798095703125, + "loss": 0.7116, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3126319646835327, + "rewards/margins": -0.07233667373657227, + "rewards/rejected": 1.384968638420105, + "step": 899 + }, + { + "epoch": 0.15, + "learning_rate": 9.95155958795687e-06, + "logits/chosen": -0.5728573203086853, + "logits/rejected": -0.5682424902915955, + "logps/chosen": -3.1557023525238037, + "logps/rejected": -33.56087875366211, + "loss": 0.2528, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5220678448677063, + "rewards/margins": 0.6752306222915649, + "rewards/rejected": -0.15316276252269745, + "step": 900 + }, + { + "epoch": 0.15, + "learning_rate": 9.951376918625688e-06, + "logits/chosen": -0.6071822643280029, + "logits/rejected": -0.6333919763565063, + "logps/chosen": -92.24015808105469, + "logps/rejected": -124.9256591796875, + "loss": 0.5389, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8011444211006165, + "rewards/margins": 0.09569394588470459, + "rewards/rejected": 0.7054504752159119, + "step": 901 + }, + { + "epoch": 0.15, + "learning_rate": 9.951193907201212e-06, + "logits/chosen": -0.4804830551147461, + "logits/rejected": -0.2226371169090271, + "logps/chosen": -101.95085906982422, + "logps/rejected": -59.46814727783203, + "loss": 0.083, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.714295387268066, + "rewards/margins": 3.1721863746643066, + "rewards/rejected": 1.5421088933944702, + "step": 902 + }, + { + "epoch": 0.15, + "learning_rate": 9.951010553696085e-06, + "logits/chosen": -0.44812285900115967, + "logits/rejected": -0.2901563346385956, + "logps/chosen": -89.66095733642578, + "logps/rejected": -63.415069580078125, + "loss": 0.9851, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3724617958068848, + "rewards/margins": -0.22213363647460938, + "rewards/rejected": 2.594595432281494, + "step": 903 + }, + { + "epoch": 0.15, + "learning_rate": 9.950826858122978e-06, + "logits/chosen": -0.8362154364585876, + "logits/rejected": -0.8209632039070129, + "logps/chosen": -75.08353424072266, + "logps/rejected": -81.62242126464844, + "loss": 2.1653, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1922523975372314, + "rewards/margins": -0.7926521301269531, + "rewards/rejected": 1.9849045276641846, + "step": 904 + }, + { + "epoch": 0.15, + "learning_rate": 9.950642820494577e-06, + "logits/chosen": -0.22824342548847198, + "logits/rejected": -0.3915039300918579, + "logps/chosen": -59.39702606201172, + "logps/rejected": -124.1939697265625, + "loss": 2.6094, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6290168762207031, + "rewards/margins": -3.7966394424438477, + "rewards/rejected": 4.425656318664551, + "step": 905 + }, + { + "epoch": 0.15, + "learning_rate": 9.950458440823602e-06, + "logits/chosen": -0.6838247179985046, + "logits/rejected": -0.7023335695266724, + "logps/chosen": -62.574180603027344, + "logps/rejected": -94.59139251708984, + "loss": 1.9758, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3613647520542145, + "rewards/margins": -0.7280189990997314, + "rewards/rejected": 1.0893837213516235, + "step": 906 + }, + { + "epoch": 0.15, + "learning_rate": 9.950273719122791e-06, + "logits/chosen": -0.6651269197463989, + "logits/rejected": -0.6888731718063354, + "logps/chosen": -80.97157287597656, + "logps/rejected": -70.58873748779297, + "loss": 0.5702, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0440444946289062, + "rewards/margins": -0.32943809032440186, + "rewards/rejected": 1.373482584953308, + "step": 907 + }, + { + "epoch": 0.15, + "learning_rate": 9.950088655404906e-06, + "logits/chosen": -0.8373301029205322, + "logits/rejected": -0.8670288324356079, + "logps/chosen": -109.94056701660156, + "logps/rejected": -149.96836853027344, + "loss": 0.4463, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.3583879470825195, + "rewards/margins": 0.16134357452392578, + "rewards/rejected": 6.197044372558594, + "step": 908 + }, + { + "epoch": 0.15, + "learning_rate": 9.949903249682734e-06, + "logits/chosen": -0.8999089002609253, + "logits/rejected": -1.0316251516342163, + "logps/chosen": -44.05694580078125, + "logps/rejected": -104.75090789794922, + "loss": 2.2661, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.710186719894409, + "rewards/margins": -3.6647636890411377, + "rewards/rejected": 6.374950408935547, + "step": 909 + }, + { + "epoch": 0.15, + "learning_rate": 9.94971750196908e-06, + "logits/chosen": -0.2612563967704773, + "logits/rejected": -0.27156680822372437, + "logps/chosen": -1.9838281869888306, + "logps/rejected": -29.217985153198242, + "loss": 0.8364, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2432764619588852, + "rewards/margins": -0.10631747543811798, + "rewards/rejected": 0.3495939373970032, + "step": 910 + }, + { + "epoch": 0.15, + "learning_rate": 9.949531412276785e-06, + "logits/chosen": -0.35551321506500244, + "logits/rejected": -0.3315727710723877, + "logps/chosen": -50.35868453979492, + "logps/rejected": -16.378938674926758, + "loss": 0.4365, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2767395079135895, + "rewards/margins": 0.07504729926586151, + "rewards/rejected": 0.20169220864772797, + "step": 911 + }, + { + "epoch": 0.15, + "learning_rate": 9.9493449806187e-06, + "logits/chosen": -0.8055996298789978, + "logits/rejected": -0.8112307190895081, + "logps/chosen": -65.09141540527344, + "logps/rejected": -44.92212677001953, + "loss": 0.8443, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2597648799419403, + "rewards/margins": -1.2206916809082031, + "rewards/rejected": 1.4804565906524658, + "step": 912 + }, + { + "epoch": 0.15, + "learning_rate": 9.94915820700771e-06, + "logits/chosen": -0.5863357782363892, + "logits/rejected": -0.6460315585136414, + "logps/chosen": -115.72071075439453, + "logps/rejected": -66.06383514404297, + "loss": 0.0621, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0683541297912598, + "rewards/margins": 2.209538459777832, + "rewards/rejected": 0.8588157892227173, + "step": 913 + }, + { + "epoch": 0.15, + "learning_rate": 9.948971091456715e-06, + "logits/chosen": -0.4062586724758148, + "logits/rejected": -0.4062586724758148, + "logps/chosen": -75.04315185546875, + "logps/rejected": -75.04315185546875, + "loss": 0.3475, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3471955060958862, + "rewards/margins": 0.0, + "rewards/rejected": 1.3471955060958862, + "step": 914 + }, + { + "epoch": 0.15, + "learning_rate": 9.948783633978649e-06, + "logits/chosen": -0.5320987701416016, + "logits/rejected": -0.5663317441940308, + "logps/chosen": -43.68982696533203, + "logps/rejected": -34.49142074584961, + "loss": 0.6599, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.1515251249074936, + "rewards/margins": -0.8720138669013977, + "rewards/rejected": 0.7204887270927429, + "step": 915 + }, + { + "epoch": 0.15, + "learning_rate": 9.948595834586457e-06, + "logits/chosen": -1.0045485496520996, + "logits/rejected": -1.0029152631759644, + "logps/chosen": -50.800315856933594, + "logps/rejected": -65.90595245361328, + "loss": 0.1121, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.554227590560913, + "rewards/margins": 1.51548171043396, + "rewards/rejected": 1.0387458801269531, + "step": 916 + }, + { + "epoch": 0.15, + "learning_rate": 9.948407693293117e-06, + "logits/chosen": -0.2922779321670532, + "logits/rejected": -0.2781275808811188, + "logps/chosen": -8.65463924407959, + "logps/rejected": -4.334199905395508, + "loss": 0.4654, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.10566568374633789, + "rewards/margins": -0.17308512330055237, + "rewards/rejected": 0.27875080704689026, + "step": 917 + }, + { + "epoch": 0.15, + "learning_rate": 9.948219210111628e-06, + "logits/chosen": -1.2122466564178467, + "logits/rejected": -1.245984673500061, + "logps/chosen": -99.93675994873047, + "logps/rejected": -154.40286254882812, + "loss": 1.1381, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.82673716545105, + "rewards/margins": -1.3531825542449951, + "rewards/rejected": 4.179919719696045, + "step": 918 + }, + { + "epoch": 0.15, + "learning_rate": 9.94803038505501e-06, + "logits/chosen": -0.5832720994949341, + "logits/rejected": -0.5676288604736328, + "logps/chosen": -99.70449829101562, + "logps/rejected": -88.4804458618164, + "loss": 0.9004, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8332260847091675, + "rewards/margins": -1.5520652532577515, + "rewards/rejected": 3.385291337966919, + "step": 919 + }, + { + "epoch": 0.15, + "learning_rate": 9.947841218136314e-06, + "logits/chosen": -1.081568956375122, + "logits/rejected": -1.0550868511199951, + "logps/chosen": -33.03103256225586, + "logps/rejected": -22.46576690673828, + "loss": 0.1248, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6318798065185547, + "rewards/margins": 1.4283885955810547, + "rewards/rejected": 0.2034912109375, + "step": 920 + }, + { + "epoch": 0.15, + "learning_rate": 9.947651709368605e-06, + "logits/chosen": -1.2402117252349854, + "logits/rejected": -1.2806035280227661, + "logps/chosen": -113.83362579345703, + "logps/rejected": -77.7782974243164, + "loss": 0.2935, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.178718090057373, + "rewards/margins": 1.0925538539886475, + "rewards/rejected": 3.0861642360687256, + "step": 921 + }, + { + "epoch": 0.15, + "learning_rate": 9.947461858764978e-06, + "logits/chosen": -0.7045000195503235, + "logits/rejected": -0.7009838819503784, + "logps/chosen": -44.818355560302734, + "logps/rejected": -205.5766143798828, + "loss": 2.3798, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6061413288116455, + "rewards/margins": -4.1595001220703125, + "rewards/rejected": 5.765641689300537, + "step": 922 + }, + { + "epoch": 0.15, + "learning_rate": 9.947271666338552e-06, + "logits/chosen": -0.6165085434913635, + "logits/rejected": -0.5600213408470154, + "logps/chosen": -36.7490234375, + "logps/rejected": -54.8492317199707, + "loss": 0.651, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.157342553138733, + "rewards/margins": -0.6624233722686768, + "rewards/rejected": 1.8197659254074097, + "step": 923 + }, + { + "epoch": 0.15, + "learning_rate": 9.947081132102464e-06, + "logits/chosen": -0.5535211563110352, + "logits/rejected": -0.5081825256347656, + "logps/chosen": -40.84818649291992, + "logps/rejected": -49.362525939941406, + "loss": 1.2138, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.285379409790039, + "rewards/margins": -1.2835559844970703, + "rewards/rejected": 2.5689353942871094, + "step": 924 + }, + { + "epoch": 0.15, + "learning_rate": 9.946890256069878e-06, + "logits/chosen": -0.4368779957294464, + "logits/rejected": -0.376263827085495, + "logps/chosen": -92.18989562988281, + "logps/rejected": -71.6100082397461, + "loss": 0.9427, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1280534267425537, + "rewards/margins": -1.6619176864624023, + "rewards/rejected": 3.789971113204956, + "step": 925 + }, + { + "epoch": 0.15, + "learning_rate": 9.946699038253985e-06, + "logits/chosen": -0.7852778434753418, + "logits/rejected": -0.7928486466407776, + "logps/chosen": -58.7232666015625, + "logps/rejected": -57.105228424072266, + "loss": 1.5878, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.150458574295044, + "rewards/margins": -0.11402702331542969, + "rewards/rejected": 2.2644855976104736, + "step": 926 + }, + { + "epoch": 0.15, + "learning_rate": 9.946507478667995e-06, + "logits/chosen": -0.6984836459159851, + "logits/rejected": -0.712509036064148, + "logps/chosen": -303.41680908203125, + "logps/rejected": -90.14981842041016, + "loss": 0.6751, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.266821384429932, + "rewards/margins": 0.6976051330566406, + "rewards/rejected": 3.569216251373291, + "step": 927 + }, + { + "epoch": 0.15, + "learning_rate": 9.94631557732514e-06, + "logits/chosen": -0.8176780343055725, + "logits/rejected": -0.8498349189758301, + "logps/chosen": -59.91166687011719, + "logps/rejected": -100.17063903808594, + "loss": 0.7987, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.247456431388855, + "rewards/margins": 1.1151490211486816, + "rewards/rejected": 0.13230744004249573, + "step": 928 + }, + { + "epoch": 0.15, + "learning_rate": 9.946123334238685e-06, + "logits/chosen": -0.799324095249176, + "logits/rejected": -1.1326276063919067, + "logps/chosen": -88.81111907958984, + "logps/rejected": -35.88409423828125, + "loss": 0.6654, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8922394514083862, + "rewards/margins": 1.6710751056671143, + "rewards/rejected": 0.22116433084011078, + "step": 929 + }, + { + "epoch": 0.15, + "learning_rate": 9.945930749421903e-06, + "logits/chosen": -0.7544516921043396, + "logits/rejected": -0.5812107920646667, + "logps/chosen": -138.54440307617188, + "logps/rejected": -53.44469451904297, + "loss": 0.0317, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.832316875457764, + "rewards/margins": 3.708155393600464, + "rewards/rejected": 2.1241614818573, + "step": 930 + }, + { + "epoch": 0.15, + "learning_rate": 9.94573782288811e-06, + "logits/chosen": -0.476868212223053, + "logits/rejected": -0.5503398180007935, + "logps/chosen": -69.02261352539062, + "logps/rejected": -49.08555603027344, + "loss": 0.3116, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5207947492599487, + "rewards/margins": 0.15279853343963623, + "rewards/rejected": 1.3679962158203125, + "step": 931 + }, + { + "epoch": 0.15, + "learning_rate": 9.945544554650628e-06, + "logits/chosen": -1.1643648147583008, + "logits/rejected": -1.1458293199539185, + "logps/chosen": -84.39503479003906, + "logps/rejected": -126.79402923583984, + "loss": 0.7023, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.057567596435547, + "rewards/margins": 0.6577835083007812, + "rewards/rejected": 1.3997840881347656, + "step": 932 + }, + { + "epoch": 0.15, + "learning_rate": 9.945350944722813e-06, + "logits/chosen": -0.7352979183197021, + "logits/rejected": -0.7352979183197021, + "logps/chosen": -79.90742492675781, + "logps/rejected": -79.90742492675781, + "loss": 0.4013, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5677552223205566, + "rewards/margins": 0.0, + "rewards/rejected": 2.5677552223205566, + "step": 933 + }, + { + "epoch": 0.15, + "learning_rate": 9.945156993118042e-06, + "logits/chosen": -0.9183648824691772, + "logits/rejected": -0.8773717880249023, + "logps/chosen": -122.60325622558594, + "logps/rejected": -57.52958679199219, + "loss": 0.2657, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.023693799972534, + "rewards/margins": 1.2941398620605469, + "rewards/rejected": 0.7295539975166321, + "step": 934 + }, + { + "epoch": 0.15, + "learning_rate": 9.944962699849712e-06, + "logits/chosen": -0.8794714212417603, + "logits/rejected": -0.8322320580482483, + "logps/chosen": -73.0479736328125, + "logps/rejected": -72.81939697265625, + "loss": 0.1033, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0019195079803467, + "rewards/margins": 1.8919425010681152, + "rewards/rejected": 1.1099770069122314, + "step": 935 + }, + { + "epoch": 0.15, + "learning_rate": 9.944768064931251e-06, + "logits/chosen": -0.5056772828102112, + "logits/rejected": -0.4821241497993469, + "logps/chosen": -105.76371765136719, + "logps/rejected": -64.04057312011719, + "loss": 0.6188, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5070816278457642, + "rewards/margins": -0.16239625215530396, + "rewards/rejected": 0.6694778800010681, + "step": 936 + }, + { + "epoch": 0.15, + "learning_rate": 9.944573088376103e-06, + "logits/chosen": -1.0288366079330444, + "logits/rejected": -0.8835861086845398, + "logps/chosen": -133.68438720703125, + "logps/rejected": -78.23672485351562, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.993059158325195, + "rewards/margins": 5.096907615661621, + "rewards/rejected": 3.8961517810821533, + "step": 937 + }, + { + "epoch": 0.15, + "learning_rate": 9.944377770197741e-06, + "logits/chosen": -0.9333157539367676, + "logits/rejected": -0.9884878396987915, + "logps/chosen": -103.79246520996094, + "logps/rejected": -195.28936767578125, + "loss": 0.3589, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.759246826171875, + "rewards/margins": -0.01460576057434082, + "rewards/rejected": 1.7738525867462158, + "step": 938 + }, + { + "epoch": 0.15, + "learning_rate": 9.944182110409662e-06, + "logits/chosen": -0.7476300597190857, + "logits/rejected": -0.7046464681625366, + "logps/chosen": -95.88417053222656, + "logps/rejected": -48.43010711669922, + "loss": 0.3863, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.868432641029358, + "rewards/margins": -0.058475494384765625, + "rewards/rejected": 1.9269081354141235, + "step": 939 + }, + { + "epoch": 0.15, + "learning_rate": 9.943986109025378e-06, + "logits/chosen": -0.609706699848175, + "logits/rejected": -0.5817316770553589, + "logps/chosen": -72.0810546875, + "logps/rejected": -67.69532012939453, + "loss": 0.2422, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.072835683822632, + "rewards/margins": 0.8760422468185425, + "rewards/rejected": 1.1967934370040894, + "step": 940 + }, + { + "epoch": 0.15, + "learning_rate": 9.943789766058437e-06, + "logits/chosen": -0.7087042331695557, + "logits/rejected": -0.4338686764240265, + "logps/chosen": -89.98566436767578, + "logps/rejected": -73.61123657226562, + "loss": 0.0652, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.338440895080566, + "rewards/margins": 1.998638391494751, + "rewards/rejected": 3.3398025035858154, + "step": 941 + }, + { + "epoch": 0.15, + "learning_rate": 9.943593081522398e-06, + "logits/chosen": -0.7397804856300354, + "logits/rejected": -0.7340706586837769, + "logps/chosen": -59.300079345703125, + "logps/rejected": -70.11972045898438, + "loss": 1.1358, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7586395144462585, + "rewards/margins": 0.13013911247253418, + "rewards/rejected": 0.6285004019737244, + "step": 942 + }, + { + "epoch": 0.15, + "learning_rate": 9.943396055430857e-06, + "logits/chosen": -0.7538631558418274, + "logits/rejected": -0.7538631558418274, + "logps/chosen": -47.621238708496094, + "logps/rejected": -47.621238708496094, + "loss": 0.5231, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2355759143829346, + "rewards/margins": 0.0, + "rewards/rejected": 1.2355759143829346, + "step": 943 + }, + { + "epoch": 0.15, + "learning_rate": 9.943198687797422e-06, + "logits/chosen": -0.34382662177085876, + "logits/rejected": -0.3476409912109375, + "logps/chosen": -1.5481884479522705, + "logps/rejected": -1.8906859159469604, + "loss": 0.3443, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24077926576137543, + "rewards/margins": 0.03842511773109436, + "rewards/rejected": 0.20235414803028107, + "step": 944 + }, + { + "epoch": 0.15, + "learning_rate": 9.94300097863573e-06, + "logits/chosen": -1.3814747333526611, + "logits/rejected": -1.359487533569336, + "logps/chosen": -86.41969299316406, + "logps/rejected": -41.65219497680664, + "loss": 0.1484, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5588477849960327, + "rewards/margins": 1.553622841835022, + "rewards/rejected": 0.005224991124123335, + "step": 945 + }, + { + "epoch": 0.15, + "learning_rate": 9.942802927959444e-06, + "logits/chosen": -0.4610329866409302, + "logits/rejected": -0.5065093040466309, + "logps/chosen": -7.692859649658203, + "logps/rejected": -29.27581024169922, + "loss": 2.2186, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2757481634616852, + "rewards/margins": -0.5054236650466919, + "rewards/rejected": 0.7811717987060547, + "step": 946 + }, + { + "epoch": 0.15, + "learning_rate": 9.942604535782244e-06, + "logits/chosen": -0.9079068899154663, + "logits/rejected": -0.8782613277435303, + "logps/chosen": -91.90472412109375, + "logps/rejected": -79.22450256347656, + "loss": 1.2527, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.017359972000122, + "rewards/margins": -2.406010389328003, + "rewards/rejected": 3.423370361328125, + "step": 947 + }, + { + "epoch": 0.15, + "learning_rate": 9.942405802117836e-06, + "logits/chosen": -0.9676279425621033, + "logits/rejected": -0.950051486492157, + "logps/chosen": -78.18791961669922, + "logps/rejected": -54.444557189941406, + "loss": 0.5598, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8302360773086548, + "rewards/margins": -0.4019958972930908, + "rewards/rejected": 1.2322319746017456, + "step": 948 + }, + { + "epoch": 0.15, + "learning_rate": 9.942206726979955e-06, + "logits/chosen": -0.6981898546218872, + "logits/rejected": -0.7849306464195251, + "logps/chosen": -72.26882934570312, + "logps/rejected": -121.75981140136719, + "loss": 1.4603, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6585747003555298, + "rewards/margins": -2.637545108795166, + "rewards/rejected": 4.296119689941406, + "step": 949 + }, + { + "epoch": 0.15, + "learning_rate": 9.942007310382352e-06, + "logits/chosen": -0.6679187417030334, + "logits/rejected": -0.5247595906257629, + "logps/chosen": -97.13971710205078, + "logps/rejected": -51.46876525878906, + "loss": 0.4945, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.739705801010132, + "rewards/margins": 1.273078203201294, + "rewards/rejected": 2.466627597808838, + "step": 950 + }, + { + "epoch": 0.15, + "learning_rate": 9.941807552338805e-06, + "logits/chosen": -0.5936371684074402, + "logits/rejected": -0.6017687916755676, + "logps/chosen": -82.12828826904297, + "logps/rejected": -135.99383544921875, + "loss": 0.8805, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9367050528526306, + "rewards/margins": -0.6465774178504944, + "rewards/rejected": 1.583282470703125, + "step": 951 + }, + { + "epoch": 0.15, + "learning_rate": 9.941607452863115e-06, + "logits/chosen": -1.0570675134658813, + "logits/rejected": -1.1137992143630981, + "logps/chosen": -45.24913787841797, + "logps/rejected": -73.90044403076172, + "loss": 1.9113, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.180021047592163, + "rewards/margins": -0.34545421600341797, + "rewards/rejected": 2.525475263595581, + "step": 952 + }, + { + "epoch": 0.15, + "learning_rate": 9.94140701196911e-06, + "logits/chosen": -0.5653223395347595, + "logits/rejected": -0.5693771839141846, + "logps/chosen": -5.311827182769775, + "logps/rejected": -24.092130661010742, + "loss": 0.4778, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29036745429039, + "rewards/margins": 0.3035258650779724, + "rewards/rejected": -0.01315841730684042, + "step": 953 + }, + { + "epoch": 0.15, + "learning_rate": 9.941206229670634e-06, + "logits/chosen": -0.7102869749069214, + "logits/rejected": -0.5024687051773071, + "logps/chosen": -143.22337341308594, + "logps/rejected": -49.37446594238281, + "loss": 0.6268, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.536326885223389, + "rewards/margins": 3.815751075744629, + "rewards/rejected": 1.7205756902694702, + "step": 954 + }, + { + "epoch": 0.16, + "learning_rate": 9.941005105981564e-06, + "logits/chosen": -1.0644912719726562, + "logits/rejected": -0.9473379850387573, + "logps/chosen": -181.74212646484375, + "logps/rejected": -18.689804077148438, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.211949348449707, + "rewards/margins": 5.966632843017578, + "rewards/rejected": 0.24531669914722443, + "step": 955 + }, + { + "epoch": 0.16, + "learning_rate": 9.940803640915792e-06, + "logits/chosen": -0.9551603198051453, + "logits/rejected": -0.8647509813308716, + "logps/chosen": -102.4731674194336, + "logps/rejected": -68.33489990234375, + "loss": 0.0587, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.9047980308532715, + "rewards/margins": 2.192748785018921, + "rewards/rejected": 2.7120492458343506, + "step": 956 + }, + { + "epoch": 0.16, + "learning_rate": 9.94060183448724e-06, + "logits/chosen": -0.8240150809288025, + "logits/rejected": -0.8430594205856323, + "logps/chosen": -98.55577087402344, + "logps/rejected": -117.58019256591797, + "loss": 0.6896, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3529595136642456, + "rewards/margins": -0.3507354259490967, + "rewards/rejected": 1.7036949396133423, + "step": 957 + }, + { + "epoch": 0.16, + "learning_rate": 9.940399686709849e-06, + "logits/chosen": -1.1274219751358032, + "logits/rejected": -0.7869605422019958, + "logps/chosen": -122.18221282958984, + "logps/rejected": -20.19772720336914, + "loss": 0.026, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.5310492515563965, + "rewards/margins": 6.188379287719727, + "rewards/rejected": 0.34266987442970276, + "step": 958 + }, + { + "epoch": 0.16, + "learning_rate": 9.940197197597588e-06, + "logits/chosen": -0.7436199188232422, + "logits/rejected": -0.6292299628257751, + "logps/chosen": -220.6072235107422, + "logps/rejected": -103.72830963134766, + "loss": 0.1137, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.762431621551514, + "rewards/margins": 2.047353506088257, + "rewards/rejected": 2.715078115463257, + "step": 959 + }, + { + "epoch": 0.16, + "learning_rate": 9.939994367164442e-06, + "logits/chosen": -0.6424039602279663, + "logits/rejected": -0.5472148060798645, + "logps/chosen": -82.79949951171875, + "logps/rejected": -31.704919815063477, + "loss": 2.2606, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3555939197540283, + "rewards/margins": 1.1957098245620728, + "rewards/rejected": 0.15988408029079437, + "step": 960 + }, + { + "epoch": 0.16, + "learning_rate": 9.939791195424431e-06, + "logits/chosen": -0.3152448534965515, + "logits/rejected": -0.3000408709049225, + "logps/chosen": -81.05413818359375, + "logps/rejected": -49.35972213745117, + "loss": 1.1299, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0753204822540283, + "rewards/margins": 0.8236881494522095, + "rewards/rejected": 1.2516323328018188, + "step": 961 + }, + { + "epoch": 0.16, + "learning_rate": 9.939587682391587e-06, + "logits/chosen": -0.7470812201499939, + "logits/rejected": -0.6764734387397766, + "logps/chosen": -64.93123626708984, + "logps/rejected": -71.28284454345703, + "loss": 0.5516, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6331337690353394, + "rewards/margins": 0.1921067237854004, + "rewards/rejected": 1.441027045249939, + "step": 962 + }, + { + "epoch": 0.16, + "learning_rate": 9.939383828079974e-06, + "logits/chosen": -0.7897855043411255, + "logits/rejected": -0.8182560205459595, + "logps/chosen": -45.32937240600586, + "logps/rejected": -77.66384887695312, + "loss": 0.863, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1942867040634155, + "rewards/margins": -1.3061755895614624, + "rewards/rejected": 2.500462293624878, + "step": 963 + }, + { + "epoch": 0.16, + "learning_rate": 9.939179632503673e-06, + "logits/chosen": -1.1252574920654297, + "logits/rejected": -1.0733520984649658, + "logps/chosen": -99.37043762207031, + "logps/rejected": -195.72738647460938, + "loss": 0.1633, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.35662841796875, + "rewards/margins": 1.0311322212219238, + "rewards/rejected": 5.325496196746826, + "step": 964 + }, + { + "epoch": 0.16, + "learning_rate": 9.938975095676797e-06, + "logits/chosen": -1.4295588731765747, + "logits/rejected": -1.3319590091705322, + "logps/chosen": -110.697021484375, + "logps/rejected": -97.69945526123047, + "loss": 0.0566, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.940245151519775, + "rewards/margins": 2.913245439529419, + "rewards/rejected": 2.0269997119903564, + "step": 965 + }, + { + "epoch": 0.16, + "learning_rate": 9.938770217613474e-06, + "logits/chosen": -0.3314513564109802, + "logits/rejected": -0.3243207037448883, + "logps/chosen": -52.49781799316406, + "logps/rejected": -70.79743957519531, + "loss": 1.444, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9171470403671265, + "rewards/margins": -1.976575493812561, + "rewards/rejected": 3.8937225341796875, + "step": 966 + }, + { + "epoch": 0.16, + "learning_rate": 9.93856499832786e-06, + "logits/chosen": -1.1727160215377808, + "logits/rejected": -1.1258678436279297, + "logps/chosen": -104.14421081542969, + "logps/rejected": -29.629074096679688, + "loss": 2.0625, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2167648077011108, + "rewards/margins": 1.1042910814285278, + "rewards/rejected": 0.11247368156909943, + "step": 967 + }, + { + "epoch": 0.16, + "learning_rate": 9.938359437834135e-06, + "logits/chosen": -0.5474592447280884, + "logits/rejected": -0.52689129114151, + "logps/chosen": -65.36206817626953, + "logps/rejected": -88.59962463378906, + "loss": 0.2389, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.703986406326294, + "rewards/margins": 0.546587347984314, + "rewards/rejected": 1.15739905834198, + "step": 968 + }, + { + "epoch": 0.16, + "learning_rate": 9.938153536146498e-06, + "logits/chosen": -0.49581044912338257, + "logits/rejected": -0.5145812630653381, + "logps/chosen": -58.815345764160156, + "logps/rejected": -82.11369323730469, + "loss": 0.6941, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.207207441329956, + "rewards/margins": 0.5681174993515015, + "rewards/rejected": 1.6390899419784546, + "step": 969 + }, + { + "epoch": 0.16, + "learning_rate": 9.937947293279178e-06, + "logits/chosen": -1.0401650667190552, + "logits/rejected": -0.8767862915992737, + "logps/chosen": -82.32864379882812, + "logps/rejected": -70.17554473876953, + "loss": 1.0339, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6991868019104004, + "rewards/margins": 0.9905067682266235, + "rewards/rejected": 1.7086800336837769, + "step": 970 + }, + { + "epoch": 0.16, + "learning_rate": 9.937740709246422e-06, + "logits/chosen": -0.899537205696106, + "logits/rejected": -0.7839072942733765, + "logps/chosen": -44.31623077392578, + "logps/rejected": -39.2714958190918, + "loss": 0.3956, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7639961242675781, + "rewards/margins": 1.441514253616333, + "rewards/rejected": 0.3224819302558899, + "step": 971 + }, + { + "epoch": 0.16, + "learning_rate": 9.937533784062507e-06, + "logits/chosen": -0.8849065899848938, + "logits/rejected": -0.9014638066291809, + "logps/chosen": -45.767398834228516, + "logps/rejected": -39.236244201660156, + "loss": 0.3717, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3957184553146362, + "rewards/margins": 0.26120615005493164, + "rewards/rejected": 1.1345123052597046, + "step": 972 + }, + { + "epoch": 0.16, + "learning_rate": 9.937326517741725e-06, + "logits/chosen": -0.3574150800704956, + "logits/rejected": -0.2731892466545105, + "logps/chosen": -67.73986053466797, + "logps/rejected": -47.73283386230469, + "loss": 0.8454, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2128472328186035, + "rewards/margins": 0.2374955415725708, + "rewards/rejected": 1.9753516912460327, + "step": 973 + }, + { + "epoch": 0.16, + "learning_rate": 9.937118910298398e-06, + "logits/chosen": -0.9771447777748108, + "logits/rejected": -0.801884651184082, + "logps/chosen": -98.79182434082031, + "logps/rejected": -59.12971878051758, + "loss": 0.2459, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.041555881500244, + "rewards/margins": 2.180636405944824, + "rewards/rejected": -0.13908043503761292, + "step": 974 + }, + { + "epoch": 0.16, + "learning_rate": 9.93691096174687e-06, + "logits/chosen": -1.1952260732650757, + "logits/rejected": -1.1596204042434692, + "logps/chosen": -100.94534301757812, + "logps/rejected": -60.38788986206055, + "loss": 1.5541, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2151840925216675, + "rewards/margins": -1.4896091222763062, + "rewards/rejected": 2.7047932147979736, + "step": 975 + }, + { + "epoch": 0.16, + "learning_rate": 9.936702672101509e-06, + "logits/chosen": -1.1708883047103882, + "logits/rejected": -1.120470643043518, + "logps/chosen": -78.8829345703125, + "logps/rejected": -76.34330749511719, + "loss": 2.5053, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2302109003067017, + "rewards/margins": -1.4983383417129517, + "rewards/rejected": 2.7285492420196533, + "step": 976 + }, + { + "epoch": 0.16, + "learning_rate": 9.936494041376703e-06, + "logits/chosen": -0.8778335452079773, + "logits/rejected": -0.7762823700904846, + "logps/chosen": -62.64419174194336, + "logps/rejected": -35.285125732421875, + "loss": 0.2009, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.182640552520752, + "rewards/margins": 1.8967350721359253, + "rewards/rejected": 0.2859054505825043, + "step": 977 + }, + { + "epoch": 0.16, + "learning_rate": 9.93628506958687e-06, + "logits/chosen": -1.0305081605911255, + "logits/rejected": -0.9334591627120972, + "logps/chosen": -84.91035461425781, + "logps/rejected": -60.74156951904297, + "loss": 2.4338, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.959820568561554, + "rewards/margins": -1.4649429321289062, + "rewards/rejected": 2.4247634410858154, + "step": 978 + }, + { + "epoch": 0.16, + "learning_rate": 9.936075756746445e-06, + "logits/chosen": -0.7267990708351135, + "logits/rejected": -0.6726067066192627, + "logps/chosen": -90.09129333496094, + "logps/rejected": -51.84901428222656, + "loss": 0.788, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7187470197677612, + "rewards/margins": -0.7907112836837769, + "rewards/rejected": 2.509458303451538, + "step": 979 + }, + { + "epoch": 0.16, + "learning_rate": 9.935866102869891e-06, + "logits/chosen": -0.41669759154319763, + "logits/rejected": -0.42870062589645386, + "logps/chosen": -87.10832977294922, + "logps/rejected": -72.74089050292969, + "loss": 1.0132, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9755111932754517, + "rewards/margins": 0.09103620052337646, + "rewards/rejected": 1.8844749927520752, + "step": 980 + }, + { + "epoch": 0.16, + "learning_rate": 9.935656107971695e-06, + "logits/chosen": -0.5043390989303589, + "logits/rejected": -0.5331488251686096, + "logps/chosen": -81.21327209472656, + "logps/rejected": -98.57707977294922, + "loss": 2.2633, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9401489496231079, + "rewards/margins": -0.9452759027481079, + "rewards/rejected": 1.8854248523712158, + "step": 981 + }, + { + "epoch": 0.16, + "learning_rate": 9.935445772066362e-06, + "logits/chosen": -0.7789239287376404, + "logits/rejected": -0.8117008209228516, + "logps/chosen": -56.22503662109375, + "logps/rejected": -45.355079650878906, + "loss": 0.8356, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3616844415664673, + "rewards/margins": -0.3417949676513672, + "rewards/rejected": 1.7034794092178345, + "step": 982 + }, + { + "epoch": 0.16, + "learning_rate": 9.935235095168424e-06, + "logits/chosen": -0.8700138330459595, + "logits/rejected": -0.6768751740455627, + "logps/chosen": -94.95710754394531, + "logps/rejected": -10.217500686645508, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.519484043121338, + "rewards/margins": 5.802703857421875, + "rewards/rejected": 0.7167803049087524, + "step": 983 + }, + { + "epoch": 0.16, + "learning_rate": 9.93502407729244e-06, + "logits/chosen": -0.790105938911438, + "logits/rejected": -0.8273228406906128, + "logps/chosen": -65.33068084716797, + "logps/rejected": -63.65227508544922, + "loss": 0.4183, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0603439807891846, + "rewards/margins": 0.24899828433990479, + "rewards/rejected": 1.8113456964492798, + "step": 984 + }, + { + "epoch": 0.16, + "learning_rate": 9.934812718452988e-06, + "logits/chosen": -1.2866060733795166, + "logits/rejected": -1.2284554243087769, + "logps/chosen": -105.9365005493164, + "logps/rejected": -65.82254028320312, + "loss": 0.9051, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.886975884437561, + "rewards/margins": -0.6487724781036377, + "rewards/rejected": 1.5357483625411987, + "step": 985 + }, + { + "epoch": 0.16, + "learning_rate": 9.934601018664672e-06, + "logits/chosen": -0.7620704770088196, + "logits/rejected": -0.8613855838775635, + "logps/chosen": -73.70231628417969, + "logps/rejected": -115.19316101074219, + "loss": 1.0772, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0510194301605225, + "rewards/margins": -1.8703384399414062, + "rewards/rejected": 3.9213578701019287, + "step": 986 + }, + { + "epoch": 0.16, + "learning_rate": 9.934388977942116e-06, + "logits/chosen": -0.662204921245575, + "logits/rejected": -0.6144109964370728, + "logps/chosen": -26.80567741394043, + "logps/rejected": -43.89972686767578, + "loss": 0.6436, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7981569170951843, + "rewards/margins": -0.5511873364448547, + "rewards/rejected": 1.349344253540039, + "step": 987 + }, + { + "epoch": 0.16, + "learning_rate": 9.93417659629997e-06, + "logits/chosen": -0.6872990131378174, + "logits/rejected": -0.6975247859954834, + "logps/chosen": -71.89572143554688, + "logps/rejected": -45.78478240966797, + "loss": 0.2463, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5436729192733765, + "rewards/margins": 0.5264121294021606, + "rewards/rejected": 1.0172607898712158, + "step": 988 + }, + { + "epoch": 0.16, + "learning_rate": 9.93396387375291e-06, + "logits/chosen": -0.9410592913627625, + "logits/rejected": -0.9059848785400391, + "logps/chosen": -48.53178405761719, + "logps/rejected": -96.26184844970703, + "loss": 0.476, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3772149085998535, + "rewards/margins": -0.4250662326812744, + "rewards/rejected": 2.802281141281128, + "step": 989 + }, + { + "epoch": 0.16, + "learning_rate": 9.933750810315632e-06, + "logits/chosen": -0.6001541614532471, + "logits/rejected": -0.5827940702438354, + "logps/chosen": -63.261959075927734, + "logps/rejected": -97.1754379272461, + "loss": 2.1341, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3705165982246399, + "rewards/margins": -0.8727138638496399, + "rewards/rejected": 1.2432304620742798, + "step": 990 + }, + { + "epoch": 0.16, + "learning_rate": 9.933537406002858e-06, + "logits/chosen": -1.1230851411819458, + "logits/rejected": -1.1199252605438232, + "logps/chosen": -53.140769958496094, + "logps/rejected": -76.62193298339844, + "loss": 0.5032, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5930137634277344, + "rewards/margins": -0.5437448024749756, + "rewards/rejected": 2.13675856590271, + "step": 991 + }, + { + "epoch": 0.16, + "learning_rate": 9.933323660829328e-06, + "logits/chosen": -0.6887528896331787, + "logits/rejected": -0.5739595293998718, + "logps/chosen": -31.438684463500977, + "logps/rejected": -15.846352577209473, + "loss": 0.4329, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7194963693618774, + "rewards/margins": 0.8349452018737793, + "rewards/rejected": 0.8845511674880981, + "step": 992 + }, + { + "epoch": 0.16, + "learning_rate": 9.933109574809814e-06, + "logits/chosen": -0.48496928811073303, + "logits/rejected": -0.2675305902957916, + "logps/chosen": -72.672607421875, + "logps/rejected": -46.48814010620117, + "loss": 0.1666, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7874228954315186, + "rewards/margins": 2.1880311965942383, + "rewards/rejected": 1.5993915796279907, + "step": 993 + }, + { + "epoch": 0.16, + "learning_rate": 9.932895147959106e-06, + "logits/chosen": -0.5489899516105652, + "logits/rejected": -0.5489899516105652, + "logps/chosen": -38.41676330566406, + "logps/rejected": -38.41676330566406, + "loss": 0.5349, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.45139741897583, + "rewards/margins": 0.0, + "rewards/rejected": 2.45139741897583, + "step": 994 + }, + { + "epoch": 0.16, + "learning_rate": 9.932680380292019e-06, + "logits/chosen": -0.697618305683136, + "logits/rejected": -0.6462913751602173, + "logps/chosen": -71.93721008300781, + "logps/rejected": -42.16941833496094, + "loss": 0.9392, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.44661256670951843, + "rewards/margins": -0.8205474615097046, + "rewards/rejected": 1.2671600580215454, + "step": 995 + }, + { + "epoch": 0.16, + "learning_rate": 9.93246527182339e-06, + "logits/chosen": -0.7965465188026428, + "logits/rejected": -0.83793044090271, + "logps/chosen": -57.31657791137695, + "logps/rejected": -75.15989685058594, + "loss": 0.7558, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4892498254776, + "rewards/margins": -0.2599971294403076, + "rewards/rejected": 1.7492469549179077, + "step": 996 + }, + { + "epoch": 0.16, + "learning_rate": 9.932249822568085e-06, + "logits/chosen": -1.188218355178833, + "logits/rejected": -1.0577561855316162, + "logps/chosen": -128.61795043945312, + "logps/rejected": -58.10600280761719, + "loss": 0.9076, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9326568841934204, + "rewards/margins": 0.23875653743743896, + "rewards/rejected": 1.6939003467559814, + "step": 997 + }, + { + "epoch": 0.16, + "learning_rate": 9.932034032540984e-06, + "logits/chosen": -0.6007674932479858, + "logits/rejected": -0.5187857747077942, + "logps/chosen": -90.21304321289062, + "logps/rejected": -76.8084487915039, + "loss": 2.052, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.65985107421875, + "rewards/margins": -0.6506599187850952, + "rewards/rejected": 1.3105109930038452, + "step": 998 + }, + { + "epoch": 0.16, + "learning_rate": 9.931817901757e-06, + "logits/chosen": -0.5828919410705566, + "logits/rejected": -0.6894444823265076, + "logps/chosen": -75.40189361572266, + "logps/rejected": -92.1731185913086, + "loss": 1.7695, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6412713527679443, + "rewards/margins": -1.1210601329803467, + "rewards/rejected": 3.762331485748291, + "step": 999 + }, + { + "epoch": 0.16, + "learning_rate": 9.931601430231062e-06, + "logits/chosen": -0.6650331616401672, + "logits/rejected": -0.768011748790741, + "logps/chosen": -79.81239318847656, + "logps/rejected": -102.96730041503906, + "loss": 2.0589, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.805341362953186, + "rewards/margins": -2.449819564819336, + "rewards/rejected": 4.255160808563232, + "step": 1000 + }, + { + "epoch": 0.16, + "learning_rate": 9.93138461797813e-06, + "logits/chosen": -1.039915919303894, + "logits/rejected": -1.0030244588851929, + "logps/chosen": -139.3895263671875, + "logps/rejected": -63.95527648925781, + "loss": 0.0749, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.5960845947265625, + "rewards/margins": 2.1386497020721436, + "rewards/rejected": 3.457434892654419, + "step": 1001 + }, + { + "epoch": 0.16, + "learning_rate": 9.931167465013182e-06, + "logits/chosen": -0.7880202531814575, + "logits/rejected": -0.8088386058807373, + "logps/chosen": -79.51953125, + "logps/rejected": -185.2762451171875, + "loss": 3.4259, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4162087440490723, + "rewards/margins": -4.05629825592041, + "rewards/rejected": 6.472506999969482, + "step": 1002 + }, + { + "epoch": 0.16, + "learning_rate": 9.930949971351223e-06, + "logits/chosen": -0.8353986740112305, + "logits/rejected": -0.8353986740112305, + "logps/chosen": -131.9127197265625, + "logps/rejected": -131.9127197265625, + "loss": 0.4978, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7823151350021362, + "rewards/margins": 0.0, + "rewards/rejected": 1.7823151350021362, + "step": 1003 + }, + { + "epoch": 0.16, + "learning_rate": 9.930732137007275e-06, + "logits/chosen": -0.5524372458457947, + "logits/rejected": -0.5524897575378418, + "logps/chosen": -2.508800745010376, + "logps/rejected": -1.6779431104660034, + "loss": 0.6526, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4177589416503906, + "rewards/margins": -0.09536373615264893, + "rewards/rejected": 0.5131226778030396, + "step": 1004 + }, + { + "epoch": 0.16, + "learning_rate": 9.930513961996394e-06, + "logits/chosen": -0.44687265157699585, + "logits/rejected": -0.42427781224250793, + "logps/chosen": -13.541272163391113, + "logps/rejected": -21.533422470092773, + "loss": 0.8178, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8355805277824402, + "rewards/margins": 0.50794517993927, + "rewards/rejected": 0.32763537764549255, + "step": 1005 + }, + { + "epoch": 0.16, + "learning_rate": 9.930295446333649e-06, + "logits/chosen": -1.198736548423767, + "logits/rejected": -1.1412993669509888, + "logps/chosen": -86.12895202636719, + "logps/rejected": -62.507598876953125, + "loss": 0.3524, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2677459716796875, + "rewards/margins": -0.00558161735534668, + "rewards/rejected": 2.273327589035034, + "step": 1006 + }, + { + "epoch": 0.16, + "learning_rate": 9.930076590034141e-06, + "logits/chosen": -0.8616927266120911, + "logits/rejected": -0.8616927266120911, + "logps/chosen": -82.49430084228516, + "logps/rejected": -82.49430084228516, + "loss": 0.4809, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.614312767982483, + "rewards/margins": 0.0, + "rewards/rejected": 1.614312767982483, + "step": 1007 + }, + { + "epoch": 0.16, + "learning_rate": 9.929857393112989e-06, + "logits/chosen": -0.8727273344993591, + "logits/rejected": -0.978568434715271, + "logps/chosen": -63.571720123291016, + "logps/rejected": -131.79446411132812, + "loss": 0.8077, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6451358795166016, + "rewards/margins": -0.9479444026947021, + "rewards/rejected": 3.5930802822113037, + "step": 1008 + }, + { + "epoch": 0.16, + "learning_rate": 9.929637855585337e-06, + "logits/chosen": -0.5468672513961792, + "logits/rejected": -0.5808968544006348, + "logps/chosen": -27.557098388671875, + "logps/rejected": -18.603710174560547, + "loss": 0.8184, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.14674758911132812, + "rewards/margins": -0.45072880387306213, + "rewards/rejected": 0.303981214761734, + "step": 1009 + }, + { + "epoch": 0.16, + "learning_rate": 9.929417977466356e-06, + "logits/chosen": -0.5449709892272949, + "logits/rejected": -0.5672656297683716, + "logps/chosen": -69.24411010742188, + "logps/rejected": -51.84121322631836, + "loss": 0.5306, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8358360528945923, + "rewards/margins": -0.5987163782119751, + "rewards/rejected": 2.4345524311065674, + "step": 1010 + }, + { + "epoch": 0.16, + "learning_rate": 9.929197758771235e-06, + "logits/chosen": -0.7389549016952515, + "logits/rejected": -0.5573006868362427, + "logps/chosen": -92.8084945678711, + "logps/rejected": -78.30899047851562, + "loss": 0.0766, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.9982829093933105, + "rewards/margins": 1.8108179569244385, + "rewards/rejected": 3.187464952468872, + "step": 1011 + }, + { + "epoch": 0.16, + "learning_rate": 9.928977199515187e-06, + "logits/chosen": -1.153041958808899, + "logits/rejected": -1.0790647268295288, + "logps/chosen": -126.89045715332031, + "logps/rejected": -102.97923278808594, + "loss": 0.211, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.872277736663818, + "rewards/margins": 0.7243208885192871, + "rewards/rejected": 5.147956848144531, + "step": 1012 + }, + { + "epoch": 0.16, + "learning_rate": 9.928756299713454e-06, + "logits/chosen": -0.607005774974823, + "logits/rejected": -0.5187335014343262, + "logps/chosen": -63.95268630981445, + "logps/rejected": -13.065422058105469, + "loss": 0.4667, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9490062594413757, + "rewards/margins": 0.38868290185928345, + "rewards/rejected": 0.5603233575820923, + "step": 1013 + }, + { + "epoch": 0.16, + "learning_rate": 9.928535059381298e-06, + "logits/chosen": -0.6939927339553833, + "logits/rejected": -0.5961138606071472, + "logps/chosen": -55.85277557373047, + "logps/rejected": -40.60003662109375, + "loss": 1.2715, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1825110912323, + "rewards/margins": 0.4726557731628418, + "rewards/rejected": 1.709855318069458, + "step": 1014 + }, + { + "epoch": 0.16, + "learning_rate": 9.928313478534003e-06, + "logits/chosen": -1.2262153625488281, + "logits/rejected": -1.1810640096664429, + "logps/chosen": -72.71206665039062, + "logps/rejected": -52.918373107910156, + "loss": 0.7882, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6628235578536987, + "rewards/margins": 1.3095784187316895, + "rewards/rejected": 0.35324516892433167, + "step": 1015 + }, + { + "epoch": 0.16, + "learning_rate": 9.928091557186878e-06, + "logits/chosen": -0.5618224740028381, + "logits/rejected": -0.5274065136909485, + "logps/chosen": -66.34832763671875, + "logps/rejected": -27.46689224243164, + "loss": 0.8149, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.501904308795929, + "rewards/margins": -1.039285659790039, + "rewards/rejected": 1.5411900281906128, + "step": 1016 + }, + { + "epoch": 0.17, + "learning_rate": 9.927869295355258e-06, + "logits/chosen": -1.3003922700881958, + "logits/rejected": -1.249742865562439, + "logps/chosen": -34.820709228515625, + "logps/rejected": -21.029632568359375, + "loss": 0.0791, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.176952362060547, + "rewards/margins": 1.775987982749939, + "rewards/rejected": 0.4009643495082855, + "step": 1017 + }, + { + "epoch": 0.17, + "learning_rate": 9.927646693054498e-06, + "logits/chosen": -0.9472134709358215, + "logits/rejected": -0.7441808581352234, + "logps/chosen": -119.04988098144531, + "logps/rejected": -24.183853149414062, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.342688083648682, + "rewards/margins": 5.307958126068115, + "rewards/rejected": 1.0347298383712769, + "step": 1018 + }, + { + "epoch": 0.17, + "learning_rate": 9.927423750299974e-06, + "logits/chosen": -0.7500896453857422, + "logits/rejected": -0.779312252998352, + "logps/chosen": -63.21555709838867, + "logps/rejected": -61.24491882324219, + "loss": 0.5212, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.117938756942749, + "rewards/margins": -0.2793750762939453, + "rewards/rejected": 2.3973138332366943, + "step": 1019 + }, + { + "epoch": 0.17, + "learning_rate": 9.927200467107095e-06, + "logits/chosen": -0.5848895907402039, + "logits/rejected": -0.6715510487556458, + "logps/chosen": -63.900054931640625, + "logps/rejected": -72.54765319824219, + "loss": 1.9986, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6135025024414062, + "rewards/margins": -3.9720048904418945, + "rewards/rejected": 5.585507392883301, + "step": 1020 + }, + { + "epoch": 0.17, + "learning_rate": 9.926976843491285e-06, + "logits/chosen": -0.810642659664154, + "logits/rejected": -0.7386626601219177, + "logps/chosen": -70.70913696289062, + "logps/rejected": -42.62133026123047, + "loss": 1.7118, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.95860755443573, + "rewards/margins": -0.008210301399230957, + "rewards/rejected": 1.966817855834961, + "step": 1021 + }, + { + "epoch": 0.17, + "learning_rate": 9.926752879467995e-06, + "logits/chosen": -0.6562033891677856, + "logits/rejected": -0.6562033891677856, + "logps/chosen": -99.11827087402344, + "logps/rejected": -99.11827087402344, + "loss": 0.4934, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7563339471817017, + "rewards/margins": 0.0, + "rewards/rejected": 0.7563339471817017, + "step": 1022 + }, + { + "epoch": 0.17, + "learning_rate": 9.926528575052698e-06, + "logits/chosen": -0.8565542101860046, + "logits/rejected": -0.948924720287323, + "logps/chosen": -76.91931915283203, + "logps/rejected": -92.0157699584961, + "loss": 1.3059, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5519431829452515, + "rewards/margins": -1.6822456121444702, + "rewards/rejected": 3.2341887950897217, + "step": 1023 + }, + { + "epoch": 0.17, + "learning_rate": 9.926303930260892e-06, + "logits/chosen": -0.9774463772773743, + "logits/rejected": -0.8568050861358643, + "logps/chosen": -100.71954345703125, + "logps/rejected": -49.39323806762695, + "loss": 0.1811, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.076092720031738, + "rewards/margins": 3.1141369342803955, + "rewards/rejected": 3.9619557857513428, + "step": 1024 + }, + { + "epoch": 0.17, + "learning_rate": 9.926078945108098e-06, + "logits/chosen": -1.0919772386550903, + "logits/rejected": -1.1866041421890259, + "logps/chosen": -166.10870361328125, + "logps/rejected": -125.13243103027344, + "loss": 0.8933, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.042275905609131, + "rewards/margins": -0.5173110961914062, + "rewards/rejected": 5.559587001800537, + "step": 1025 + }, + { + "epoch": 0.17, + "learning_rate": 9.925853619609858e-06, + "logits/chosen": -0.5331553220748901, + "logits/rejected": -0.5331553220748901, + "logps/chosen": -53.30464172363281, + "logps/rejected": -53.30464172363281, + "loss": 0.3574, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5373992919921875, + "rewards/margins": 0.0, + "rewards/rejected": 1.5373992919921875, + "step": 1026 + }, + { + "epoch": 0.17, + "learning_rate": 9.925627953781744e-06, + "logits/chosen": -1.0845516920089722, + "logits/rejected": -0.8629153966903687, + "logps/chosen": -85.97921752929688, + "logps/rejected": -86.73056030273438, + "loss": 0.3374, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9591034650802612, + "rewards/margins": 0.9595375657081604, + "rewards/rejected": 0.9995658993721008, + "step": 1027 + }, + { + "epoch": 0.17, + "learning_rate": 9.925401947639344e-06, + "logits/chosen": -1.0800776481628418, + "logits/rejected": -1.1986428499221802, + "logps/chosen": -221.84815979003906, + "logps/rejected": -124.83819580078125, + "loss": 2.1285, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.290846347808838, + "rewards/margins": -1.0290327072143555, + "rewards/rejected": 6.319879055023193, + "step": 1028 + }, + { + "epoch": 0.17, + "learning_rate": 9.925175601198273e-06, + "logits/chosen": -0.9277109503746033, + "logits/rejected": -0.8476077318191528, + "logps/chosen": -64.73680877685547, + "logps/rejected": -64.95535278320312, + "loss": 0.6255, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7741615176200867, + "rewards/margins": 0.23041605949401855, + "rewards/rejected": 0.5437454581260681, + "step": 1029 + }, + { + "epoch": 0.17, + "learning_rate": 9.924948914474173e-06, + "logits/chosen": -0.7644563317298889, + "logits/rejected": -0.631628155708313, + "logps/chosen": -135.11473083496094, + "logps/rejected": -100.67774963378906, + "loss": 0.0303, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.031491279602051, + "rewards/margins": 2.8359460830688477, + "rewards/rejected": 2.195545196533203, + "step": 1030 + }, + { + "epoch": 0.17, + "learning_rate": 9.924721887482702e-06, + "logits/chosen": -0.9295254945755005, + "logits/rejected": -0.9206826686859131, + "logps/chosen": -87.4398422241211, + "logps/rejected": -247.41354370117188, + "loss": 1.8322, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.479326605796814, + "rewards/margins": -3.0400733947753906, + "rewards/rejected": 4.519400119781494, + "step": 1031 + }, + { + "epoch": 0.17, + "learning_rate": 9.924494520239545e-06, + "logits/chosen": -0.9484032988548279, + "logits/rejected": -1.0040136575698853, + "logps/chosen": -49.285377502441406, + "logps/rejected": -84.957275390625, + "loss": 2.5365, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.777788519859314, + "rewards/margins": -4.137303829193115, + "rewards/rejected": 5.915092468261719, + "step": 1032 + }, + { + "epoch": 0.17, + "learning_rate": 9.924266812760415e-06, + "logits/chosen": -0.9698896408081055, + "logits/rejected": -1.0032743215560913, + "logps/chosen": -101.91007995605469, + "logps/rejected": -192.43392944335938, + "loss": 0.9617, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.562170386314392, + "rewards/margins": -0.8846954107284546, + "rewards/rejected": 2.4468657970428467, + "step": 1033 + }, + { + "epoch": 0.17, + "learning_rate": 9.924038765061042e-06, + "logits/chosen": -1.126103162765503, + "logits/rejected": -1.1270090341567993, + "logps/chosen": -120.26316833496094, + "logps/rejected": -107.12724304199219, + "loss": 0.3051, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.33740234375, + "rewards/margins": 0.1835876703262329, + "rewards/rejected": 1.153814673423767, + "step": 1034 + }, + { + "epoch": 0.17, + "learning_rate": 9.92381037715718e-06, + "logits/chosen": -0.8148942589759827, + "logits/rejected": -0.8166270852088928, + "logps/chosen": -77.63919067382812, + "logps/rejected": -48.02995300292969, + "loss": 0.6344, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1276596784591675, + "rewards/margins": -0.9123839139938354, + "rewards/rejected": 2.040043592453003, + "step": 1035 + }, + { + "epoch": 0.17, + "learning_rate": 9.923581649064612e-06, + "logits/chosen": -0.831239640712738, + "logits/rejected": -0.7676218152046204, + "logps/chosen": -119.21334075927734, + "logps/rejected": -37.3453483581543, + "loss": 0.2708, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.235964298248291, + "rewards/margins": 1.0481503009796143, + "rewards/rejected": 1.1878139972686768, + "step": 1036 + }, + { + "epoch": 0.17, + "learning_rate": 9.92335258079914e-06, + "logits/chosen": -1.1363240480422974, + "logits/rejected": -0.8467519879341125, + "logps/chosen": -164.4252471923828, + "logps/rejected": -146.05340576171875, + "loss": 0.7529, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.058457851409912, + "rewards/margins": -0.16011524200439453, + "rewards/rejected": 4.218573093414307, + "step": 1037 + }, + { + "epoch": 0.17, + "learning_rate": 9.923123172376588e-06, + "logits/chosen": -0.6238774657249451, + "logits/rejected": -0.49236273765563965, + "logps/chosen": -66.2486343383789, + "logps/rejected": -12.948027610778809, + "loss": 0.6379, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.663890838623047, + "rewards/margins": 2.340806245803833, + "rewards/rejected": 0.3230845630168915, + "step": 1038 + }, + { + "epoch": 0.17, + "learning_rate": 9.92289342381281e-06, + "logits/chosen": -0.7118231654167175, + "logits/rejected": -0.7173677086830139, + "logps/chosen": -47.34398651123047, + "logps/rejected": -90.36145782470703, + "loss": 0.7181, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4534279108047485, + "rewards/margins": 0.8613601922988892, + "rewards/rejected": 0.5920677185058594, + "step": 1039 + }, + { + "epoch": 0.17, + "learning_rate": 9.922663335123674e-06, + "logits/chosen": -0.42634743452072144, + "logits/rejected": -0.4690461754798889, + "logps/chosen": -81.66824340820312, + "logps/rejected": -47.06805419921875, + "loss": 0.4, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2252633571624756, + "rewards/margins": 0.14305973052978516, + "rewards/rejected": 2.0822036266326904, + "step": 1040 + }, + { + "epoch": 0.17, + "learning_rate": 9.922432906325083e-06, + "logits/chosen": -0.8813912272453308, + "logits/rejected": -0.8541409969329834, + "logps/chosen": -68.90007019042969, + "logps/rejected": -56.4853515625, + "loss": 0.2467, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8982224464416504, + "rewards/margins": 0.7080733776092529, + "rewards/rejected": 2.1901490688323975, + "step": 1041 + }, + { + "epoch": 0.17, + "learning_rate": 9.922202137432954e-06, + "logits/chosen": -0.18167643249034882, + "logits/rejected": -0.18167643249034882, + "logps/chosen": -20.716970443725586, + "logps/rejected": -20.716970443725586, + "loss": 0.9773, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.25890350341796875, + "rewards/margins": 0.0, + "rewards/rejected": 0.25890350341796875, + "step": 1042 + }, + { + "epoch": 0.17, + "learning_rate": 9.921971028463233e-06, + "logits/chosen": -0.7575119733810425, + "logits/rejected": -1.0867362022399902, + "logps/chosen": -98.36371612548828, + "logps/rejected": -112.1335678100586, + "loss": 0.2372, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7472360134124756, + "rewards/margins": 0.686767578125, + "rewards/rejected": 2.0604684352874756, + "step": 1043 + }, + { + "epoch": 0.17, + "learning_rate": 9.921739579431883e-06, + "logits/chosen": -0.8009732961654663, + "logits/rejected": -0.6754125952720642, + "logps/chosen": -78.01850891113281, + "logps/rejected": -72.17054748535156, + "loss": 0.9952, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.197702169418335, + "rewards/margins": -1.1383988857269287, + "rewards/rejected": 4.336101055145264, + "step": 1044 + }, + { + "epoch": 0.17, + "learning_rate": 9.9215077903549e-06, + "logits/chosen": -0.7076573371887207, + "logits/rejected": -0.7845975160598755, + "logps/chosen": -112.96588134765625, + "logps/rejected": -104.67411804199219, + "loss": 0.5305, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.355513095855713, + "rewards/margins": 0.8744583129882812, + "rewards/rejected": 2.4810547828674316, + "step": 1045 + }, + { + "epoch": 0.17, + "learning_rate": 9.921275661248296e-06, + "logits/chosen": -0.7950448393821716, + "logits/rejected": -0.8842033743858337, + "logps/chosen": -36.98430252075195, + "logps/rejected": -99.57708740234375, + "loss": 1.222, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3471122980117798, + "rewards/margins": -2.116147518157959, + "rewards/rejected": 3.4632599353790283, + "step": 1046 + }, + { + "epoch": 0.17, + "learning_rate": 9.92104319212811e-06, + "logits/chosen": -0.837604820728302, + "logits/rejected": -0.9525009989738464, + "logps/chosen": -86.27107238769531, + "logps/rejected": -134.50067138671875, + "loss": 0.2311, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5790321826934814, + "rewards/margins": 0.550961971282959, + "rewards/rejected": 2.0280702114105225, + "step": 1047 + }, + { + "epoch": 0.17, + "learning_rate": 9.920810383010402e-06, + "logits/chosen": -0.7414405941963196, + "logits/rejected": -0.7399863004684448, + "logps/chosen": -2.238133430480957, + "logps/rejected": -1.5746735334396362, + "loss": 0.7065, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.37799832224845886, + "rewards/margins": -0.05271843075752258, + "rewards/rejected": 0.43071675300598145, + "step": 1048 + }, + { + "epoch": 0.17, + "learning_rate": 9.920577233911257e-06, + "logits/chosen": -0.9173545241355896, + "logits/rejected": -0.7123413681983948, + "logps/chosen": -77.06278228759766, + "logps/rejected": -50.132083892822266, + "loss": 0.3242, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.776327610015869, + "rewards/margins": 1.948129653930664, + "rewards/rejected": 0.8281978964805603, + "step": 1049 + }, + { + "epoch": 0.17, + "learning_rate": 9.920343744846786e-06, + "logits/chosen": -0.7977133989334106, + "logits/rejected": -0.7236182689666748, + "logps/chosen": -70.02297973632812, + "logps/rejected": -76.13713073730469, + "loss": 0.6708, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0571670532226562, + "rewards/margins": 0.14899742603302002, + "rewards/rejected": 1.9081696271896362, + "step": 1050 + }, + { + "epoch": 0.17, + "learning_rate": 9.920109915833118e-06, + "logits/chosen": -0.7458306550979614, + "logits/rejected": -0.7198777198791504, + "logps/chosen": -40.71952438354492, + "logps/rejected": -63.42619323730469, + "loss": 0.962, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7486084699630737, + "rewards/margins": -0.2604590654373169, + "rewards/rejected": 2.0090675354003906, + "step": 1051 + }, + { + "epoch": 0.17, + "learning_rate": 9.919875746886409e-06, + "logits/chosen": -0.8530144691467285, + "logits/rejected": -0.9950476288795471, + "logps/chosen": -172.93679809570312, + "logps/rejected": -200.62240600585938, + "loss": 1.4044, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.740399360656738, + "rewards/margins": -0.7620940208435059, + "rewards/rejected": 6.502493381500244, + "step": 1052 + }, + { + "epoch": 0.17, + "learning_rate": 9.919641238022839e-06, + "logits/chosen": -0.9317665696144104, + "logits/rejected": -0.9005598425865173, + "logps/chosen": -138.54983520507812, + "logps/rejected": -58.2546272277832, + "loss": 0.9191, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2734100818634033, + "rewards/margins": -0.44976305961608887, + "rewards/rejected": 1.7231731414794922, + "step": 1053 + }, + { + "epoch": 0.17, + "learning_rate": 9.919406389258607e-06, + "logits/chosen": -0.7749043107032776, + "logits/rejected": -0.8132354021072388, + "logps/chosen": -56.26111602783203, + "logps/rejected": -60.65381622314453, + "loss": 0.9768, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7492958307266235, + "rewards/margins": -0.8355896472930908, + "rewards/rejected": 1.5848854780197144, + "step": 1054 + }, + { + "epoch": 0.17, + "learning_rate": 9.919171200609945e-06, + "logits/chosen": -0.7684983611106873, + "logits/rejected": -0.7749994397163391, + "logps/chosen": -54.73255920410156, + "logps/rejected": -89.37460327148438, + "loss": 0.2978, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9627685546875, + "rewards/margins": 0.4035438299179077, + "rewards/rejected": 1.5592247247695923, + "step": 1055 + }, + { + "epoch": 0.17, + "learning_rate": 9.918935672093096e-06, + "logits/chosen": -0.45445889234542847, + "logits/rejected": -0.5177388787269592, + "logps/chosen": -41.41157531738281, + "logps/rejected": -56.95820999145508, + "loss": 0.8211, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5560845136642456, + "rewards/margins": -0.3193545341491699, + "rewards/rejected": 1.8754390478134155, + "step": 1056 + }, + { + "epoch": 0.17, + "learning_rate": 9.918699803724337e-06, + "logits/chosen": -0.5963652729988098, + "logits/rejected": -0.6588645577430725, + "logps/chosen": -93.09545135498047, + "logps/rejected": -127.541748046875, + "loss": 0.6955, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.438999891281128, + "rewards/margins": -0.9485886096954346, + "rewards/rejected": 4.3875885009765625, + "step": 1057 + }, + { + "epoch": 0.17, + "learning_rate": 9.918463595519963e-06, + "logits/chosen": -0.7552613019943237, + "logits/rejected": -0.7552613019943237, + "logps/chosen": -102.3837661743164, + "logps/rejected": -102.3837661743164, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.723122477531433, + "rewards/margins": 0.0, + "rewards/rejected": 1.723122477531433, + "step": 1058 + }, + { + "epoch": 0.17, + "learning_rate": 9.918227047496292e-06, + "logits/chosen": -0.8484178781509399, + "logits/rejected": -0.6898423433303833, + "logps/chosen": -94.95985412597656, + "logps/rejected": -36.54196548461914, + "loss": 0.7767, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0483207702636719, + "rewards/margins": 0.8897861242294312, + "rewards/rejected": 0.15853463113307953, + "step": 1059 + }, + { + "epoch": 0.17, + "learning_rate": 9.91799015966967e-06, + "logits/chosen": -0.6378569006919861, + "logits/rejected": -0.6378569006919861, + "logps/chosen": -61.21827697753906, + "logps/rejected": -61.21827697753906, + "loss": 0.6505, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.619258165359497, + "rewards/margins": 0.0, + "rewards/rejected": 1.619258165359497, + "step": 1060 + }, + { + "epoch": 0.17, + "learning_rate": 9.917752932056462e-06, + "logits/chosen": -0.6534433960914612, + "logits/rejected": -0.6534433960914612, + "logps/chosen": -69.09815216064453, + "logps/rejected": -69.09815216064453, + "loss": 1.2015, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6857063174247742, + "rewards/margins": 0.0, + "rewards/rejected": 0.6857063174247742, + "step": 1061 + }, + { + "epoch": 0.17, + "learning_rate": 9.917515364673056e-06, + "logits/chosen": -0.8348450064659119, + "logits/rejected": -0.8844822645187378, + "logps/chosen": -84.65302276611328, + "logps/rejected": -37.43793869018555, + "loss": 0.7579, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.985781192779541, + "rewards/margins": 2.649346351623535, + "rewards/rejected": 1.3364349603652954, + "step": 1062 + }, + { + "epoch": 0.17, + "learning_rate": 9.917277457535873e-06, + "logits/chosen": -0.816917896270752, + "logits/rejected": -0.8719503879547119, + "logps/chosen": -60.46240234375, + "logps/rejected": -90.13311767578125, + "loss": 0.666, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9829788208007812, + "rewards/margins": -0.5049896240234375, + "rewards/rejected": 1.4879684448242188, + "step": 1063 + }, + { + "epoch": 0.17, + "learning_rate": 9.917039210661341e-06, + "logits/chosen": -0.7802512645721436, + "logits/rejected": -0.7356520891189575, + "logps/chosen": -95.61624145507812, + "logps/rejected": -57.00776290893555, + "loss": 0.6165, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7344329953193665, + "rewards/margins": -0.878976047039032, + "rewards/rejected": 1.6134090423583984, + "step": 1064 + }, + { + "epoch": 0.17, + "learning_rate": 9.916800624065928e-06, + "logits/chosen": -0.9294317960739136, + "logits/rejected": -0.8873370289802551, + "logps/chosen": -124.3235092163086, + "logps/rejected": -209.8443145751953, + "loss": 0.8529, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.236936092376709, + "rewards/margins": -1.2385540008544922, + "rewards/rejected": 5.475490093231201, + "step": 1065 + }, + { + "epoch": 0.17, + "learning_rate": 9.916561697766114e-06, + "logits/chosen": -1.250536322593689, + "logits/rejected": -1.2699017524719238, + "logps/chosen": -71.28128051757812, + "logps/rejected": -128.42495727539062, + "loss": 1.1286, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.456237316131592, + "rewards/margins": -1.9909610748291016, + "rewards/rejected": 6.447198390960693, + "step": 1066 + }, + { + "epoch": 0.17, + "learning_rate": 9.916322431778408e-06, + "logits/chosen": -1.1934329271316528, + "logits/rejected": -1.1629457473754883, + "logps/chosen": -121.990234375, + "logps/rejected": -145.32485961914062, + "loss": 0.045, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.6591477394104, + "rewards/margins": 2.607962131500244, + "rewards/rejected": 5.051185607910156, + "step": 1067 + }, + { + "epoch": 0.17, + "learning_rate": 9.91608282611934e-06, + "logits/chosen": -0.6775638461112976, + "logits/rejected": -0.5215858221054077, + "logps/chosen": -70.28646850585938, + "logps/rejected": -44.225921630859375, + "loss": 0.0752, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0281434059143066, + "rewards/margins": 1.8586632013320923, + "rewards/rejected": 1.1694802045822144, + "step": 1068 + }, + { + "epoch": 0.17, + "learning_rate": 9.915842880805466e-06, + "logits/chosen": -0.8604180216789246, + "logits/rejected": -0.7758450508117676, + "logps/chosen": -41.801090240478516, + "logps/rejected": -41.75526809692383, + "loss": 0.5691, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6330264806747437, + "rewards/margins": 0.6721618175506592, + "rewards/rejected": 0.9608646631240845, + "step": 1069 + }, + { + "epoch": 0.17, + "learning_rate": 9.915602595853363e-06, + "logits/chosen": -0.8052322864532471, + "logits/rejected": -0.7054086327552795, + "logps/chosen": -55.30535888671875, + "logps/rejected": -21.837926864624023, + "loss": 0.4295, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5844287872314453, + "rewards/margins": 0.034430861473083496, + "rewards/rejected": 0.5499979257583618, + "step": 1070 + }, + { + "epoch": 0.17, + "learning_rate": 9.915361971279631e-06, + "logits/chosen": -0.9163388609886169, + "logits/rejected": -1.0032401084899902, + "logps/chosen": -219.90457153320312, + "logps/rejected": -173.8032684326172, + "loss": 0.2243, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.194491863250732, + "rewards/margins": 0.7062671184539795, + "rewards/rejected": 3.488224744796753, + "step": 1071 + }, + { + "epoch": 0.17, + "learning_rate": 9.9151210071009e-06, + "logits/chosen": -0.5618488788604736, + "logits/rejected": -0.5761560201644897, + "logps/chosen": -105.77116394042969, + "logps/rejected": -87.68638610839844, + "loss": 0.857, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.315591424703598, + "rewards/margins": -1.3733985424041748, + "rewards/rejected": 1.6889899969100952, + "step": 1072 + }, + { + "epoch": 0.17, + "learning_rate": 9.914879703333811e-06, + "logits/chosen": -0.5790195465087891, + "logits/rejected": -0.5508353114128113, + "logps/chosen": -51.18806457519531, + "logps/rejected": -44.07759094238281, + "loss": 0.5137, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.491647481918335, + "rewards/margins": -0.47104644775390625, + "rewards/rejected": 2.962693929672241, + "step": 1073 + }, + { + "epoch": 0.17, + "learning_rate": 9.91463805999504e-06, + "logits/chosen": -0.7567042708396912, + "logits/rejected": -0.6627806425094604, + "logps/chosen": -59.318721771240234, + "logps/rejected": -34.250396728515625, + "loss": 0.8066, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8461414575576782, + "rewards/margins": 0.5072215795516968, + "rewards/rejected": 1.3389198780059814, + "step": 1074 + }, + { + "epoch": 0.17, + "learning_rate": 9.914396077101283e-06, + "logits/chosen": -0.7267269492149353, + "logits/rejected": -0.6706776022911072, + "logps/chosen": -190.55899047851562, + "logps/rejected": -174.5454864501953, + "loss": 0.784, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.709155559539795, + "rewards/margins": 3.418428421020508, + "rewards/rejected": 1.2907272577285767, + "step": 1075 + }, + { + "epoch": 0.17, + "learning_rate": 9.914153754669256e-06, + "logits/chosen": -1.1804351806640625, + "logits/rejected": -1.0767076015472412, + "logps/chosen": -62.393184661865234, + "logps/rejected": -96.39865112304688, + "loss": 0.2713, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.577812671661377, + "rewards/margins": 0.3343074321746826, + "rewards/rejected": 2.2435052394866943, + "step": 1076 + }, + { + "epoch": 0.17, + "learning_rate": 9.913911092715703e-06, + "logits/chosen": -0.5153332948684692, + "logits/rejected": -0.5793940424919128, + "logps/chosen": -57.50044631958008, + "logps/rejected": -46.540794372558594, + "loss": 2.434, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.122585654258728, + "rewards/margins": -1.4978123903274536, + "rewards/rejected": 2.6203980445861816, + "step": 1077 + }, + { + "epoch": 0.17, + "learning_rate": 9.91366809125739e-06, + "logits/chosen": -0.8023507595062256, + "logits/rejected": -0.8238227963447571, + "logps/chosen": -204.7345733642578, + "logps/rejected": -92.17171478271484, + "loss": 1.0413, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.408578634262085, + "rewards/margins": 2.1354188919067383, + "rewards/rejected": 1.2731598615646362, + "step": 1078 + }, + { + "epoch": 0.18, + "learning_rate": 9.913424750311107e-06, + "logits/chosen": -0.9734993577003479, + "logits/rejected": -0.9747793078422546, + "logps/chosen": -72.13966369628906, + "logps/rejected": -73.95881652832031, + "loss": 2.5855, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5283730030059814, + "rewards/margins": -1.7463607788085938, + "rewards/rejected": 3.274733781814575, + "step": 1079 + }, + { + "epoch": 0.18, + "learning_rate": 9.913181069893662e-06, + "logits/chosen": -0.8830820918083191, + "logits/rejected": -0.8802115321159363, + "logps/chosen": -61.79288864135742, + "logps/rejected": -188.65965270996094, + "loss": 2.0286, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7893116474151611, + "rewards/margins": -4.001711845397949, + "rewards/rejected": 5.791023254394531, + "step": 1080 + }, + { + "epoch": 0.18, + "learning_rate": 9.912937050021896e-06, + "logits/chosen": -0.38019412755966187, + "logits/rejected": -0.3918239176273346, + "logps/chosen": -24.40282440185547, + "logps/rejected": -23.252674102783203, + "loss": 0.6758, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.103571318089962, + "rewards/margins": -0.11949902027845383, + "rewards/rejected": 0.22307033836841583, + "step": 1081 + }, + { + "epoch": 0.18, + "learning_rate": 9.912692690712667e-06, + "logits/chosen": -1.1943676471710205, + "logits/rejected": -1.0863797664642334, + "logps/chosen": -98.86437225341797, + "logps/rejected": -36.90029525756836, + "loss": 0.0243, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.7463507652282715, + "rewards/margins": 3.127471446990967, + "rewards/rejected": 2.6188793182373047, + "step": 1082 + }, + { + "epoch": 0.18, + "learning_rate": 9.912447991982857e-06, + "logits/chosen": -1.133509874343872, + "logits/rejected": -1.0157051086425781, + "logps/chosen": -72.79884338378906, + "logps/rejected": -31.089069366455078, + "loss": 0.1402, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4838905334472656, + "rewards/margins": 1.2178943157196045, + "rewards/rejected": 0.26599618792533875, + "step": 1083 + }, + { + "epoch": 0.18, + "learning_rate": 9.91220295384937e-06, + "logits/chosen": -0.6302797794342041, + "logits/rejected": -0.7648680806159973, + "logps/chosen": -98.99597930908203, + "logps/rejected": -135.24685668945312, + "loss": 1.0243, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9131004810333252, + "rewards/margins": -1.8952186107635498, + "rewards/rejected": 3.808319091796875, + "step": 1084 + }, + { + "epoch": 0.18, + "learning_rate": 9.91195757632914e-06, + "logits/chosen": -0.9830310344696045, + "logits/rejected": -0.9900108575820923, + "logps/chosen": -221.1858673095703, + "logps/rejected": -235.17733764648438, + "loss": 0.6228, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.671492099761963, + "rewards/margins": -0.8780837059020996, + "rewards/rejected": 5.5495758056640625, + "step": 1085 + }, + { + "epoch": 0.18, + "learning_rate": 9.91171185943912e-06, + "logits/chosen": -0.8382629752159119, + "logits/rejected": -0.746508777141571, + "logps/chosen": -59.29652786254883, + "logps/rejected": -64.13026428222656, + "loss": 1.1375, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9218227863311768, + "rewards/margins": 0.2648700475692749, + "rewards/rejected": 1.6569527387619019, + "step": 1086 + }, + { + "epoch": 0.18, + "learning_rate": 9.911465803196286e-06, + "logits/chosen": -1.1915225982666016, + "logits/rejected": -1.3523648977279663, + "logps/chosen": -95.87959289550781, + "logps/rejected": -35.345909118652344, + "loss": 0.3654, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.895867943763733, + "rewards/margins": 1.476812720298767, + "rewards/rejected": 0.41905519366264343, + "step": 1087 + }, + { + "epoch": 0.18, + "learning_rate": 9.911219407617638e-06, + "logits/chosen": -0.967864990234375, + "logits/rejected": -1.0515902042388916, + "logps/chosen": -180.33804321289062, + "logps/rejected": -226.6265411376953, + "loss": 2.3527, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.348858833312988, + "rewards/margins": -3.4505233764648438, + "rewards/rejected": 7.799382209777832, + "step": 1088 + }, + { + "epoch": 0.18, + "learning_rate": 9.910972672720198e-06, + "logits/chosen": -1.1805503368377686, + "logits/rejected": -1.1662977933883667, + "logps/chosen": -100.01827239990234, + "logps/rejected": -98.41007995605469, + "loss": 0.9404, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.793108344078064, + "rewards/margins": -0.8067909479141235, + "rewards/rejected": 2.5998992919921875, + "step": 1089 + }, + { + "epoch": 0.18, + "learning_rate": 9.910725598521014e-06, + "logits/chosen": -0.9617851972579956, + "logits/rejected": -0.6560981273651123, + "logps/chosen": -113.03684997558594, + "logps/rejected": -46.67505645751953, + "loss": 0.0675, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.403190612792969, + "rewards/margins": 6.340602874755859, + "rewards/rejected": 0.06258773803710938, + "step": 1090 + }, + { + "epoch": 0.18, + "learning_rate": 9.910478185037159e-06, + "logits/chosen": -0.3945826292037964, + "logits/rejected": -0.3945826292037964, + "logps/chosen": -4.001270294189453, + "logps/rejected": -4.001270294189453, + "loss": 0.6384, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3450888693332672, + "rewards/margins": 0.0, + "rewards/rejected": 0.3450888693332672, + "step": 1091 + }, + { + "epoch": 0.18, + "learning_rate": 9.910230432285722e-06, + "logits/chosen": -0.5702180862426758, + "logits/rejected": -0.5291637182235718, + "logps/chosen": -77.07182312011719, + "logps/rejected": -30.033933639526367, + "loss": 0.137, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3174729347229004, + "rewards/margins": 1.1941076517105103, + "rewards/rejected": 1.1233652830123901, + "step": 1092 + }, + { + "epoch": 0.18, + "learning_rate": 9.909982340283824e-06, + "logits/chosen": -0.7975302338600159, + "logits/rejected": -0.7876273989677429, + "logps/chosen": -66.02001953125, + "logps/rejected": -97.44054412841797, + "loss": 0.4446, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7013870477676392, + "rewards/margins": -0.15824657678604126, + "rewards/rejected": 0.8596336245536804, + "step": 1093 + }, + { + "epoch": 0.18, + "learning_rate": 9.909733909048606e-06, + "logits/chosen": -0.6840369701385498, + "logits/rejected": -0.719143271446228, + "logps/chosen": -96.58795166015625, + "logps/rejected": -107.08819580078125, + "loss": 0.4814, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6224991083145142, + "rewards/margins": 0.39733052253723145, + "rewards/rejected": 1.2251685857772827, + "step": 1094 + }, + { + "epoch": 0.18, + "learning_rate": 9.909485138597231e-06, + "logits/chosen": -0.5765254497528076, + "logits/rejected": -0.631731390953064, + "logps/chosen": -5.127641677856445, + "logps/rejected": -47.54817199707031, + "loss": 1.8774, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4402560293674469, + "rewards/margins": -0.5566178560256958, + "rewards/rejected": 0.9968738555908203, + "step": 1095 + }, + { + "epoch": 0.18, + "learning_rate": 9.909236028946885e-06, + "logits/chosen": -0.6409870982170105, + "logits/rejected": -0.49631640315055847, + "logps/chosen": -56.055091857910156, + "logps/rejected": -41.47523880004883, + "loss": 0.2883, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6073235273361206, + "rewards/margins": 0.3338634967803955, + "rewards/rejected": 1.273460030555725, + "step": 1096 + }, + { + "epoch": 0.18, + "learning_rate": 9.908986580114783e-06, + "logits/chosen": -0.9053390026092529, + "logits/rejected": -0.8941728472709656, + "logps/chosen": -60.357723236083984, + "logps/rejected": -62.50991439819336, + "loss": 0.7145, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6602100133895874, + "rewards/margins": -0.718521237373352, + "rewards/rejected": 2.3787312507629395, + "step": 1097 + }, + { + "epoch": 0.18, + "learning_rate": 9.908736792118157e-06, + "logits/chosen": -0.7432287931442261, + "logits/rejected": -0.7591704726219177, + "logps/chosen": -3.2921228408813477, + "logps/rejected": -1.6357932090759277, + "loss": 2.0856, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.18747682869434357, + "rewards/margins": -0.1168331652879715, + "rewards/rejected": 0.30430999398231506, + "step": 1098 + }, + { + "epoch": 0.18, + "learning_rate": 9.908486664974266e-06, + "logits/chosen": -0.6884504556655884, + "logits/rejected": -0.5568250417709351, + "logps/chosen": -75.67818450927734, + "logps/rejected": -25.451480865478516, + "loss": 0.2106, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.659437656402588, + "rewards/margins": 1.9526822566986084, + "rewards/rejected": 0.7067554593086243, + "step": 1099 + }, + { + "epoch": 0.18, + "learning_rate": 9.90823619870039e-06, + "logits/chosen": -0.7455837726593018, + "logits/rejected": -0.718392014503479, + "logps/chosen": -41.94269943237305, + "logps/rejected": -68.78274536132812, + "loss": 0.403, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.403153657913208, + "rewards/margins": 0.1126246452331543, + "rewards/rejected": 2.2905290126800537, + "step": 1100 + }, + { + "epoch": 0.18, + "learning_rate": 9.907985393313836e-06, + "logits/chosen": -0.8761774301528931, + "logits/rejected": -0.969946026802063, + "logps/chosen": -89.01145935058594, + "logps/rejected": -114.33474731445312, + "loss": 0.8302, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2044677734375, + "rewards/margins": -1.2207245826721191, + "rewards/rejected": 3.425192356109619, + "step": 1101 + }, + { + "epoch": 0.18, + "learning_rate": 9.90773424883193e-06, + "logits/chosen": -1.1087830066680908, + "logits/rejected": -1.1144813299179077, + "logps/chosen": -66.2096939086914, + "logps/rejected": -77.15847778320312, + "loss": 0.2965, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.046346426010132, + "rewards/margins": 0.2297447919845581, + "rewards/rejected": 1.8166016340255737, + "step": 1102 + }, + { + "epoch": 0.18, + "learning_rate": 9.907482765272027e-06, + "logits/chosen": -0.8486506342887878, + "logits/rejected": -0.866662323474884, + "logps/chosen": -6.915375709533691, + "logps/rejected": -44.06833267211914, + "loss": 1.4436, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.28337621688842773, + "rewards/margins": -0.30815595388412476, + "rewards/rejected": 0.5915321707725525, + "step": 1103 + }, + { + "epoch": 0.18, + "learning_rate": 9.907230942651498e-06, + "logits/chosen": -1.0214166641235352, + "logits/rejected": -0.7298220992088318, + "logps/chosen": -155.54946899414062, + "logps/rejected": -20.616281509399414, + "loss": 0.0399, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.537990093231201, + "rewards/margins": 5.456254005432129, + "rewards/rejected": 0.0817359909415245, + "step": 1104 + }, + { + "epoch": 0.18, + "learning_rate": 9.906978780987744e-06, + "logits/chosen": -0.4786258041858673, + "logits/rejected": -0.5132476687431335, + "logps/chosen": -53.8411865234375, + "logps/rejected": -49.086692810058594, + "loss": 1.3056, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.15450668334960938, + "rewards/margins": -1.7591781616210938, + "rewards/rejected": 1.6046714782714844, + "step": 1105 + }, + { + "epoch": 0.18, + "learning_rate": 9.906726280298185e-06, + "logits/chosen": -1.0484378337860107, + "logits/rejected": -0.9420539736747742, + "logps/chosen": -111.45055389404297, + "logps/rejected": -59.670204162597656, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.966416358947754, + "rewards/margins": 4.012292861938477, + "rewards/rejected": 1.9541237354278564, + "step": 1106 + }, + { + "epoch": 0.18, + "learning_rate": 9.906473440600271e-06, + "logits/chosen": -0.9273026585578918, + "logits/rejected": -0.8434553146362305, + "logps/chosen": -79.402587890625, + "logps/rejected": -45.74198913574219, + "loss": 0.754, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.174224853515625, + "rewards/margins": 0.5641075372695923, + "rewards/rejected": 1.6101173162460327, + "step": 1107 + }, + { + "epoch": 0.18, + "learning_rate": 9.906220261911466e-06, + "logits/chosen": -1.1149400472640991, + "logits/rejected": -1.1452687978744507, + "logps/chosen": -87.17840576171875, + "logps/rejected": -92.13028717041016, + "loss": 0.3728, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.749018907546997, + "rewards/margins": 0.021935224533081055, + "rewards/rejected": 2.727083683013916, + "step": 1108 + }, + { + "epoch": 0.18, + "learning_rate": 9.905966744249262e-06, + "logits/chosen": -0.8471750617027283, + "logits/rejected": -0.8067190647125244, + "logps/chosen": -69.40550231933594, + "logps/rejected": -46.58488845825195, + "loss": 0.5702, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.38886719942092896, + "rewards/margins": -0.7519657015800476, + "rewards/rejected": 1.1408329010009766, + "step": 1109 + }, + { + "epoch": 0.18, + "learning_rate": 9.90571288763118e-06, + "logits/chosen": -0.9583612084388733, + "logits/rejected": -0.8857651948928833, + "logps/chosen": -200.00634765625, + "logps/rejected": -81.52471923828125, + "loss": 0.115, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.408298015594482, + "rewards/margins": 2.7324869632720947, + "rewards/rejected": 2.6758110523223877, + "step": 1110 + }, + { + "epoch": 0.18, + "learning_rate": 9.905458692074755e-06, + "logits/chosen": -0.8826810717582703, + "logits/rejected": -0.8779268264770508, + "logps/chosen": -14.739486694335938, + "logps/rejected": -23.954177856445312, + "loss": 0.7014, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.11420383304357529, + "rewards/margins": -0.03258610516786575, + "rewards/rejected": 0.14678993821144104, + "step": 1111 + }, + { + "epoch": 0.18, + "learning_rate": 9.905204157597548e-06, + "logits/chosen": -0.6511728763580322, + "logits/rejected": -0.5088980793952942, + "logps/chosen": -84.01681518554688, + "logps/rejected": -67.91300201416016, + "loss": 0.0274, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.980264186859131, + "rewards/margins": 4.538518905639648, + "rewards/rejected": 1.4417450428009033, + "step": 1112 + }, + { + "epoch": 0.18, + "learning_rate": 9.90494928421715e-06, + "logits/chosen": -1.0205707550048828, + "logits/rejected": -0.9510658383369446, + "logps/chosen": -53.61461639404297, + "logps/rejected": -41.491065979003906, + "loss": 0.9453, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8246865272521973, + "rewards/margins": 1.025719165802002, + "rewards/rejected": 1.7989673614501953, + "step": 1113 + }, + { + "epoch": 0.18, + "learning_rate": 9.904694071951167e-06, + "logits/chosen": -0.5211344957351685, + "logits/rejected": -0.5211344957351685, + "logps/chosen": -78.23155975341797, + "logps/rejected": -78.23155975341797, + "loss": 1.2225, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2326080799102783, + "rewards/margins": 0.0, + "rewards/rejected": 1.2326080799102783, + "step": 1114 + }, + { + "epoch": 0.18, + "learning_rate": 9.90443852081723e-06, + "logits/chosen": -0.9354482293128967, + "logits/rejected": -0.8905490040779114, + "logps/chosen": -49.422271728515625, + "logps/rejected": -18.58895492553711, + "loss": 0.4028, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8356819152832031, + "rewards/margins": -0.11852836608886719, + "rewards/rejected": 0.9542102813720703, + "step": 1115 + }, + { + "epoch": 0.18, + "learning_rate": 9.904182630832998e-06, + "logits/chosen": -0.9838342070579529, + "logits/rejected": -1.0478336811065674, + "logps/chosen": -59.235633850097656, + "logps/rejected": -99.15619659423828, + "loss": 0.5705, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.932684302330017, + "rewards/margins": -0.048920512199401855, + "rewards/rejected": 1.981604814529419, + "step": 1116 + }, + { + "epoch": 0.18, + "learning_rate": 9.903926402016153e-06, + "logits/chosen": -0.396424263715744, + "logits/rejected": -0.396424263715744, + "logps/chosen": -75.26453399658203, + "logps/rejected": -75.26453399658203, + "loss": 0.3484, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3275741636753082, + "rewards/margins": 0.0, + "rewards/rejected": 0.3275741636753082, + "step": 1117 + }, + { + "epoch": 0.18, + "learning_rate": 9.903669834384392e-06, + "logits/chosen": -0.7848242521286011, + "logits/rejected": -0.8296281099319458, + "logps/chosen": -29.14055824279785, + "logps/rejected": -64.570068359375, + "loss": 0.9177, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.178302526473999, + "rewards/margins": -1.6540064811706543, + "rewards/rejected": 3.8323090076446533, + "step": 1118 + }, + { + "epoch": 0.18, + "learning_rate": 9.903412927955446e-06, + "logits/chosen": -0.7117148637771606, + "logits/rejected": -0.6305639147758484, + "logps/chosen": -67.54869079589844, + "logps/rejected": -25.736658096313477, + "loss": 0.1564, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4114837646484375, + "rewards/margins": 1.268277883529663, + "rewards/rejected": 1.1432058811187744, + "step": 1119 + }, + { + "epoch": 0.18, + "learning_rate": 9.903155682747064e-06, + "logits/chosen": -1.1315685510635376, + "logits/rejected": -1.0564261674880981, + "logps/chosen": -72.82197570800781, + "logps/rejected": -209.97857666015625, + "loss": 2.0317, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2742676734924316, + "rewards/margins": -3.9692931175231934, + "rewards/rejected": 6.243560791015625, + "step": 1120 + }, + { + "epoch": 0.18, + "learning_rate": 9.902898098777016e-06, + "logits/chosen": -0.727714478969574, + "logits/rejected": -0.6865300536155701, + "logps/chosen": -61.47873306274414, + "logps/rejected": -41.0338134765625, + "loss": 0.7374, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0303958654403687, + "rewards/margins": -0.7964305877685547, + "rewards/rejected": 1.8268264532089233, + "step": 1121 + }, + { + "epoch": 0.18, + "learning_rate": 9.902640176063103e-06, + "logits/chosen": -0.9380643963813782, + "logits/rejected": -1.2120435237884521, + "logps/chosen": -109.23936462402344, + "logps/rejected": -35.87615966796875, + "loss": 0.1897, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2322174310684204, + "rewards/margins": 1.061130166053772, + "rewards/rejected": 0.17108726501464844, + "step": 1122 + }, + { + "epoch": 0.18, + "learning_rate": 9.902381914623142e-06, + "logits/chosen": -0.9358757734298706, + "logits/rejected": -0.9358757734298706, + "logps/chosen": -53.460784912109375, + "logps/rejected": -53.460784912109375, + "loss": 0.556, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1546295881271362, + "rewards/margins": 0.0, + "rewards/rejected": 1.1546295881271362, + "step": 1123 + }, + { + "epoch": 0.18, + "learning_rate": 9.902123314474979e-06, + "logits/chosen": -0.7093110084533691, + "logits/rejected": -0.7395483255386353, + "logps/chosen": -186.18429565429688, + "logps/rejected": -101.45553588867188, + "loss": 0.3866, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.178216457366943, + "rewards/margins": 0.40863633155822754, + "rewards/rejected": 3.769580125808716, + "step": 1124 + }, + { + "epoch": 0.18, + "learning_rate": 9.901864375636477e-06, + "logits/chosen": -1.3265376091003418, + "logits/rejected": -1.3345322608947754, + "logps/chosen": -10.27951431274414, + "logps/rejected": -22.222469329833984, + "loss": 0.6334, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22228221595287323, + "rewards/margins": 0.09776362776756287, + "rewards/rejected": 0.12451858818531036, + "step": 1125 + }, + { + "epoch": 0.18, + "learning_rate": 9.901605098125528e-06, + "logits/chosen": -1.0926181077957153, + "logits/rejected": -1.1610013246536255, + "logps/chosen": -127.05782318115234, + "logps/rejected": -219.103759765625, + "loss": 0.5312, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.959683895111084, + "rewards/margins": 0.8453192710876465, + "rewards/rejected": 6.1143646240234375, + "step": 1126 + }, + { + "epoch": 0.18, + "learning_rate": 9.901345481960049e-06, + "logits/chosen": -0.6872801184654236, + "logits/rejected": -0.6801244020462036, + "logps/chosen": -18.570602416992188, + "logps/rejected": -11.34787368774414, + "loss": 1.1571, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.004399490542709827, + "rewards/margins": -0.3278753459453583, + "rewards/rejected": 0.33227482438087463, + "step": 1127 + }, + { + "epoch": 0.18, + "learning_rate": 9.901085527157971e-06, + "logits/chosen": -0.8576279878616333, + "logits/rejected": -0.8201282024383545, + "logps/chosen": -62.55580520629883, + "logps/rejected": -42.54647445678711, + "loss": 0.4228, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2378787994384766, + "rewards/margins": 1.0869728326797485, + "rewards/rejected": 1.150905966758728, + "step": 1128 + }, + { + "epoch": 0.18, + "learning_rate": 9.900825233737261e-06, + "logits/chosen": -0.5315654277801514, + "logits/rejected": -0.5315654277801514, + "logps/chosen": -34.648590087890625, + "logps/rejected": -34.648590087890625, + "loss": 0.3565, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.38443222641944885, + "rewards/margins": 0.0, + "rewards/rejected": 0.38443222641944885, + "step": 1129 + }, + { + "epoch": 0.18, + "learning_rate": 9.900564601715898e-06, + "logits/chosen": -1.0309332609176636, + "logits/rejected": -0.9067134261131287, + "logps/chosen": -96.81659698486328, + "logps/rejected": -84.43832397460938, + "loss": 0.2224, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.495659589767456, + "rewards/margins": 0.5930564403533936, + "rewards/rejected": 2.9026031494140625, + "step": 1130 + }, + { + "epoch": 0.18, + "learning_rate": 9.90030363111189e-06, + "logits/chosen": -0.6011806130409241, + "logits/rejected": -0.5972328186035156, + "logps/chosen": -54.63005447387695, + "logps/rejected": -73.596435546875, + "loss": 0.9743, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1572468280792236, + "rewards/margins": -0.9200634956359863, + "rewards/rejected": 3.07731032371521, + "step": 1131 + }, + { + "epoch": 0.18, + "learning_rate": 9.90004232194327e-06, + "logits/chosen": -0.9337689280509949, + "logits/rejected": -1.0593104362487793, + "logps/chosen": -154.49197387695312, + "logps/rejected": -204.29954528808594, + "loss": 1.0909, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.065011501312256, + "rewards/margins": -2.0597610473632812, + "rewards/rejected": 7.124772548675537, + "step": 1132 + }, + { + "epoch": 0.18, + "learning_rate": 9.899780674228087e-06, + "logits/chosen": -0.24541504681110382, + "logits/rejected": -0.24846476316452026, + "logps/chosen": -3.3577511310577393, + "logps/rejected": -1.295893907546997, + "loss": 1.0189, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1155410036444664, + "rewards/margins": -0.10840099304914474, + "rewards/rejected": 0.22394199669361115, + "step": 1133 + }, + { + "epoch": 0.18, + "learning_rate": 9.899518687984424e-06, + "logits/chosen": -0.616621196269989, + "logits/rejected": -0.616621196269989, + "logps/chosen": -22.07625961303711, + "logps/rejected": -22.07625961303711, + "loss": 0.685, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.29091987013816833, + "rewards/margins": 0.0, + "rewards/rejected": 0.29091987013816833, + "step": 1134 + }, + { + "epoch": 0.18, + "learning_rate": 9.899256363230379e-06, + "logits/chosen": -0.42704612016677856, + "logits/rejected": -0.3053523302078247, + "logps/chosen": -77.90324401855469, + "logps/rejected": -1.6253105401992798, + "loss": 0.8774, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8009727597236633, + "rewards/margins": 0.5756422281265259, + "rewards/rejected": 0.22533054649829865, + "step": 1135 + }, + { + "epoch": 0.18, + "learning_rate": 9.898993699984076e-06, + "logits/chosen": -0.683828592300415, + "logits/rejected": -0.7572482228279114, + "logps/chosen": -141.22171020507812, + "logps/rejected": -105.0111083984375, + "loss": 0.7958, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5735855102539062, + "rewards/margins": -1.127211093902588, + "rewards/rejected": 2.700796604156494, + "step": 1136 + }, + { + "epoch": 0.18, + "learning_rate": 9.898730698263663e-06, + "logits/chosen": -1.010170578956604, + "logits/rejected": -0.9314213395118713, + "logps/chosen": -47.03527069091797, + "logps/rejected": -20.025043487548828, + "loss": 0.495, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4599099159240723, + "rewards/margins": 1.0623303651809692, + "rewards/rejected": 1.397579550743103, + "step": 1137 + }, + { + "epoch": 0.18, + "learning_rate": 9.89846735808731e-06, + "logits/chosen": -1.1230298280715942, + "logits/rejected": -1.0923857688903809, + "logps/chosen": -101.75625610351562, + "logps/rejected": -102.96831512451172, + "loss": 2.6653, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0357329845428467, + "rewards/margins": 0.317380428314209, + "rewards/rejected": 1.7183525562286377, + "step": 1138 + }, + { + "epoch": 0.18, + "learning_rate": 9.898203679473214e-06, + "logits/chosen": -0.8361178636550903, + "logits/rejected": -0.7541330456733704, + "logps/chosen": -66.10526275634766, + "logps/rejected": -42.191017150878906, + "loss": 0.2234, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.09198522567749, + "rewards/margins": 2.015634059906006, + "rewards/rejected": 2.0763511657714844, + "step": 1139 + }, + { + "epoch": 0.19, + "learning_rate": 9.897939662439591e-06, + "logits/chosen": -0.852912425994873, + "logits/rejected": -0.8227827548980713, + "logps/chosen": -80.72183227539062, + "logps/rejected": -121.69053649902344, + "loss": 1.6264, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8766647577285767, + "rewards/margins": -2.562696933746338, + "rewards/rejected": 3.439361572265625, + "step": 1140 + }, + { + "epoch": 0.19, + "learning_rate": 9.897675307004681e-06, + "logits/chosen": -0.5510352253913879, + "logits/rejected": -0.5976144671440125, + "logps/chosen": -53.283382415771484, + "logps/rejected": -114.92469787597656, + "loss": 0.6364, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.472142457962036, + "rewards/margins": -0.7058088779449463, + "rewards/rejected": 4.177951335906982, + "step": 1141 + }, + { + "epoch": 0.19, + "learning_rate": 9.89741061318675e-06, + "logits/chosen": -0.9117801785469055, + "logits/rejected": -0.8262136578559875, + "logps/chosen": -62.10374069213867, + "logps/rejected": -32.0374641418457, + "loss": 0.4218, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8051151037216187, + "rewards/margins": -0.2713102102279663, + "rewards/rejected": 2.076425313949585, + "step": 1142 + }, + { + "epoch": 0.19, + "learning_rate": 9.897145581004085e-06, + "logits/chosen": -0.7415232062339783, + "logits/rejected": -0.69853675365448, + "logps/chosen": -99.4371566772461, + "logps/rejected": -78.16668701171875, + "loss": 0.7232, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6464264392852783, + "rewards/margins": 0.13047492504119873, + "rewards/rejected": 1.5159515142440796, + "step": 1143 + }, + { + "epoch": 0.19, + "learning_rate": 9.896880210474998e-06, + "logits/chosen": -0.4318236708641052, + "logits/rejected": -0.26221057772636414, + "logps/chosen": -59.97303771972656, + "logps/rejected": -20.15802764892578, + "loss": 0.1309, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.242466688156128, + "rewards/margins": 1.625885009765625, + "rewards/rejected": 0.6165817379951477, + "step": 1144 + }, + { + "epoch": 0.19, + "learning_rate": 9.896614501617824e-06, + "logits/chosen": -1.2131367921829224, + "logits/rejected": -1.0715843439102173, + "logps/chosen": -98.17704772949219, + "logps/rejected": -98.51760864257812, + "loss": 0.6165, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.336845397949219, + "rewards/margins": 3.110037326812744, + "rewards/rejected": 4.226808071136475, + "step": 1145 + }, + { + "epoch": 0.19, + "learning_rate": 9.896348454450918e-06, + "logits/chosen": -0.7465471625328064, + "logits/rejected": -0.6240372061729431, + "logps/chosen": -279.0732116699219, + "logps/rejected": -35.55616760253906, + "loss": 0.111, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.433746337890625, + "rewards/margins": 2.8978004455566406, + "rewards/rejected": 1.5359458923339844, + "step": 1146 + }, + { + "epoch": 0.19, + "learning_rate": 9.896082068992666e-06, + "logits/chosen": -0.9799686670303345, + "logits/rejected": -1.0744857788085938, + "logps/chosen": -201.51742553710938, + "logps/rejected": -183.2526092529297, + "loss": 1.414, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.754006862640381, + "rewards/margins": -2.744093418121338, + "rewards/rejected": 9.498100280761719, + "step": 1147 + }, + { + "epoch": 0.19, + "learning_rate": 9.895815345261468e-06, + "logits/chosen": -0.5380980372428894, + "logits/rejected": -0.4001677930355072, + "logps/chosen": -77.5622329711914, + "logps/rejected": -22.375843048095703, + "loss": 0.0439, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.744094371795654, + "rewards/margins": 4.5128045082092285, + "rewards/rejected": 1.2312897443771362, + "step": 1148 + }, + { + "epoch": 0.19, + "learning_rate": 9.895548283275756e-06, + "logits/chosen": -0.5738152265548706, + "logits/rejected": -0.5600231289863586, + "logps/chosen": -102.25904083251953, + "logps/rejected": -85.16443634033203, + "loss": 0.8527, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7969993948936462, + "rewards/margins": -1.199479579925537, + "rewards/rejected": 1.9964790344238281, + "step": 1149 + }, + { + "epoch": 0.19, + "learning_rate": 9.895280883053977e-06, + "logits/chosen": -0.9957134127616882, + "logits/rejected": -0.9945573210716248, + "logps/chosen": -48.13567352294922, + "logps/rejected": -75.95691680908203, + "loss": 0.9331, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4051506519317627, + "rewards/margins": -0.5180953741073608, + "rewards/rejected": 1.9232460260391235, + "step": 1150 + }, + { + "epoch": 0.19, + "learning_rate": 9.895013144614611e-06, + "logits/chosen": -0.9818049073219299, + "logits/rejected": -1.1932685375213623, + "logps/chosen": -154.280029296875, + "logps/rejected": -93.57095336914062, + "loss": 0.2907, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.322183132171631, + "rewards/margins": 0.41429877281188965, + "rewards/rejected": 3.907884359359741, + "step": 1151 + }, + { + "epoch": 0.19, + "learning_rate": 9.894745067976154e-06, + "logits/chosen": -0.9339762926101685, + "logits/rejected": -0.9870686531066895, + "logps/chosen": -94.12934875488281, + "logps/rejected": -51.5668830871582, + "loss": 0.4514, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.593656301498413, + "rewards/margins": -0.18590641021728516, + "rewards/rejected": 2.7795627117156982, + "step": 1152 + }, + { + "epoch": 0.19, + "learning_rate": 9.894476653157125e-06, + "logits/chosen": -1.2186548709869385, + "logits/rejected": -1.232843279838562, + "logps/chosen": -59.78823471069336, + "logps/rejected": -165.0178985595703, + "loss": 0.5694, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1526172161102295, + "rewards/margins": -0.7390453815460205, + "rewards/rejected": 3.89166259765625, + "step": 1153 + }, + { + "epoch": 0.19, + "learning_rate": 9.894207900176074e-06, + "logits/chosen": -0.8615090250968933, + "logits/rejected": -0.8335702419281006, + "logps/chosen": -100.08305358886719, + "logps/rejected": -128.29629516601562, + "loss": 1.0084, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.8183059692382812, + "rewards/margins": -0.43529510498046875, + "rewards/rejected": 4.25360107421875, + "step": 1154 + }, + { + "epoch": 0.19, + "learning_rate": 9.893938809051564e-06, + "logits/chosen": -0.9336379170417786, + "logits/rejected": -0.8689197897911072, + "logps/chosen": -170.02078247070312, + "logps/rejected": -77.63871002197266, + "loss": 0.1042, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.690312385559082, + "rewards/margins": 1.5512566566467285, + "rewards/rejected": 5.1390557289123535, + "step": 1155 + }, + { + "epoch": 0.19, + "learning_rate": 9.89366937980219e-06, + "logits/chosen": -0.7324743270874023, + "logits/rejected": -0.7324743270874023, + "logps/chosen": -86.22244262695312, + "logps/rejected": -86.22244262695312, + "loss": 0.3667, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3238372802734375, + "rewards/margins": 0.0, + "rewards/rejected": 2.3238372802734375, + "step": 1156 + }, + { + "epoch": 0.19, + "learning_rate": 9.893399612446568e-06, + "logits/chosen": -0.40311118960380554, + "logits/rejected": -0.40726831555366516, + "logps/chosen": -11.936952590942383, + "logps/rejected": -1.1331257820129395, + "loss": 0.585, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.02886371687054634, + "rewards/margins": -0.33058008551597595, + "rewards/rejected": 0.3594438135623932, + "step": 1157 + }, + { + "epoch": 0.19, + "learning_rate": 9.893129507003334e-06, + "logits/chosen": -0.7491864562034607, + "logits/rejected": -0.7616144418716431, + "logps/chosen": -32.492977142333984, + "logps/rejected": -33.38895797729492, + "loss": 0.5183, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4317517280578613, + "rewards/margins": -0.31711912155151367, + "rewards/rejected": 2.748870849609375, + "step": 1158 + }, + { + "epoch": 0.19, + "learning_rate": 9.89285906349115e-06, + "logits/chosen": -0.8518397212028503, + "logits/rejected": -0.8384445309638977, + "logps/chosen": -20.95557403564453, + "logps/rejected": -35.746028900146484, + "loss": 2.5558, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3152512311935425, + "rewards/margins": -2.269524574279785, + "rewards/rejected": 3.584775924682617, + "step": 1159 + }, + { + "epoch": 0.19, + "learning_rate": 9.892588281928699e-06, + "logits/chosen": -0.8722864389419556, + "logits/rejected": -0.8456690311431885, + "logps/chosen": -69.64974212646484, + "logps/rejected": -114.04197692871094, + "loss": 0.6387, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4048652648925781, + "rewards/margins": 0.23115158081054688, + "rewards/rejected": 1.1737136840820312, + "step": 1160 + }, + { + "epoch": 0.19, + "learning_rate": 9.892317162334694e-06, + "logits/chosen": -1.511086106300354, + "logits/rejected": -1.4533109664916992, + "logps/chosen": -156.16937255859375, + "logps/rejected": -39.51142120361328, + "loss": 0.0392, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8671875, + "rewards/margins": 2.596174716949463, + "rewards/rejected": 0.2710128724575043, + "step": 1161 + }, + { + "epoch": 0.19, + "learning_rate": 9.892045704727864e-06, + "logits/chosen": -0.9581989049911499, + "logits/rejected": -0.8117986917495728, + "logps/chosen": -122.07774353027344, + "logps/rejected": -99.18592834472656, + "loss": 0.0615, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.786537170410156, + "rewards/margins": 2.7050514221191406, + "rewards/rejected": 2.0814857482910156, + "step": 1162 + }, + { + "epoch": 0.19, + "learning_rate": 9.891773909126964e-06, + "logits/chosen": -0.9359015226364136, + "logits/rejected": -0.9461162090301514, + "logps/chosen": -160.39920043945312, + "logps/rejected": -96.68782806396484, + "loss": 0.1749, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9126083850860596, + "rewards/margins": 1.1530182361602783, + "rewards/rejected": 2.7595901489257812, + "step": 1163 + }, + { + "epoch": 0.19, + "learning_rate": 9.891501775550776e-06, + "logits/chosen": -0.9874159693717957, + "logits/rejected": -0.8516509532928467, + "logps/chosen": -65.64570617675781, + "logps/rejected": -11.099405288696289, + "loss": 0.5099, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0554490089416504, + "rewards/margins": 1.5841734409332275, + "rewards/rejected": 0.47127553820610046, + "step": 1164 + }, + { + "epoch": 0.19, + "learning_rate": 9.891229304018098e-06, + "logits/chosen": -0.4010251760482788, + "logits/rejected": -0.39819619059562683, + "logps/chosen": -2.2681150436401367, + "logps/rejected": -4.280683517456055, + "loss": 0.718, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19190894067287445, + "rewards/margins": 0.26418033242225647, + "rewards/rejected": -0.07227139919996262, + "step": 1165 + }, + { + "epoch": 0.19, + "learning_rate": 9.890956494547756e-06, + "logits/chosen": -0.6286904811859131, + "logits/rejected": -0.6364920139312744, + "logps/chosen": -49.091575622558594, + "logps/rejected": -57.75806427001953, + "loss": 0.1341, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6962227821350098, + "rewards/margins": 1.453547716140747, + "rewards/rejected": 2.2426750659942627, + "step": 1166 + }, + { + "epoch": 0.19, + "learning_rate": 9.890683347158598e-06, + "logits/chosen": -1.0106639862060547, + "logits/rejected": -1.0265549421310425, + "logps/chosen": -170.70889282226562, + "logps/rejected": -118.25071716308594, + "loss": 0.7121, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.206552028656006, + "rewards/margins": -1.1368122100830078, + "rewards/rejected": 5.343364238739014, + "step": 1167 + }, + { + "epoch": 0.19, + "learning_rate": 9.890409861869497e-06, + "logits/chosen": -0.5352805852890015, + "logits/rejected": -0.5379823446273804, + "logps/chosen": -22.78692626953125, + "logps/rejected": -57.43284606933594, + "loss": 0.5796, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6695846915245056, + "rewards/margins": 0.521274983882904, + "rewards/rejected": 0.14830970764160156, + "step": 1168 + }, + { + "epoch": 0.19, + "learning_rate": 9.89013603869935e-06, + "logits/chosen": -0.7161974310874939, + "logits/rejected": -0.6335353851318359, + "logps/chosen": -57.654754638671875, + "logps/rejected": -60.606075286865234, + "loss": 2.4377, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.373799204826355, + "rewards/margins": -1.7290698289871216, + "rewards/rejected": 3.1028690338134766, + "step": 1169 + }, + { + "epoch": 0.19, + "learning_rate": 9.889861877667071e-06, + "logits/chosen": -0.5324914455413818, + "logits/rejected": -0.5324914455413818, + "logps/chosen": -49.440513610839844, + "logps/rejected": -49.440513610839844, + "loss": 1.5117, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.339557647705078, + "rewards/margins": 0.0, + "rewards/rejected": 2.339557647705078, + "step": 1170 + }, + { + "epoch": 0.19, + "learning_rate": 9.889587378791605e-06, + "logits/chosen": -0.6286941766738892, + "logits/rejected": -0.6455866694450378, + "logps/chosen": -52.277099609375, + "logps/rejected": -98.7306900024414, + "loss": 0.3561, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6995407342910767, + "rewards/margins": 0.5349289178848267, + "rewards/rejected": 1.16461181640625, + "step": 1171 + }, + { + "epoch": 0.19, + "learning_rate": 9.889312542091918e-06, + "logits/chosen": -0.7946077585220337, + "logits/rejected": -0.9319955706596375, + "logps/chosen": -59.8896369934082, + "logps/rejected": -141.3635711669922, + "loss": 2.3267, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.236900806427002, + "rewards/margins": -4.63688325881958, + "rewards/rejected": 7.873784065246582, + "step": 1172 + }, + { + "epoch": 0.19, + "learning_rate": 9.889037367586997e-06, + "logits/chosen": -0.9365679621696472, + "logits/rejected": -0.9365679621696472, + "logps/chosen": -64.15699005126953, + "logps/rejected": -64.15699005126953, + "loss": 0.5641, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0820014476776123, + "rewards/margins": 0.0, + "rewards/rejected": 2.0820014476776123, + "step": 1173 + }, + { + "epoch": 0.19, + "learning_rate": 9.888761855295855e-06, + "logits/chosen": -0.7124614119529724, + "logits/rejected": -0.6337899565696716, + "logps/chosen": -86.79524230957031, + "logps/rejected": -78.86116027832031, + "loss": 1.8078, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1720657348632812, + "rewards/margins": -0.7230279445648193, + "rewards/rejected": 2.8950936794281006, + "step": 1174 + }, + { + "epoch": 0.19, + "learning_rate": 9.888486005237525e-06, + "logits/chosen": -0.4446828365325928, + "logits/rejected": -0.4446828365325928, + "logps/chosen": -2.847226142883301, + "logps/rejected": -2.847226142883301, + "loss": 0.9113, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.14115391671657562, + "rewards/margins": 0.0, + "rewards/rejected": 0.14115391671657562, + "step": 1175 + }, + { + "epoch": 0.19, + "learning_rate": 9.88820981743107e-06, + "logits/chosen": -0.5282016396522522, + "logits/rejected": -0.48568233847618103, + "logps/chosen": -80.66609954833984, + "logps/rejected": -103.91975402832031, + "loss": 0.4388, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4848830699920654, + "rewards/margins": -0.32288217544555664, + "rewards/rejected": 2.807765245437622, + "step": 1176 + }, + { + "epoch": 0.19, + "learning_rate": 9.887933291895566e-06, + "logits/chosen": -1.0882419347763062, + "logits/rejected": -0.8442671895027161, + "logps/chosen": -324.7669982910156, + "logps/rejected": -165.1080322265625, + "loss": 0.9641, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8525971174240112, + "rewards/margins": -0.9981368780136108, + "rewards/rejected": 2.850733995437622, + "step": 1177 + }, + { + "epoch": 0.19, + "learning_rate": 9.887656428650123e-06, + "logits/chosen": -1.345183253288269, + "logits/rejected": -1.2878391742706299, + "logps/chosen": -109.11028289794922, + "logps/rejected": -39.845802307128906, + "loss": 0.7268, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.643933892250061, + "rewards/margins": 1.3821792602539062, + "rewards/rejected": 0.2617546021938324, + "step": 1178 + }, + { + "epoch": 0.19, + "learning_rate": 9.88737922771387e-06, + "logits/chosen": -0.9052780866622925, + "logits/rejected": -0.8946108818054199, + "logps/chosen": -10.684690475463867, + "logps/rejected": -3.634594440460205, + "loss": 0.3776, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0038942338433116674, + "rewards/margins": -0.1029539629817009, + "rewards/rejected": 0.10684819519519806, + "step": 1179 + }, + { + "epoch": 0.19, + "learning_rate": 9.887101689105956e-06, + "logits/chosen": -0.8734294772148132, + "logits/rejected": -0.7732897400856018, + "logps/chosen": -54.27106475830078, + "logps/rejected": -53.250404357910156, + "loss": 0.4346, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2293602228164673, + "rewards/margins": -0.29398417472839355, + "rewards/rejected": 1.5233443975448608, + "step": 1180 + }, + { + "epoch": 0.19, + "learning_rate": 9.886823812845557e-06, + "logits/chosen": -0.9758632183074951, + "logits/rejected": -0.9900442957878113, + "logps/chosen": -65.06620788574219, + "logps/rejected": -80.42338562011719, + "loss": 0.7571, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5225493907928467, + "rewards/margins": -0.37299513816833496, + "rewards/rejected": 2.8955445289611816, + "step": 1181 + }, + { + "epoch": 0.19, + "learning_rate": 9.886545598951872e-06, + "logits/chosen": -0.7002377510070801, + "logits/rejected": -0.6381484866142273, + "logps/chosen": -78.49942779541016, + "logps/rejected": -42.889225006103516, + "loss": 0.519, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2789726257324219, + "rewards/margins": -0.010780692100524902, + "rewards/rejected": 1.2897533178329468, + "step": 1182 + }, + { + "epoch": 0.19, + "learning_rate": 9.886267047444123e-06, + "logits/chosen": -0.5485878586769104, + "logits/rejected": -1.0646227598190308, + "logps/chosen": -65.95735931396484, + "logps/rejected": -34.684791564941406, + "loss": 0.1238, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.394228458404541, + "rewards/margins": 2.181960344314575, + "rewards/rejected": 0.21226806938648224, + "step": 1183 + }, + { + "epoch": 0.19, + "learning_rate": 9.885988158341555e-06, + "logits/chosen": -0.8818962574005127, + "logits/rejected": -0.7851120829582214, + "logps/chosen": -72.83460998535156, + "logps/rejected": -72.47351837158203, + "loss": 0.182, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.778523325920105, + "rewards/margins": 0.8329994678497314, + "rewards/rejected": 0.9455238580703735, + "step": 1184 + }, + { + "epoch": 0.19, + "learning_rate": 9.885708931663437e-06, + "logits/chosen": -0.4437292516231537, + "logits/rejected": -0.4269849359989166, + "logps/chosen": -23.377735137939453, + "logps/rejected": -1.637933611869812, + "loss": 1.021, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.27034664154052734, + "rewards/margins": -0.10655379295349121, + "rewards/rejected": 0.37690043449401855, + "step": 1185 + }, + { + "epoch": 0.19, + "learning_rate": 9.885429367429062e-06, + "logits/chosen": -0.7385357022285461, + "logits/rejected": -0.6786890029907227, + "logps/chosen": -76.64797973632812, + "logps/rejected": -75.79022979736328, + "loss": 0.7175, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3225128650665283, + "rewards/margins": -1.0439467430114746, + "rewards/rejected": 3.366459608078003, + "step": 1186 + }, + { + "epoch": 0.19, + "learning_rate": 9.885149465657744e-06, + "logits/chosen": -0.8094119429588318, + "logits/rejected": -0.431052029132843, + "logps/chosen": -153.80093383789062, + "logps/rejected": -52.380760192871094, + "loss": 0.7568, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.560751438140869, + "rewards/margins": 4.114109039306641, + "rewards/rejected": 1.446642279624939, + "step": 1187 + }, + { + "epoch": 0.19, + "learning_rate": 9.884869226368821e-06, + "logits/chosen": -0.5547130107879639, + "logits/rejected": -0.5547130107879639, + "logps/chosen": -43.098243713378906, + "logps/rejected": -43.098243713378906, + "loss": 1.0779, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2597640752792358, + "rewards/margins": 0.0, + "rewards/rejected": 1.2597640752792358, + "step": 1188 + }, + { + "epoch": 0.19, + "learning_rate": 9.884588649581655e-06, + "logits/chosen": -0.5860466957092285, + "logits/rejected": -0.5490294694900513, + "logps/chosen": -88.0090103149414, + "logps/rejected": -86.07533264160156, + "loss": 2.3089, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2994072139263153, + "rewards/margins": -0.8534507751464844, + "rewards/rejected": 1.152858018875122, + "step": 1189 + }, + { + "epoch": 0.19, + "learning_rate": 9.884307735315633e-06, + "logits/chosen": -0.7901561856269836, + "logits/rejected": -0.7901561856269836, + "logps/chosen": -70.1642837524414, + "logps/rejected": -70.1642837524414, + "loss": 0.4506, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.698413133621216, + "rewards/margins": 0.0, + "rewards/rejected": 2.698413133621216, + "step": 1190 + }, + { + "epoch": 0.19, + "learning_rate": 9.88402648359016e-06, + "logits/chosen": -0.9532039165496826, + "logits/rejected": -0.9425260424613953, + "logps/chosen": -80.40013885498047, + "logps/rejected": -106.92823028564453, + "loss": 1.2806, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9804069995880127, + "rewards/margins": -2.432220697402954, + "rewards/rejected": 4.412627696990967, + "step": 1191 + }, + { + "epoch": 0.19, + "learning_rate": 9.883744894424671e-06, + "logits/chosen": -0.6739646792411804, + "logits/rejected": -0.6536793112754822, + "logps/chosen": -41.07710647583008, + "logps/rejected": -83.7674560546875, + "loss": 3.3598, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1312496662139893, + "rewards/margins": 0.182023286819458, + "rewards/rejected": 2.9492263793945312, + "step": 1192 + }, + { + "epoch": 0.19, + "learning_rate": 9.883462967838621e-06, + "logits/chosen": -0.7436785697937012, + "logits/rejected": -0.5440390706062317, + "logps/chosen": -115.33311462402344, + "logps/rejected": -67.31306457519531, + "loss": 0.3659, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.960429668426514, + "rewards/margins": 3.863440990447998, + "rewards/rejected": 1.0969886779785156, + "step": 1193 + }, + { + "epoch": 0.19, + "learning_rate": 9.883180703851488e-06, + "logits/chosen": -0.7283904552459717, + "logits/rejected": -0.8810258507728577, + "logps/chosen": -104.61845397949219, + "logps/rejected": -129.10643005371094, + "loss": 2.2647, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8806953430175781, + "rewards/margins": -2.5329933166503906, + "rewards/rejected": 4.413688659667969, + "step": 1194 + }, + { + "epoch": 0.19, + "learning_rate": 9.882898102482773e-06, + "logits/chosen": -0.9547662734985352, + "logits/rejected": -0.8860839605331421, + "logps/chosen": -111.21179962158203, + "logps/rejected": -156.50674438476562, + "loss": 0.1735, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.6228766441345215, + "rewards/margins": 1.163858413696289, + "rewards/rejected": 4.459018230438232, + "step": 1195 + }, + { + "epoch": 0.19, + "learning_rate": 9.882615163752001e-06, + "logits/chosen": -0.9662734866142273, + "logits/rejected": -1.077893853187561, + "logps/chosen": -62.71630096435547, + "logps/rejected": -151.49288940429688, + "loss": 3.1927, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0504570007324219, + "rewards/margins": -5.746274471282959, + "rewards/rejected": 6.796731472015381, + "step": 1196 + }, + { + "epoch": 0.19, + "learning_rate": 9.882331887678723e-06, + "logits/chosen": -0.7747915387153625, + "logits/rejected": -0.7275688052177429, + "logps/chosen": -35.21951675415039, + "logps/rejected": -39.354408264160156, + "loss": 0.5989, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5227928161621094, + "rewards/margins": 0.011212944984436035, + "rewards/rejected": 1.5115798711776733, + "step": 1197 + }, + { + "epoch": 0.19, + "learning_rate": 9.882048274282505e-06, + "logits/chosen": -0.5939089059829712, + "logits/rejected": -0.5939089059829712, + "logps/chosen": -49.720909118652344, + "logps/rejected": -49.720909118652344, + "loss": 0.5239, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.360788106918335, + "rewards/margins": 0.0, + "rewards/rejected": 2.360788106918335, + "step": 1198 + }, + { + "epoch": 0.19, + "learning_rate": 9.881764323582948e-06, + "logits/chosen": -0.7780541777610779, + "logits/rejected": -0.6869981288909912, + "logps/chosen": -61.709774017333984, + "logps/rejected": -48.353004455566406, + "loss": 0.9732, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3747142553329468, + "rewards/margins": -1.2564908266067505, + "rewards/rejected": 2.6312050819396973, + "step": 1199 + }, + { + "epoch": 0.19, + "learning_rate": 9.881480035599667e-06, + "logits/chosen": -0.4010534882545471, + "logits/rejected": -0.4021470844745636, + "logps/chosen": -15.002806663513184, + "logps/rejected": -9.200335502624512, + "loss": 0.3815, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38052722811698914, + "rewards/margins": 0.05896589159965515, + "rewards/rejected": 0.321561336517334, + "step": 1200 + }, + { + "epoch": 0.19, + "learning_rate": 9.881195410352305e-06, + "logits/chosen": -0.7721803188323975, + "logits/rejected": -0.8062270879745483, + "logps/chosen": -59.24452590942383, + "logps/rejected": -84.05811309814453, + "loss": 0.7556, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5261151790618896, + "rewards/margins": 0.11092424392700195, + "rewards/rejected": 2.4151909351348877, + "step": 1201 + }, + { + "epoch": 0.2, + "learning_rate": 9.880910447860527e-06, + "logits/chosen": -0.966469943523407, + "logits/rejected": -0.9092693328857422, + "logps/chosen": -76.70645141601562, + "logps/rejected": -26.945453643798828, + "loss": 2.4205, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.511474609375, + "rewards/margins": 0.4618651866912842, + "rewards/rejected": 1.0496094226837158, + "step": 1202 + }, + { + "epoch": 0.2, + "learning_rate": 9.880625148144021e-06, + "logits/chosen": -0.7167216539382935, + "logits/rejected": -0.678493082523346, + "logps/chosen": -68.13912963867188, + "logps/rejected": -65.72671508789062, + "loss": 0.4861, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1376664638519287, + "rewards/margins": -0.37644481658935547, + "rewards/rejected": 2.514111280441284, + "step": 1203 + }, + { + "epoch": 0.2, + "learning_rate": 9.880339511222496e-06, + "logits/chosen": -0.44880110025405884, + "logits/rejected": -0.38682910799980164, + "logps/chosen": -69.07374572753906, + "logps/rejected": -68.60840606689453, + "loss": 1.225, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4440948963165283, + "rewards/margins": -1.062950849533081, + "rewards/rejected": 2.5070457458496094, + "step": 1204 + }, + { + "epoch": 0.2, + "learning_rate": 9.88005353711569e-06, + "logits/chosen": -0.6001231670379639, + "logits/rejected": -0.6333956122398376, + "logps/chosen": -60.692535400390625, + "logps/rejected": -53.983009338378906, + "loss": 0.9997, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6121957302093506, + "rewards/margins": 1.2165391445159912, + "rewards/rejected": 1.3956565856933594, + "step": 1205 + }, + { + "epoch": 0.2, + "learning_rate": 9.879767225843363e-06, + "logits/chosen": -0.6043420433998108, + "logits/rejected": -0.6043420433998108, + "logps/chosen": -4.563880920410156, + "logps/rejected": -4.563880920410156, + "loss": 0.3719, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7976345419883728, + "rewards/margins": 0.0, + "rewards/rejected": 0.7976345419883728, + "step": 1206 + }, + { + "epoch": 0.2, + "learning_rate": 9.879480577425289e-06, + "logits/chosen": -0.8232935667037964, + "logits/rejected": -0.7589981555938721, + "logps/chosen": -65.35246276855469, + "logps/rejected": -62.953189849853516, + "loss": 0.5687, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.035964250564575, + "rewards/margins": -0.17875933647155762, + "rewards/rejected": 2.214723587036133, + "step": 1207 + }, + { + "epoch": 0.2, + "learning_rate": 9.87919359188128e-06, + "logits/chosen": -1.0158454179763794, + "logits/rejected": -0.9760311245918274, + "logps/chosen": -115.54403686523438, + "logps/rejected": -117.07015991210938, + "loss": 0.0317, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.8726959228515625, + "rewards/margins": 4.466097831726074, + "rewards/rejected": 3.406597852706909, + "step": 1208 + }, + { + "epoch": 0.2, + "learning_rate": 9.878906269231158e-06, + "logits/chosen": -0.5681848526000977, + "logits/rejected": -0.5872531533241272, + "logps/chosen": -55.14109420776367, + "logps/rejected": -96.17364501953125, + "loss": 0.4377, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7171787023544312, + "rewards/margins": -0.05891084671020508, + "rewards/rejected": 1.7760895490646362, + "step": 1209 + }, + { + "epoch": 0.2, + "learning_rate": 9.878618609494781e-06, + "logits/chosen": -0.514777660369873, + "logits/rejected": -0.7049818634986877, + "logps/chosen": -87.85296630859375, + "logps/rejected": -115.99275970458984, + "loss": 2.5422, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0801498889923096, + "rewards/margins": -3.779796838760376, + "rewards/rejected": 5.8599467277526855, + "step": 1210 + }, + { + "epoch": 0.2, + "learning_rate": 9.878330612692018e-06, + "logits/chosen": -0.7601553797721863, + "logits/rejected": -0.7570636868476868, + "logps/chosen": -90.8199691772461, + "logps/rejected": -41.779502868652344, + "loss": 0.4647, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3101516962051392, + "rewards/margins": -0.05900454521179199, + "rewards/rejected": 1.3691562414169312, + "step": 1211 + }, + { + "epoch": 0.2, + "learning_rate": 9.87804227884277e-06, + "logits/chosen": -0.37949496507644653, + "logits/rejected": -0.37949496507644653, + "logps/chosen": -86.2257308959961, + "logps/rejected": -86.2257308959961, + "loss": 0.4541, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2606072425842285, + "rewards/margins": 0.0, + "rewards/rejected": 2.2606072425842285, + "step": 1212 + }, + { + "epoch": 0.2, + "learning_rate": 9.877753607966956e-06, + "logits/chosen": -1.015966534614563, + "logits/rejected": -1.2295973300933838, + "logps/chosen": -147.03982543945312, + "logps/rejected": -34.12256622314453, + "loss": 0.1888, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.739283800125122, + "rewards/margins": 2.385540723800659, + "rewards/rejected": 0.3537429869174957, + "step": 1213 + }, + { + "epoch": 0.2, + "learning_rate": 9.877464600084521e-06, + "logits/chosen": -0.659838855266571, + "logits/rejected": -0.7204603552818298, + "logps/chosen": -29.434152603149414, + "logps/rejected": -125.97750091552734, + "loss": 0.615, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4857943058013916, + "rewards/margins": 0.643383264541626, + "rewards/rejected": 0.8424110412597656, + "step": 1214 + }, + { + "epoch": 0.2, + "learning_rate": 9.877175255215436e-06, + "logits/chosen": -0.8069262504577637, + "logits/rejected": -0.8069262504577637, + "logps/chosen": -34.79423522949219, + "logps/rejected": -34.79423522949219, + "loss": 1.7332, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0129952430725098, + "rewards/margins": 0.0, + "rewards/rejected": 2.0129952430725098, + "step": 1215 + }, + { + "epoch": 0.2, + "learning_rate": 9.876885573379686e-06, + "logits/chosen": -1.019483208656311, + "logits/rejected": -0.9155052900314331, + "logps/chosen": -133.3856964111328, + "logps/rejected": -88.45854187011719, + "loss": 0.2431, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.789848327636719, + "rewards/margins": 1.9001128673553467, + "rewards/rejected": 2.889735460281372, + "step": 1216 + }, + { + "epoch": 0.2, + "learning_rate": 9.87659555459729e-06, + "logits/chosen": -0.8852037191390991, + "logits/rejected": -0.8837031722068787, + "logps/chosen": -87.56196594238281, + "logps/rejected": -43.33131408691406, + "loss": 0.5715, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1218010187149048, + "rewards/margins": -0.23517227172851562, + "rewards/rejected": 1.3569732904434204, + "step": 1217 + }, + { + "epoch": 0.2, + "learning_rate": 9.876305198888284e-06, + "logits/chosen": -0.5026662945747375, + "logits/rejected": -0.544261634349823, + "logps/chosen": -80.10385131835938, + "logps/rejected": -120.27680969238281, + "loss": 0.7198, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5455673933029175, + "rewards/margins": -1.05713951587677, + "rewards/rejected": 2.6027069091796875, + "step": 1218 + }, + { + "epoch": 0.2, + "learning_rate": 9.876014506272728e-06, + "logits/chosen": -0.7215732336044312, + "logits/rejected": -0.7215732336044312, + "logps/chosen": -46.130531311035156, + "logps/rejected": -46.130531311035156, + "loss": 0.58, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.216386556625366, + "rewards/margins": 0.0, + "rewards/rejected": 2.216386556625366, + "step": 1219 + }, + { + "epoch": 0.2, + "learning_rate": 9.875723476770706e-06, + "logits/chosen": -1.139782428741455, + "logits/rejected": -0.8814185261726379, + "logps/chosen": -187.5002899169922, + "logps/rejected": -82.40185546875, + "loss": 0.6537, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.256106853485107, + "rewards/margins": 1.807267189025879, + "rewards/rejected": 3.4488396644592285, + "step": 1220 + }, + { + "epoch": 0.2, + "learning_rate": 9.875432110402328e-06, + "logits/chosen": -0.5606957674026489, + "logits/rejected": -0.4269196689128876, + "logps/chosen": -56.90184783935547, + "logps/rejected": -16.422609329223633, + "loss": 0.1063, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9975792169570923, + "rewards/margins": 1.7598036527633667, + "rewards/rejected": 0.23777560889720917, + "step": 1221 + }, + { + "epoch": 0.2, + "learning_rate": 9.875140407187722e-06, + "logits/chosen": -0.9426528811454773, + "logits/rejected": -0.8534015417098999, + "logps/chosen": -38.980186462402344, + "logps/rejected": -22.018470764160156, + "loss": 0.2811, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4662060737609863, + "rewards/margins": 1.8990473747253418, + "rewards/rejected": 0.5671586990356445, + "step": 1222 + }, + { + "epoch": 0.2, + "learning_rate": 9.874848367147045e-06, + "logits/chosen": -0.36789441108703613, + "logits/rejected": -0.36091578006744385, + "logps/chosen": -4.856274604797363, + "logps/rejected": -7.0175981521606445, + "loss": 1.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3447708189487457, + "rewards/margins": 0.06708496809005737, + "rewards/rejected": 0.27768585085868835, + "step": 1223 + }, + { + "epoch": 0.2, + "learning_rate": 9.87455599030047e-06, + "logits/chosen": -0.7901510000228882, + "logits/rejected": -0.6444976329803467, + "logps/chosen": -62.31843185424805, + "logps/rejected": -19.330751419067383, + "loss": 0.2153, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5513668060302734, + "rewards/margins": 1.3457971811294556, + "rewards/rejected": 0.20556965470314026, + "step": 1224 + }, + { + "epoch": 0.2, + "learning_rate": 9.8742632766682e-06, + "logits/chosen": -1.3977707624435425, + "logits/rejected": -1.1211566925048828, + "logps/chosen": -97.32311248779297, + "logps/rejected": -104.45130920410156, + "loss": 0.8921, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.186829566955566, + "rewards/margins": 2.207390785217285, + "rewards/rejected": 3.9794387817382812, + "step": 1225 + }, + { + "epoch": 0.2, + "learning_rate": 9.873970226270458e-06, + "logits/chosen": -0.7627900838851929, + "logits/rejected": -0.8759392499923706, + "logps/chosen": -73.90705871582031, + "logps/rejected": -129.03070068359375, + "loss": 1.9139, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6147658824920654, + "rewards/margins": -3.681624174118042, + "rewards/rejected": 7.296390056610107, + "step": 1226 + }, + { + "epoch": 0.2, + "learning_rate": 9.87367683912749e-06, + "logits/chosen": -0.5159445405006409, + "logits/rejected": -0.4699074327945709, + "logps/chosen": -37.11903762817383, + "logps/rejected": -39.777305603027344, + "loss": 0.441, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0582512617111206, + "rewards/margins": 0.1389378309249878, + "rewards/rejected": 0.9193134307861328, + "step": 1227 + }, + { + "epoch": 0.2, + "learning_rate": 9.873383115259569e-06, + "logits/chosen": -0.7995566725730896, + "logits/rejected": -1.0946532487869263, + "logps/chosen": -69.26022338867188, + "logps/rejected": -47.490760803222656, + "loss": 0.2192, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.482311964035034, + "rewards/margins": 1.8604460954666138, + "rewards/rejected": 0.6218658685684204, + "step": 1228 + }, + { + "epoch": 0.2, + "learning_rate": 9.873089054686988e-06, + "logits/chosen": -0.6205146908760071, + "logits/rejected": -0.6197266578674316, + "logps/chosen": -73.9964599609375, + "logps/rejected": -57.150753021240234, + "loss": 0.3788, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.387762427330017, + "rewards/margins": -0.03936958312988281, + "rewards/rejected": 1.4271320104599, + "step": 1229 + }, + { + "epoch": 0.2, + "learning_rate": 9.872794657430063e-06, + "logits/chosen": -0.8312016725540161, + "logits/rejected": -0.8729535341262817, + "logps/chosen": -86.84874725341797, + "logps/rejected": -79.72615814208984, + "loss": 0.8835, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.022803544998169, + "rewards/margins": -1.5277817249298096, + "rewards/rejected": 2.5505852699279785, + "step": 1230 + }, + { + "epoch": 0.2, + "learning_rate": 9.872499923509133e-06, + "logits/chosen": -0.8156598210334778, + "logits/rejected": -1.1673647165298462, + "logps/chosen": -99.1326904296875, + "logps/rejected": -34.56013107299805, + "loss": 0.3841, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.275862216949463, + "rewards/margins": 2.00345778465271, + "rewards/rejected": 0.2724044919013977, + "step": 1231 + }, + { + "epoch": 0.2, + "learning_rate": 9.87220485294456e-06, + "logits/chosen": -1.014056921005249, + "logits/rejected": -0.8616622090339661, + "logps/chosen": -100.88105773925781, + "logps/rejected": -70.87781524658203, + "loss": 0.4383, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.8651323318481445, + "rewards/margins": 3.306813955307007, + "rewards/rejected": 1.5583183765411377, + "step": 1232 + }, + { + "epoch": 0.2, + "learning_rate": 9.871909445756737e-06, + "logits/chosen": -0.7144022583961487, + "logits/rejected": -0.7144022583961487, + "logps/chosen": -63.49884033203125, + "logps/rejected": -63.49884033203125, + "loss": 1.2869, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.148341417312622, + "rewards/margins": 0.0, + "rewards/rejected": 2.148341417312622, + "step": 1233 + }, + { + "epoch": 0.2, + "learning_rate": 9.871613701966067e-06, + "logits/chosen": -0.872255802154541, + "logits/rejected": -0.8564691543579102, + "logps/chosen": -78.80484008789062, + "logps/rejected": -88.69190979003906, + "loss": 1.2174, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.080673933029175, + "rewards/margins": 0.5119597911834717, + "rewards/rejected": 2.568714141845703, + "step": 1234 + }, + { + "epoch": 0.2, + "learning_rate": 9.871317621592987e-06, + "logits/chosen": -1.4161995649337769, + "logits/rejected": -1.44484281539917, + "logps/chosen": -62.54659652709961, + "logps/rejected": -52.60205841064453, + "loss": 0.3684, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3555850982666016, + "rewards/margins": -0.05942964553833008, + "rewards/rejected": 2.4150147438049316, + "step": 1235 + }, + { + "epoch": 0.2, + "learning_rate": 9.871021204657953e-06, + "logits/chosen": -0.8375892043113708, + "logits/rejected": -0.8228133320808411, + "logps/chosen": -44.29710388183594, + "logps/rejected": -71.34141540527344, + "loss": 1.0473, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6666526794433594, + "rewards/margins": -1.1187461614608765, + "rewards/rejected": 1.7853988409042358, + "step": 1236 + }, + { + "epoch": 0.2, + "learning_rate": 9.870724451181443e-06, + "logits/chosen": -0.7723588347434998, + "logits/rejected": -0.793968915939331, + "logps/chosen": -58.392295837402344, + "logps/rejected": -57.94954299926758, + "loss": 1.6309, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5322959423065186, + "rewards/margins": 0.7806147336959839, + "rewards/rejected": 1.7516812086105347, + "step": 1237 + }, + { + "epoch": 0.2, + "learning_rate": 9.87042736118396e-06, + "logits/chosen": -1.1274598836898804, + "logits/rejected": -1.1645030975341797, + "logps/chosen": -90.8077163696289, + "logps/rejected": -118.73272705078125, + "loss": 2.5217, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3211700916290283, + "rewards/margins": -3.264282464981079, + "rewards/rejected": 6.585452556610107, + "step": 1238 + }, + { + "epoch": 0.2, + "learning_rate": 9.87012993468603e-06, + "logits/chosen": -1.2329639196395874, + "logits/rejected": -0.8376784920692444, + "logps/chosen": -166.2804412841797, + "logps/rejected": -34.835472106933594, + "loss": 0.0984, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.237977504730225, + "rewards/margins": 5.865898132324219, + "rewards/rejected": 0.372079461812973, + "step": 1239 + }, + { + "epoch": 0.2, + "learning_rate": 9.869832171708204e-06, + "logits/chosen": -0.9312797784805298, + "logits/rejected": -0.8150002360343933, + "logps/chosen": -100.25390625, + "logps/rejected": -53.54509353637695, + "loss": 0.0181, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.5712738037109375, + "rewards/margins": 3.966601848602295, + "rewards/rejected": 2.6046719551086426, + "step": 1240 + }, + { + "epoch": 0.2, + "learning_rate": 9.869534072271056e-06, + "logits/chosen": -0.6504230499267578, + "logits/rejected": -0.5079610347747803, + "logps/chosen": -62.35444641113281, + "logps/rejected": -66.31012725830078, + "loss": 1.0298, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9360122680664062, + "rewards/margins": -1.072187900543213, + "rewards/rejected": 3.008200168609619, + "step": 1241 + }, + { + "epoch": 0.2, + "learning_rate": 9.869235636395177e-06, + "logits/chosen": -0.8762779831886292, + "logits/rejected": -0.796047568321228, + "logps/chosen": -47.348175048828125, + "logps/rejected": -72.88645935058594, + "loss": 0.3394, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.56365966796875, + "rewards/margins": 0.05728292465209961, + "rewards/rejected": 2.5063767433166504, + "step": 1242 + }, + { + "epoch": 0.2, + "learning_rate": 9.868936864101188e-06, + "logits/chosen": -0.5892977714538574, + "logits/rejected": -0.6552653312683105, + "logps/chosen": -49.86598205566406, + "logps/rejected": -53.50642776489258, + "loss": 0.7865, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4981857538223267, + "rewards/margins": -0.9230242967605591, + "rewards/rejected": 2.4212100505828857, + "step": 1243 + }, + { + "epoch": 0.2, + "learning_rate": 9.868637755409734e-06, + "logits/chosen": -0.7547445893287659, + "logits/rejected": -0.7876331806182861, + "logps/chosen": -63.19852066040039, + "logps/rejected": -66.4605712890625, + "loss": 0.598, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0502086877822876, + "rewards/margins": -0.813296914100647, + "rewards/rejected": 1.8635056018829346, + "step": 1244 + }, + { + "epoch": 0.2, + "learning_rate": 9.868338310341478e-06, + "logits/chosen": -0.8642536997795105, + "logits/rejected": -0.8623949885368347, + "logps/chosen": -45.49565124511719, + "logps/rejected": -80.72471618652344, + "loss": 0.6854, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.281071424484253, + "rewards/margins": -0.4498138427734375, + "rewards/rejected": 2.7308852672576904, + "step": 1245 + }, + { + "epoch": 0.2, + "learning_rate": 9.86803852891711e-06, + "logits/chosen": -1.0869323015213013, + "logits/rejected": -1.053838849067688, + "logps/chosen": -19.678747177124023, + "logps/rejected": -54.04484558105469, + "loss": 1.7644, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.248626947402954, + "rewards/margins": -1.472318172454834, + "rewards/rejected": 2.720945119857788, + "step": 1246 + }, + { + "epoch": 0.2, + "learning_rate": 9.86773841115734e-06, + "logits/chosen": -0.7387649416923523, + "logits/rejected": -0.6661521196365356, + "logps/chosen": -67.73844909667969, + "logps/rejected": -48.85931396484375, + "loss": 0.5104, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0909981727600098, + "rewards/margins": 0.40187227725982666, + "rewards/rejected": 1.689125895500183, + "step": 1247 + }, + { + "epoch": 0.2, + "learning_rate": 9.867437957082906e-06, + "logits/chosen": -0.3506682813167572, + "logits/rejected": -0.38140836358070374, + "logps/chosen": -77.24969482421875, + "logps/rejected": -81.25889587402344, + "loss": 0.3827, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6476211547851562, + "rewards/margins": 0.5741058588027954, + "rewards/rejected": 0.07351531833410263, + "step": 1248 + }, + { + "epoch": 0.2, + "learning_rate": 9.867137166714565e-06, + "logits/chosen": -0.693148136138916, + "logits/rejected": -0.7190566062927246, + "logps/chosen": -68.76797485351562, + "logps/rejected": -87.46711730957031, + "loss": 0.9435, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5113807916641235, + "rewards/margins": -1.0765632390975952, + "rewards/rejected": 2.5879440307617188, + "step": 1249 + }, + { + "epoch": 0.2, + "learning_rate": 9.866836040073099e-06, + "logits/chosen": -0.8930876851081848, + "logits/rejected": -0.9002509713172913, + "logps/chosen": -94.42363739013672, + "logps/rejected": -111.16145324707031, + "loss": 0.6619, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3967018127441406, + "rewards/margins": -0.9474731683731079, + "rewards/rejected": 1.3441749811172485, + "step": 1250 + }, + { + "epoch": 0.2, + "learning_rate": 9.866534577179312e-06, + "logits/chosen": -0.7326200008392334, + "logits/rejected": -0.9630520939826965, + "logps/chosen": -44.9591064453125, + "logps/rejected": -127.13845825195312, + "loss": 2.2988, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5196259021759033, + "rewards/margins": -4.434086799621582, + "rewards/rejected": 5.953712463378906, + "step": 1251 + }, + { + "epoch": 0.2, + "learning_rate": 9.866232778054035e-06, + "logits/chosen": -0.9138352870941162, + "logits/rejected": -1.0090585947036743, + "logps/chosen": -106.20962524414062, + "logps/rejected": -120.98319244384766, + "loss": 1.7026, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.091860294342041, + "rewards/margins": -3.304133415222168, + "rewards/rejected": 5.395993709564209, + "step": 1252 + }, + { + "epoch": 0.2, + "learning_rate": 9.865930642718116e-06, + "logits/chosen": -0.963988721370697, + "logits/rejected": -0.8835647106170654, + "logps/chosen": -55.08263397216797, + "logps/rejected": -35.916927337646484, + "loss": 0.3452, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5653679370880127, + "rewards/margins": 0.2169194221496582, + "rewards/rejected": 2.3484485149383545, + "step": 1253 + }, + { + "epoch": 0.2, + "learning_rate": 9.865628171192432e-06, + "logits/chosen": -0.6782201528549194, + "logits/rejected": -0.5885261297225952, + "logps/chosen": -70.38636016845703, + "logps/rejected": -89.18748474121094, + "loss": 0.8405, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9068214893341064, + "rewards/margins": 0.490537166595459, + "rewards/rejected": 2.4162843227386475, + "step": 1254 + }, + { + "epoch": 0.2, + "learning_rate": 9.865325363497883e-06, + "logits/chosen": -0.9154080748558044, + "logits/rejected": -0.8604925274848938, + "logps/chosen": -77.63510131835938, + "logps/rejected": -67.99930572509766, + "loss": 1.0798, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1634628772735596, + "rewards/margins": -0.9475235939025879, + "rewards/rejected": 2.1109864711761475, + "step": 1255 + }, + { + "epoch": 0.2, + "learning_rate": 9.865022219655384e-06, + "logits/chosen": -1.1339318752288818, + "logits/rejected": -1.0898176431655884, + "logps/chosen": -57.072017669677734, + "logps/rejected": -56.76060485839844, + "loss": 0.9015, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2558635473251343, + "rewards/margins": -0.9563595056533813, + "rewards/rejected": 2.2122230529785156, + "step": 1256 + }, + { + "epoch": 0.2, + "learning_rate": 9.864718739685883e-06, + "logits/chosen": -0.2661573588848114, + "logits/rejected": -0.33406445384025574, + "logps/chosen": -103.86778259277344, + "logps/rejected": -65.28349304199219, + "loss": 0.3926, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1172981262207031, + "rewards/margins": -0.17322003841400146, + "rewards/rejected": 1.2905181646347046, + "step": 1257 + }, + { + "epoch": 0.2, + "learning_rate": 9.86441492361035e-06, + "logits/chosen": -0.2609289288520813, + "logits/rejected": -0.2609289288520813, + "logps/chosen": -15.563179016113281, + "logps/rejected": -15.563179016113281, + "loss": 0.5308, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6967876553535461, + "rewards/margins": 0.0, + "rewards/rejected": 0.6967876553535461, + "step": 1258 + }, + { + "epoch": 0.2, + "learning_rate": 9.864110771449771e-06, + "logits/chosen": -0.8287367820739746, + "logits/rejected": -0.6876698136329651, + "logps/chosen": -69.66841125488281, + "logps/rejected": -57.56143569946289, + "loss": 0.1397, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.530812978744507, + "rewards/margins": 1.5471707582473755, + "rewards/rejected": 1.9836422204971313, + "step": 1259 + }, + { + "epoch": 0.2, + "learning_rate": 9.863806283225163e-06, + "logits/chosen": -0.7651901841163635, + "logits/rejected": -0.732676088809967, + "logps/chosen": -41.16230010986328, + "logps/rejected": -32.66505432128906, + "loss": 0.6894, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.060983657836914, + "rewards/margins": -0.2678501605987549, + "rewards/rejected": 1.328833818435669, + "step": 1260 + }, + { + "epoch": 0.2, + "learning_rate": 9.863501458957563e-06, + "logits/chosen": -0.8479368090629578, + "logits/rejected": -0.8487094044685364, + "logps/chosen": -131.18222045898438, + "logps/rejected": -100.20098876953125, + "loss": 0.2225, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.812198162078857, + "rewards/margins": 0.70050048828125, + "rewards/rejected": 5.111697673797607, + "step": 1261 + }, + { + "epoch": 0.2, + "learning_rate": 9.863196298668032e-06, + "logits/chosen": -0.788749098777771, + "logits/rejected": -0.7050409913063049, + "logps/chosen": -55.82606506347656, + "logps/rejected": -87.73846435546875, + "loss": 1.2451, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6812546253204346, + "rewards/margins": 0.32403564453125, + "rewards/rejected": 1.3572189807891846, + "step": 1262 + }, + { + "epoch": 0.2, + "learning_rate": 9.86289080237765e-06, + "logits/chosen": -0.6806945204734802, + "logits/rejected": -0.6336385011672974, + "logps/chosen": -50.853668212890625, + "logps/rejected": -49.8079948425293, + "loss": 0.6976, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9317924380302429, + "rewards/margins": 0.16692578792572021, + "rewards/rejected": 0.7648666501045227, + "step": 1263 + }, + { + "epoch": 0.21, + "learning_rate": 9.86258497010753e-06, + "logits/chosen": -0.6645192503929138, + "logits/rejected": -0.6488999128341675, + "logps/chosen": -29.426048278808594, + "logps/rejected": -65.36616516113281, + "loss": 0.7985, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8192414045333862, + "rewards/margins": 0.18898165225982666, + "rewards/rejected": 1.6302597522735596, + "step": 1264 + }, + { + "epoch": 0.21, + "learning_rate": 9.862278801878797e-06, + "logits/chosen": -0.9933329820632935, + "logits/rejected": -0.9178155064582825, + "logps/chosen": -285.15301513671875, + "logps/rejected": -161.5146484375, + "loss": 0.7684, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.4727935791015625, + "rewards/margins": -1.283219814300537, + "rewards/rejected": 7.7560133934021, + "step": 1265 + }, + { + "epoch": 0.21, + "learning_rate": 9.861972297712606e-06, + "logits/chosen": -0.9718742370605469, + "logits/rejected": -0.7816808223724365, + "logps/chosen": -90.71058654785156, + "logps/rejected": -128.23199462890625, + "loss": 0.9172, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.809764385223389, + "rewards/margins": -0.2327284812927246, + "rewards/rejected": 5.042492866516113, + "step": 1266 + }, + { + "epoch": 0.21, + "learning_rate": 9.861665457630134e-06, + "logits/chosen": -0.514948844909668, + "logits/rejected": -0.5163848996162415, + "logps/chosen": -47.85161209106445, + "logps/rejected": -231.6491241455078, + "loss": 1.615, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3170833587646484, + "rewards/margins": -3.1530447006225586, + "rewards/rejected": 5.470128059387207, + "step": 1267 + }, + { + "epoch": 0.21, + "learning_rate": 9.86135828165258e-06, + "logits/chosen": -1.0844115018844604, + "logits/rejected": -1.0019404888153076, + "logps/chosen": -68.11181640625, + "logps/rejected": -50.70540237426758, + "loss": 0.1573, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.616213321685791, + "rewards/margins": 1.0302914381027222, + "rewards/rejected": 1.5859218835830688, + "step": 1268 + }, + { + "epoch": 0.21, + "learning_rate": 9.861050769801167e-06, + "logits/chosen": -1.137615442276001, + "logits/rejected": -1.191659688949585, + "logps/chosen": -239.44125366210938, + "logps/rejected": -145.48741149902344, + "loss": 0.2895, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.395788669586182, + "rewards/margins": 0.3473219871520996, + "rewards/rejected": 6.048466682434082, + "step": 1269 + }, + { + "epoch": 0.21, + "learning_rate": 9.860742922097141e-06, + "logits/chosen": -0.6517953276634216, + "logits/rejected": -0.6464185118675232, + "logps/chosen": -54.73438262939453, + "logps/rejected": -57.01117706298828, + "loss": 0.2766, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3857247829437256, + "rewards/margins": 0.36084914207458496, + "rewards/rejected": 2.0248756408691406, + "step": 1270 + }, + { + "epoch": 0.21, + "learning_rate": 9.860434738561775e-06, + "logits/chosen": -0.5933612585067749, + "logits/rejected": -0.7018394470214844, + "logps/chosen": -104.14116668701172, + "logps/rejected": -120.6607666015625, + "loss": 0.8005, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.7784690856933594, + "rewards/margins": -0.6864924430847168, + "rewards/rejected": 4.464961528778076, + "step": 1271 + }, + { + "epoch": 0.21, + "learning_rate": 9.860126219216355e-06, + "logits/chosen": -0.865853488445282, + "logits/rejected": -0.865853488445282, + "logps/chosen": -26.575061798095703, + "logps/rejected": -26.575061798095703, + "loss": 0.4658, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0153255462646484, + "rewards/margins": 0.0, + "rewards/rejected": 2.0153255462646484, + "step": 1272 + }, + { + "epoch": 0.21, + "learning_rate": 9.859817364082203e-06, + "logits/chosen": -0.8180833458900452, + "logits/rejected": -0.7780302166938782, + "logps/chosen": -42.87467575073242, + "logps/rejected": -32.65634536743164, + "loss": 1.3654, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.876452624797821, + "rewards/margins": -0.01598435640335083, + "rewards/rejected": 0.8924369812011719, + "step": 1273 + }, + { + "epoch": 0.21, + "learning_rate": 9.859508173180653e-06, + "logits/chosen": -1.0174639225006104, + "logits/rejected": -0.9132459759712219, + "logps/chosen": -49.92173767089844, + "logps/rejected": -59.04258346557617, + "loss": 1.8393, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7132866382598877, + "rewards/margins": -1.0084888935089111, + "rewards/rejected": 3.721775531768799, + "step": 1274 + }, + { + "epoch": 0.21, + "learning_rate": 9.85919864653307e-06, + "logits/chosen": -0.9074223041534424, + "logits/rejected": -1.027403473854065, + "logps/chosen": -56.28791809082031, + "logps/rejected": -86.33209228515625, + "loss": 0.9187, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9720504879951477, + "rewards/margins": -0.9863250851631165, + "rewards/rejected": 1.9583755731582642, + "step": 1275 + }, + { + "epoch": 0.21, + "learning_rate": 9.858888784160838e-06, + "logits/chosen": -0.7950548529624939, + "logits/rejected": -0.6092464327812195, + "logps/chosen": -58.59431457519531, + "logps/rejected": -14.454880714416504, + "loss": 0.139, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7235901355743408, + "rewards/margins": 1.1495270729064941, + "rewards/rejected": 0.5740630030632019, + "step": 1276 + }, + { + "epoch": 0.21, + "learning_rate": 9.858578586085368e-06, + "logits/chosen": -0.7560673356056213, + "logits/rejected": -0.6812907457351685, + "logps/chosen": -78.43539428710938, + "logps/rejected": -51.829010009765625, + "loss": 1.3807, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4323914051055908, + "rewards/margins": -1.1039588451385498, + "rewards/rejected": 2.5363502502441406, + "step": 1277 + }, + { + "epoch": 0.21, + "learning_rate": 9.858268052328089e-06, + "logits/chosen": -1.0640734434127808, + "logits/rejected": -1.0640734434127808, + "logps/chosen": -42.310035705566406, + "logps/rejected": -42.310035705566406, + "loss": 1.0542, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.931295394897461, + "rewards/margins": 0.0, + "rewards/rejected": 1.931295394897461, + "step": 1278 + }, + { + "epoch": 0.21, + "learning_rate": 9.857957182910456e-06, + "logits/chosen": -1.088898777961731, + "logits/rejected": -1.2287768125534058, + "logps/chosen": -142.54563903808594, + "logps/rejected": -134.5765838623047, + "loss": 1.4582, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.847631931304932, + "rewards/margins": -2.8194046020507812, + "rewards/rejected": 7.667036533355713, + "step": 1279 + }, + { + "epoch": 0.21, + "learning_rate": 9.857645977853949e-06, + "logits/chosen": -1.134621262550354, + "logits/rejected": -1.0895487070083618, + "logps/chosen": -206.1488037109375, + "logps/rejected": -16.30672836303711, + "loss": 0.1878, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.51774001121521, + "rewards/margins": 2.335785150527954, + "rewards/rejected": 0.1819547712802887, + "step": 1280 + }, + { + "epoch": 0.21, + "learning_rate": 9.857334437180068e-06, + "logits/chosen": -1.1019140481948853, + "logits/rejected": -0.9710386991500854, + "logps/chosen": -98.51333618164062, + "logps/rejected": -73.6777572631836, + "loss": 1.0284, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.922543525695801, + "rewards/margins": 2.8834848403930664, + "rewards/rejected": 3.0390586853027344, + "step": 1281 + }, + { + "epoch": 0.21, + "learning_rate": 9.857022560910338e-06, + "logits/chosen": -1.2615729570388794, + "logits/rejected": -1.3196120262145996, + "logps/chosen": -31.271934509277344, + "logps/rejected": -138.1100616455078, + "loss": 2.5672, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9155181646347046, + "rewards/margins": -4.015660285949707, + "rewards/rejected": 5.931178569793701, + "step": 1282 + }, + { + "epoch": 0.21, + "learning_rate": 9.856710349066307e-06, + "logits/chosen": -1.0438389778137207, + "logits/rejected": -1.055909276008606, + "logps/chosen": -58.51220703125, + "logps/rejected": -68.0727310180664, + "loss": 0.2986, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3188140392303467, + "rewards/margins": 0.5907019376754761, + "rewards/rejected": 1.7281121015548706, + "step": 1283 + }, + { + "epoch": 0.21, + "learning_rate": 9.856397801669547e-06, + "logits/chosen": -0.7820770144462585, + "logits/rejected": -0.6045753359794617, + "logps/chosen": -97.77255249023438, + "logps/rejected": -28.20615577697754, + "loss": 0.1601, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1412757635116577, + "rewards/margins": 1.2046449184417725, + "rewards/rejected": -0.06336917728185654, + "step": 1284 + }, + { + "epoch": 0.21, + "learning_rate": 9.856084918741649e-06, + "logits/chosen": -1.1800017356872559, + "logits/rejected": -1.1545727252960205, + "logps/chosen": -100.05410766601562, + "logps/rejected": -59.938350677490234, + "loss": 0.8697, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8254287242889404, + "rewards/margins": -0.8324971199035645, + "rewards/rejected": 3.657925844192505, + "step": 1285 + }, + { + "epoch": 0.21, + "learning_rate": 9.85577170030423e-06, + "logits/chosen": -0.9335442185401917, + "logits/rejected": -0.8817749619483948, + "logps/chosen": -111.44158172607422, + "logps/rejected": -87.6611328125, + "loss": 0.1311, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.458565711975098, + "rewards/margins": 1.6553070545196533, + "rewards/rejected": 2.8032586574554443, + "step": 1286 + }, + { + "epoch": 0.21, + "learning_rate": 9.855458146378936e-06, + "logits/chosen": -0.7486635446548462, + "logits/rejected": -0.7047131061553955, + "logps/chosen": -86.16867065429688, + "logps/rejected": -44.449684143066406, + "loss": 1.6063, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8131401538848877, + "rewards/margins": -1.2197952270507812, + "rewards/rejected": 3.032935380935669, + "step": 1287 + }, + { + "epoch": 0.21, + "learning_rate": 9.855144256987423e-06, + "logits/chosen": -1.1947078704833984, + "logits/rejected": -1.1046350002288818, + "logps/chosen": -100.02357482910156, + "logps/rejected": -25.18014907836914, + "loss": 0.461, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6921746730804443, + "rewards/margins": 2.3921971321105957, + "rewards/rejected": 0.29997751116752625, + "step": 1288 + }, + { + "epoch": 0.21, + "learning_rate": 9.854830032151385e-06, + "logits/chosen": -1.4186195135116577, + "logits/rejected": -1.412885069847107, + "logps/chosen": -79.82098388671875, + "logps/rejected": -81.95488739013672, + "loss": 1.4522, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6579025983810425, + "rewards/margins": -2.3547897338867188, + "rewards/rejected": 4.012692451477051, + "step": 1289 + }, + { + "epoch": 0.21, + "learning_rate": 9.854515471892527e-06, + "logits/chosen": -0.4380251169204712, + "logits/rejected": -0.4045581817626953, + "logps/chosen": -58.08330535888672, + "logps/rejected": -20.512624740600586, + "loss": 0.6898, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47484779357910156, + "rewards/margins": 0.29211461544036865, + "rewards/rejected": 0.18273316323757172, + "step": 1290 + }, + { + "epoch": 0.21, + "learning_rate": 9.854200576232585e-06, + "logits/chosen": -0.6090889573097229, + "logits/rejected": -0.6090889573097229, + "logps/chosen": -0.5066351890563965, + "logps/rejected": -0.5066351890563965, + "loss": 1.7448, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.22375349700450897, + "rewards/margins": 0.0, + "rewards/rejected": 0.22375349700450897, + "step": 1291 + }, + { + "epoch": 0.21, + "learning_rate": 9.853885345193312e-06, + "logits/chosen": -1.2837555408477783, + "logits/rejected": -1.193549633026123, + "logps/chosen": -66.50120544433594, + "logps/rejected": -36.10539627075195, + "loss": 0.9057, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.938037097454071, + "rewards/margins": 0.9376964569091797, + "rewards/rejected": 0.0003406524774618447, + "step": 1292 + }, + { + "epoch": 0.21, + "learning_rate": 9.85356977879649e-06, + "logits/chosen": -0.4194401502609253, + "logits/rejected": -0.4194401502609253, + "logps/chosen": -38.9839973449707, + "logps/rejected": -38.9839973449707, + "loss": 0.4536, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2782185077667236, + "rewards/margins": 0.0, + "rewards/rejected": 1.2782185077667236, + "step": 1293 + }, + { + "epoch": 0.21, + "learning_rate": 9.853253877063922e-06, + "logits/chosen": -0.3862718343734741, + "logits/rejected": -0.37925148010253906, + "logps/chosen": -1.8219935894012451, + "logps/rejected": -4.391391277313232, + "loss": 0.7073, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34415850043296814, + "rewards/margins": 0.2015450894832611, + "rewards/rejected": 0.14261341094970703, + "step": 1294 + }, + { + "epoch": 0.21, + "learning_rate": 9.852937640017432e-06, + "logits/chosen": -0.7042527794837952, + "logits/rejected": -0.6304170489311218, + "logps/chosen": -51.048851013183594, + "logps/rejected": -66.42042541503906, + "loss": 0.6097, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4807121753692627, + "rewards/margins": -0.35308146476745605, + "rewards/rejected": 1.8337936401367188, + "step": 1295 + }, + { + "epoch": 0.21, + "learning_rate": 9.852621067678871e-06, + "logits/chosen": -0.5627936124801636, + "logits/rejected": -0.48285984992980957, + "logps/chosen": -23.929473876953125, + "logps/rejected": -4.397508144378662, + "loss": 0.4287, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8545089960098267, + "rewards/margins": 0.41667440533638, + "rewards/rejected": 0.43783459067344666, + "step": 1296 + }, + { + "epoch": 0.21, + "learning_rate": 9.85230416007011e-06, + "logits/chosen": -0.6829634308815002, + "logits/rejected": -0.7426563501358032, + "logps/chosen": -73.65727233886719, + "logps/rejected": -90.37315368652344, + "loss": 0.993, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4287315607070923, + "rewards/margins": -1.5316017866134644, + "rewards/rejected": 2.9603333473205566, + "step": 1297 + }, + { + "epoch": 0.21, + "learning_rate": 9.851986917213044e-06, + "logits/chosen": -1.2463171482086182, + "logits/rejected": -1.1217212677001953, + "logps/chosen": -65.75199890136719, + "logps/rejected": -30.06633758544922, + "loss": 0.0814, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.629908800125122, + "rewards/margins": 2.09785532951355, + "rewards/rejected": 0.5320534110069275, + "step": 1298 + }, + { + "epoch": 0.21, + "learning_rate": 9.851669339129593e-06, + "logits/chosen": -0.7776363492012024, + "logits/rejected": -0.6821624040603638, + "logps/chosen": -147.93255615234375, + "logps/rejected": -42.89140701293945, + "loss": 0.8165, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.952359199523926, + "rewards/margins": 1.4783918857574463, + "rewards/rejected": 3.4739673137664795, + "step": 1299 + }, + { + "epoch": 0.21, + "learning_rate": 9.851351425841697e-06, + "logits/chosen": -0.7310723662376404, + "logits/rejected": -0.7029752135276794, + "logps/chosen": -106.24556732177734, + "logps/rejected": -66.24701690673828, + "loss": 0.9505, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8550026416778564, + "rewards/margins": -1.1186728477478027, + "rewards/rejected": 2.973675489425659, + "step": 1300 + }, + { + "epoch": 0.21, + "learning_rate": 9.851033177371321e-06, + "logits/chosen": -1.0913372039794922, + "logits/rejected": -0.9285071492195129, + "logps/chosen": -110.34700012207031, + "logps/rejected": -80.21781921386719, + "loss": 0.5918, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.640153408050537, + "rewards/margins": 1.1072378158569336, + "rewards/rejected": 3.5329155921936035, + "step": 1301 + }, + { + "epoch": 0.21, + "learning_rate": 9.850714593740454e-06, + "logits/chosen": -1.0861670970916748, + "logits/rejected": -1.0641371011734009, + "logps/chosen": -178.77398681640625, + "logps/rejected": -86.7990493774414, + "loss": 0.2629, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2867372035980225, + "rewards/margins": 0.4408745765686035, + "rewards/rejected": 1.845862627029419, + "step": 1302 + }, + { + "epoch": 0.21, + "learning_rate": 9.850395674971105e-06, + "logits/chosen": -1.3465100526809692, + "logits/rejected": -1.2706189155578613, + "logps/chosen": -118.44525146484375, + "logps/rejected": -37.772151947021484, + "loss": 0.388, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.397283911705017, + "rewards/margins": 1.1252639293670654, + "rewards/rejected": 0.2720199525356293, + "step": 1303 + }, + { + "epoch": 0.21, + "learning_rate": 9.850076421085311e-06, + "logits/chosen": -1.0594109296798706, + "logits/rejected": -0.896744430065155, + "logps/chosen": -207.97451782226562, + "logps/rejected": -60.87891387939453, + "loss": 2.1785, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.221993923187256, + "rewards/margins": 2.2249808311462402, + "rewards/rejected": 2.9970130920410156, + "step": 1304 + }, + { + "epoch": 0.21, + "learning_rate": 9.849756832105128e-06, + "logits/chosen": -0.6831352710723877, + "logits/rejected": -0.6068187952041626, + "logps/chosen": -50.41334533691406, + "logps/rejected": -55.66413879394531, + "loss": 0.836, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.057978868484497, + "rewards/margins": -0.4796264171600342, + "rewards/rejected": 2.5376052856445312, + "step": 1305 + }, + { + "epoch": 0.21, + "learning_rate": 9.849436908052636e-06, + "logits/chosen": -0.9349716305732727, + "logits/rejected": -0.8512851595878601, + "logps/chosen": -137.47898864746094, + "logps/rejected": -52.06302261352539, + "loss": 0.213, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3486359119415283, + "rewards/margins": 0.6630131006240845, + "rewards/rejected": 0.6856228113174438, + "step": 1306 + }, + { + "epoch": 0.21, + "learning_rate": 9.849116648949941e-06, + "logits/chosen": -0.8716479539871216, + "logits/rejected": -0.8660334944725037, + "logps/chosen": -53.28487014770508, + "logps/rejected": -51.82577896118164, + "loss": 0.1434, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.74377703666687, + "rewards/margins": 1.5743075609207153, + "rewards/rejected": 1.1694694757461548, + "step": 1307 + }, + { + "epoch": 0.21, + "learning_rate": 9.848796054819168e-06, + "logits/chosen": -1.2936981916427612, + "logits/rejected": -1.1848812103271484, + "logps/chosen": -95.25877380371094, + "logps/rejected": -96.69863891601562, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.057814121246338, + "rewards/margins": 4.1160688400268555, + "rewards/rejected": 2.9417450428009033, + "step": 1308 + }, + { + "epoch": 0.21, + "learning_rate": 9.848475125682466e-06, + "logits/chosen": -0.6125052571296692, + "logits/rejected": -0.6125052571296692, + "logps/chosen": -60.320167541503906, + "logps/rejected": -60.320167541503906, + "loss": 0.5259, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.760736882686615, + "rewards/margins": 0.0, + "rewards/rejected": 0.760736882686615, + "step": 1309 + }, + { + "epoch": 0.21, + "learning_rate": 9.848153861562012e-06, + "logits/chosen": -0.8412784337997437, + "logits/rejected": -0.8603564500808716, + "logps/chosen": -52.67755126953125, + "logps/rejected": -81.42962646484375, + "loss": 0.4261, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5769637823104858, + "rewards/margins": 0.00590360164642334, + "rewards/rejected": 1.5710601806640625, + "step": 1310 + }, + { + "epoch": 0.21, + "learning_rate": 9.847832262479998e-06, + "logits/chosen": -0.9010181427001953, + "logits/rejected": -0.8861680626869202, + "logps/chosen": -89.84626770019531, + "logps/rejected": -62.820465087890625, + "loss": 2.0668, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5496948957443237, + "rewards/margins": -0.314208984375, + "rewards/rejected": 1.8639038801193237, + "step": 1311 + }, + { + "epoch": 0.21, + "learning_rate": 9.847510328458644e-06, + "logits/chosen": -0.869997501373291, + "logits/rejected": -0.8253609538078308, + "logps/chosen": -20.63660430908203, + "logps/rejected": -3.1961722373962402, + "loss": 0.2659, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9906185269355774, + "rewards/margins": 0.37626588344573975, + "rewards/rejected": 0.6143526434898376, + "step": 1312 + }, + { + "epoch": 0.21, + "learning_rate": 9.847188059520195e-06, + "logits/chosen": -1.2318271398544312, + "logits/rejected": -0.9455269575119019, + "logps/chosen": -97.67901611328125, + "logps/rejected": -64.6502914428711, + "loss": 0.1349, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6359314918518066, + "rewards/margins": 1.8259178400039673, + "rewards/rejected": 1.8100136518478394, + "step": 1313 + }, + { + "epoch": 0.21, + "learning_rate": 9.846865455686915e-06, + "logits/chosen": -0.8829185962677002, + "logits/rejected": -0.8497297167778015, + "logps/chosen": -119.1408462524414, + "logps/rejected": -77.35494995117188, + "loss": 1.8412, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9958717823028564, + "rewards/margins": 0.98051917552948, + "rewards/rejected": 1.0153526067733765, + "step": 1314 + }, + { + "epoch": 0.21, + "learning_rate": 9.846542516981094e-06, + "logits/chosen": -0.6218826770782471, + "logits/rejected": -0.5770968198776245, + "logps/chosen": -38.344879150390625, + "logps/rejected": -64.16206359863281, + "loss": 0.3806, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.772108554840088, + "rewards/margins": 1.3749359846115112, + "rewards/rejected": 1.3971725702285767, + "step": 1315 + }, + { + "epoch": 0.21, + "learning_rate": 9.846219243425046e-06, + "logits/chosen": -1.3077725172042847, + "logits/rejected": -1.2902692556381226, + "logps/chosen": -58.04450225830078, + "logps/rejected": -81.17015075683594, + "loss": 0.605, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.320638418197632, + "rewards/margins": 0.6395630836486816, + "rewards/rejected": 1.6810753345489502, + "step": 1316 + }, + { + "epoch": 0.21, + "learning_rate": 9.8458956350411e-06, + "logits/chosen": -0.8970374464988708, + "logits/rejected": -0.8970374464988708, + "logps/chosen": -56.114463806152344, + "logps/rejected": -56.114463806152344, + "loss": 0.3605, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0624940395355225, + "rewards/margins": 0.0, + "rewards/rejected": 2.0624940395355225, + "step": 1317 + }, + { + "epoch": 0.21, + "learning_rate": 9.84557169185162e-06, + "logits/chosen": -0.831712543964386, + "logits/rejected": -0.831712543964386, + "logps/chosen": -71.96426391601562, + "logps/rejected": -71.96426391601562, + "loss": 0.3934, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.142281532287598, + "rewards/margins": 0.0, + "rewards/rejected": 4.142281532287598, + "step": 1318 + }, + { + "epoch": 0.21, + "learning_rate": 9.845247413878984e-06, + "logits/chosen": -0.7367086410522461, + "logits/rejected": -0.7497817277908325, + "logps/chosen": -85.8983154296875, + "logps/rejected": -70.25230407714844, + "loss": 0.8077, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6208534240722656, + "rewards/margins": -1.3474647998809814, + "rewards/rejected": 1.968318223953247, + "step": 1319 + }, + { + "epoch": 0.21, + "learning_rate": 9.8449228011456e-06, + "logits/chosen": -0.6817441582679749, + "logits/rejected": -0.7441951632499695, + "logps/chosen": -32.12602996826172, + "logps/rejected": -45.835853576660156, + "loss": 0.5399, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5552040338516235, + "rewards/margins": -0.5824722051620483, + "rewards/rejected": 1.1376762390136719, + "step": 1320 + }, + { + "epoch": 0.21, + "learning_rate": 9.844597853673891e-06, + "logits/chosen": -1.1158891916275024, + "logits/rejected": -0.967304527759552, + "logps/chosen": -98.67198181152344, + "logps/rejected": -54.90468978881836, + "loss": 0.1392, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.728880405426025, + "rewards/margins": 2.9710018634796143, + "rewards/rejected": 2.757878541946411, + "step": 1321 + }, + { + "epoch": 0.21, + "learning_rate": 9.844272571486313e-06, + "logits/chosen": -0.8761589527130127, + "logits/rejected": -0.7138929963111877, + "logps/chosen": -113.57635498046875, + "logps/rejected": -70.58301544189453, + "loss": 0.8311, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1931138038635254, + "rewards/margins": -1.3575553894042969, + "rewards/rejected": 3.5506691932678223, + "step": 1322 + }, + { + "epoch": 0.21, + "learning_rate": 9.843946954605335e-06, + "logits/chosen": -0.8702937960624695, + "logits/rejected": -0.8702937960624695, + "logps/chosen": -48.85873794555664, + "logps/rejected": -48.85873794555664, + "loss": 0.4228, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2391934394836426, + "rewards/margins": 0.0, + "rewards/rejected": 3.2391934394836426, + "step": 1323 + }, + { + "epoch": 0.21, + "learning_rate": 9.843621003053456e-06, + "logits/chosen": -0.8211988806724548, + "logits/rejected": -0.8300033807754517, + "logps/chosen": -63.00569534301758, + "logps/rejected": -99.65137481689453, + "loss": 1.798, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5412830710411072, + "rewards/margins": -2.0521395206451416, + "rewards/rejected": 2.5934226512908936, + "step": 1324 + }, + { + "epoch": 0.22, + "learning_rate": 9.843294716853199e-06, + "logits/chosen": -0.5703685879707336, + "logits/rejected": -0.6087574362754822, + "logps/chosen": -37.379295349121094, + "logps/rejected": -111.4068603515625, + "loss": 0.2236, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9035099744796753, + "rewards/margins": 1.0488804578781128, + "rewards/rejected": 0.8546295166015625, + "step": 1325 + }, + { + "epoch": 0.22, + "learning_rate": 9.842968096027102e-06, + "logits/chosen": -1.2119239568710327, + "logits/rejected": -1.2838759422302246, + "logps/chosen": -180.95950317382812, + "logps/rejected": -114.87962341308594, + "loss": 1.2427, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1489975452423096, + "rewards/margins": -2.3949432373046875, + "rewards/rejected": 3.543940782546997, + "step": 1326 + }, + { + "epoch": 0.22, + "learning_rate": 9.842641140597735e-06, + "logits/chosen": -0.9304264187812805, + "logits/rejected": -0.9332455396652222, + "logps/chosen": -58.36328125, + "logps/rejected": -73.20982360839844, + "loss": 0.7416, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.613359808921814, + "rewards/margins": -1.2253915071487427, + "rewards/rejected": 2.8387513160705566, + "step": 1327 + }, + { + "epoch": 0.22, + "learning_rate": 9.842313850587688e-06, + "logits/chosen": -0.6486389636993408, + "logits/rejected": -0.33998650312423706, + "logps/chosen": -100.63688659667969, + "logps/rejected": -18.806739807128906, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.365922451019287, + "rewards/margins": 6.40961217880249, + "rewards/rejected": 0.9563100934028625, + "step": 1328 + }, + { + "epoch": 0.22, + "learning_rate": 9.841986226019571e-06, + "logits/chosen": -1.1867270469665527, + "logits/rejected": -1.1175930500030518, + "logps/chosen": -226.88316345214844, + "logps/rejected": -401.7531433105469, + "loss": 1.5728, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.168086528778076, + "rewards/margins": -1.849513053894043, + "rewards/rejected": 7.017599582672119, + "step": 1329 + }, + { + "epoch": 0.22, + "learning_rate": 9.84165826691602e-06, + "logits/chosen": -0.9295453429222107, + "logits/rejected": -0.911289393901825, + "logps/chosen": -67.58427429199219, + "logps/rejected": -58.876060485839844, + "loss": 0.4032, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.206045627593994, + "rewards/margins": 0.573959469795227, + "rewards/rejected": 1.632086157798767, + "step": 1330 + }, + { + "epoch": 0.22, + "learning_rate": 9.841329973299698e-06, + "logits/chosen": -0.7535712718963623, + "logits/rejected": -0.7812497019767761, + "logps/chosen": -68.7367935180664, + "logps/rejected": -170.38632202148438, + "loss": 0.471, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3346443176269531, + "rewards/margins": -0.11007153987884521, + "rewards/rejected": 1.4447158575057983, + "step": 1331 + }, + { + "epoch": 0.22, + "learning_rate": 9.841001345193282e-06, + "logits/chosen": -1.218531608581543, + "logits/rejected": -1.216726303100586, + "logps/chosen": -64.69747924804688, + "logps/rejected": -212.28199768066406, + "loss": 2.6782, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.513903021812439, + "rewards/margins": -4.8152337074279785, + "rewards/rejected": 6.329136848449707, + "step": 1332 + }, + { + "epoch": 0.22, + "learning_rate": 9.840672382619479e-06, + "logits/chosen": -0.5799537301063538, + "logits/rejected": -0.6621447801589966, + "logps/chosen": -83.65623474121094, + "logps/rejected": -95.66996002197266, + "loss": 0.6513, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9294052124023438, + "rewards/margins": -0.9835609197616577, + "rewards/rejected": 1.9129661321640015, + "step": 1333 + }, + { + "epoch": 0.22, + "learning_rate": 9.840343085601018e-06, + "logits/chosen": -0.9926849007606506, + "logits/rejected": -0.9359032511711121, + "logps/chosen": -121.3736343383789, + "logps/rejected": -104.45939636230469, + "loss": 0.5349, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2401421070098877, + "rewards/margins": -0.10635292530059814, + "rewards/rejected": 1.3464950323104858, + "step": 1334 + }, + { + "epoch": 0.22, + "learning_rate": 9.840013454160648e-06, + "logits/chosen": -0.6710260510444641, + "logits/rejected": -0.6314600110054016, + "logps/chosen": -47.509883880615234, + "logps/rejected": -65.14842987060547, + "loss": 1.1017, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9327926635742188, + "rewards/margins": -0.6307967901229858, + "rewards/rejected": 1.5635894536972046, + "step": 1335 + }, + { + "epoch": 0.22, + "learning_rate": 9.839683488321145e-06, + "logits/chosen": -1.073981523513794, + "logits/rejected": -1.0595759153366089, + "logps/chosen": -74.15015411376953, + "logps/rejected": -91.6432113647461, + "loss": 1.3683, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6389487981796265, + "rewards/margins": -0.45695507526397705, + "rewards/rejected": 2.0959038734436035, + "step": 1336 + }, + { + "epoch": 0.22, + "learning_rate": 9.83935318810531e-06, + "logits/chosen": -1.0056835412979126, + "logits/rejected": -1.0056835412979126, + "logps/chosen": -43.38591003417969, + "logps/rejected": -43.38591003417969, + "loss": 2.4499, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.051265001296997, + "rewards/margins": 0.0, + "rewards/rejected": 3.051265001296997, + "step": 1337 + }, + { + "epoch": 0.22, + "learning_rate": 9.839022553535957e-06, + "logits/chosen": -0.6980105042457581, + "logits/rejected": -0.657002866268158, + "logps/chosen": -34.850215911865234, + "logps/rejected": -53.708213806152344, + "loss": 0.4346, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5215381383895874, + "rewards/margins": -0.15528380870819092, + "rewards/rejected": 1.6768219470977783, + "step": 1338 + }, + { + "epoch": 0.22, + "learning_rate": 9.838691584635933e-06, + "logits/chosen": -0.8603073358535767, + "logits/rejected": -1.0219262838363647, + "logps/chosen": -107.4027099609375, + "logps/rejected": -140.54254150390625, + "loss": 1.1332, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.653802394866943, + "rewards/margins": -1.9432587623596191, + "rewards/rejected": 6.5970611572265625, + "step": 1339 + }, + { + "epoch": 0.22, + "learning_rate": 9.838360281428106e-06, + "logits/chosen": -0.750721275806427, + "logits/rejected": -0.6651626825332642, + "logps/chosen": -103.23591613769531, + "logps/rejected": -72.23336791992188, + "loss": 0.0845, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.233311653137207, + "rewards/margins": 1.8206942081451416, + "rewards/rejected": 3.4126174449920654, + "step": 1340 + }, + { + "epoch": 0.22, + "learning_rate": 9.838028643935363e-06, + "logits/chosen": -1.161590814590454, + "logits/rejected": -1.0807162523269653, + "logps/chosen": -123.95628356933594, + "logps/rejected": -88.42259216308594, + "loss": 0.4859, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4641631841659546, + "rewards/margins": -0.3021087646484375, + "rewards/rejected": 1.766271948814392, + "step": 1341 + }, + { + "epoch": 0.22, + "learning_rate": 9.837696672180618e-06, + "logits/chosen": -1.0466312170028687, + "logits/rejected": -0.9377000331878662, + "logps/chosen": -208.4878692626953, + "logps/rejected": -63.833675384521484, + "loss": 0.3302, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.607661724090576, + "rewards/margins": 4.38941764831543, + "rewards/rejected": 2.2182438373565674, + "step": 1342 + }, + { + "epoch": 0.22, + "learning_rate": 9.837364366186809e-06, + "logits/chosen": -0.9658735990524292, + "logits/rejected": -0.9746971726417542, + "logps/chosen": -134.831787109375, + "logps/rejected": -48.815460205078125, + "loss": 0.0616, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.720155239105225, + "rewards/margins": 2.877310037612915, + "rewards/rejected": 1.8428452014923096, + "step": 1343 + }, + { + "epoch": 0.22, + "learning_rate": 9.837031725976893e-06, + "logits/chosen": -1.0265737771987915, + "logits/rejected": -0.9559309482574463, + "logps/chosen": -147.06884765625, + "logps/rejected": -238.00021362304688, + "loss": 1.2803, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.182289123535156, + "rewards/margins": -0.3267045021057129, + "rewards/rejected": 6.508993625640869, + "step": 1344 + }, + { + "epoch": 0.22, + "learning_rate": 9.836698751573855e-06, + "logits/chosen": -0.4814402163028717, + "logits/rejected": -0.4814402163028717, + "logps/chosen": -34.13371276855469, + "logps/rejected": -34.13371276855469, + "loss": 0.3507, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.613690197467804, + "rewards/margins": 0.0, + "rewards/rejected": 0.613690197467804, + "step": 1345 + }, + { + "epoch": 0.22, + "learning_rate": 9.836365443000697e-06, + "logits/chosen": -0.6574130654335022, + "logits/rejected": -0.6574130654335022, + "logps/chosen": -11.694459915161133, + "logps/rejected": -11.694459915161133, + "loss": 0.8252, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5000579953193665, + "rewards/margins": 0.0, + "rewards/rejected": 0.5000579953193665, + "step": 1346 + }, + { + "epoch": 0.22, + "learning_rate": 9.83603180028045e-06, + "logits/chosen": -0.9065991640090942, + "logits/rejected": -0.8006322979927063, + "logps/chosen": -52.06525421142578, + "logps/rejected": -54.584842681884766, + "loss": 1.0868, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9546710848808289, + "rewards/margins": -0.36523669958114624, + "rewards/rejected": 1.319907784461975, + "step": 1347 + }, + { + "epoch": 0.22, + "learning_rate": 9.835697823436163e-06, + "logits/chosen": -0.7662238478660583, + "logits/rejected": -0.7198424339294434, + "logps/chosen": -58.34709930419922, + "logps/rejected": -26.525482177734375, + "loss": 0.9803, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4056968688964844, + "rewards/margins": -1.2912652492523193, + "rewards/rejected": 2.6969621181488037, + "step": 1348 + }, + { + "epoch": 0.22, + "learning_rate": 9.835363512490913e-06, + "logits/chosen": -0.7976744771003723, + "logits/rejected": -0.7976744771003723, + "logps/chosen": -56.32489776611328, + "logps/rejected": -56.32489776611328, + "loss": 0.9641, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.125128984451294, + "rewards/margins": 0.0, + "rewards/rejected": 2.125128984451294, + "step": 1349 + }, + { + "epoch": 0.22, + "learning_rate": 9.835028867467798e-06, + "logits/chosen": -0.6681034564971924, + "logits/rejected": -0.6691451668739319, + "logps/chosen": -6.647517204284668, + "logps/rejected": -2.261275291442871, + "loss": 0.6162, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.10065574944019318, + "rewards/margins": -0.04160383343696594, + "rewards/rejected": 0.14225958287715912, + "step": 1350 + }, + { + "epoch": 0.22, + "learning_rate": 9.834693888389937e-06, + "logits/chosen": -0.8890932202339172, + "logits/rejected": -0.8889244794845581, + "logps/chosen": -136.24484252929688, + "logps/rejected": -148.65008544921875, + "loss": 0.154, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2575912475585938, + "rewards/margins": 1.03785240650177, + "rewards/rejected": 1.2197388410568237, + "step": 1351 + }, + { + "epoch": 0.22, + "learning_rate": 9.834358575280473e-06, + "logits/chosen": -0.9028419256210327, + "logits/rejected": -0.7461439371109009, + "logps/chosen": -59.5641975402832, + "logps/rejected": -30.299968719482422, + "loss": 0.2655, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.194387435913086, + "rewards/margins": 2.708348035812378, + "rewards/rejected": 0.4860393702983856, + "step": 1352 + }, + { + "epoch": 0.22, + "learning_rate": 9.834022928162577e-06, + "logits/chosen": -0.8018828630447388, + "logits/rejected": -0.6629946231842041, + "logps/chosen": -59.71697998046875, + "logps/rejected": -75.2799301147461, + "loss": 0.2221, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1547532081604004, + "rewards/margins": 0.9181190729141235, + "rewards/rejected": 1.2366341352462769, + "step": 1353 + }, + { + "epoch": 0.22, + "learning_rate": 9.833686947059436e-06, + "logits/chosen": -0.83925861120224, + "logits/rejected": -0.83925861120224, + "logps/chosen": -55.1119384765625, + "logps/rejected": -55.1119384765625, + "loss": 0.3935, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.889630079269409, + "rewards/margins": 0.0, + "rewards/rejected": 2.889630079269409, + "step": 1354 + }, + { + "epoch": 0.22, + "learning_rate": 9.833350631994262e-06, + "logits/chosen": -1.0359792709350586, + "logits/rejected": -1.0923362970352173, + "logps/chosen": -84.35488891601562, + "logps/rejected": -49.81647491455078, + "loss": 1.2982, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6961548328399658, + "rewards/margins": -1.0163428783416748, + "rewards/rejected": 2.7124977111816406, + "step": 1355 + }, + { + "epoch": 0.22, + "learning_rate": 9.833013982990293e-06, + "logits/chosen": -0.47191479802131653, + "logits/rejected": -0.47772741317749023, + "logps/chosen": -1.305493712425232, + "logps/rejected": -13.63939380645752, + "loss": 0.5126, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29328009486198425, + "rewards/margins": 0.14366070926189423, + "rewards/rejected": 0.14961938560009003, + "step": 1356 + }, + { + "epoch": 0.22, + "learning_rate": 9.83267700007079e-06, + "logits/chosen": -1.092211127281189, + "logits/rejected": -1.092211127281189, + "logps/chosen": -62.958526611328125, + "logps/rejected": -62.958526611328125, + "loss": 0.3507, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.904573917388916, + "rewards/margins": 0.0, + "rewards/rejected": 2.904573917388916, + "step": 1357 + }, + { + "epoch": 0.22, + "learning_rate": 9.832339683259033e-06, + "logits/chosen": -0.4044782221317291, + "logits/rejected": -0.4044782221317291, + "logps/chosen": -25.1275577545166, + "logps/rejected": -25.1275577545166, + "loss": 1.9951, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6363351941108704, + "rewards/margins": 0.0, + "rewards/rejected": 0.6363351941108704, + "step": 1358 + }, + { + "epoch": 0.22, + "learning_rate": 9.832002032578328e-06, + "logits/chosen": -0.8298934102058411, + "logits/rejected": -0.7937777042388916, + "logps/chosen": -120.05180358886719, + "logps/rejected": -56.90003967285156, + "loss": 0.2017, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.170546054840088, + "rewards/margins": 0.7351639270782471, + "rewards/rejected": 2.435382127761841, + "step": 1359 + }, + { + "epoch": 0.22, + "learning_rate": 9.831664048052004e-06, + "logits/chosen": -0.7948695421218872, + "logits/rejected": -0.7677175402641296, + "logps/chosen": -115.62348937988281, + "logps/rejected": -104.96711730957031, + "loss": 0.5565, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.230067491531372, + "rewards/margins": -0.11869192123413086, + "rewards/rejected": 2.348759412765503, + "step": 1360 + }, + { + "epoch": 0.22, + "learning_rate": 9.83132572970341e-06, + "logits/chosen": -1.458025336265564, + "logits/rejected": -1.4724292755126953, + "logps/chosen": -109.9891128540039, + "logps/rejected": -195.4858856201172, + "loss": 1.2695, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9508712887763977, + "rewards/margins": -2.284573554992676, + "rewards/rejected": 3.2354447841644287, + "step": 1361 + }, + { + "epoch": 0.22, + "learning_rate": 9.830987077555925e-06, + "logits/chosen": -0.913201093673706, + "logits/rejected": -0.913201093673706, + "logps/chosen": -32.51438903808594, + "logps/rejected": -32.51438903808594, + "loss": 0.8996, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1293907165527344, + "rewards/margins": 0.0, + "rewards/rejected": 2.1293907165527344, + "step": 1362 + }, + { + "epoch": 0.22, + "learning_rate": 9.830648091632942e-06, + "logits/chosen": -0.4381217062473297, + "logits/rejected": -0.4491538405418396, + "logps/chosen": -33.59280014038086, + "logps/rejected": -34.84586715698242, + "loss": 0.7335, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.4301101863384247, + "rewards/margins": -0.40408363938331604, + "rewards/rejected": -0.026026535779237747, + "step": 1363 + }, + { + "epoch": 0.22, + "learning_rate": 9.830308771957884e-06, + "logits/chosen": -0.6361492872238159, + "logits/rejected": -0.6494917273521423, + "logps/chosen": -66.17901611328125, + "logps/rejected": -67.07377624511719, + "loss": 0.8908, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.709346055984497, + "rewards/margins": -0.08728563785552979, + "rewards/rejected": 1.7966316938400269, + "step": 1364 + }, + { + "epoch": 0.22, + "learning_rate": 9.829969118554195e-06, + "logits/chosen": -0.5453648567199707, + "logits/rejected": -0.41287174820899963, + "logps/chosen": -67.27576446533203, + "logps/rejected": -63.70939636230469, + "loss": 0.344, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6931190490722656, + "rewards/margins": 1.3277359008789062, + "rewards/rejected": 1.3653831481933594, + "step": 1365 + }, + { + "epoch": 0.22, + "learning_rate": 9.829629131445342e-06, + "logits/chosen": -0.786553680896759, + "logits/rejected": -0.6838355660438538, + "logps/chosen": -77.48179626464844, + "logps/rejected": -70.75349426269531, + "loss": 0.7525, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1261345148086548, + "rewards/margins": -0.976606011390686, + "rewards/rejected": 2.102740526199341, + "step": 1366 + }, + { + "epoch": 0.22, + "learning_rate": 9.829288810654815e-06, + "logits/chosen": -0.6831444501876831, + "logits/rejected": -0.7135569453239441, + "logps/chosen": -91.97917175292969, + "logps/rejected": -79.33575439453125, + "loss": 0.6399, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4080941677093506, + "rewards/margins": -0.9409193992614746, + "rewards/rejected": 3.349013566970825, + "step": 1367 + }, + { + "epoch": 0.22, + "learning_rate": 9.828948156206124e-06, + "logits/chosen": -0.6968501806259155, + "logits/rejected": -0.7886407971382141, + "logps/chosen": -87.96358489990234, + "logps/rejected": -87.95820617675781, + "loss": 0.5699, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.2276434898376465, + "rewards/margins": 2.2216527462005615, + "rewards/rejected": 2.005990743637085, + "step": 1368 + }, + { + "epoch": 0.22, + "learning_rate": 9.82860716812281e-06, + "logits/chosen": -1.062788963317871, + "logits/rejected": -1.0868529081344604, + "logps/chosen": -62.88292694091797, + "logps/rejected": -74.58673858642578, + "loss": 1.0899, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3856216669082642, + "rewards/margins": -1.33503258228302, + "rewards/rejected": 2.720654249191284, + "step": 1369 + }, + { + "epoch": 0.22, + "learning_rate": 9.828265846428428e-06, + "logits/chosen": -0.9958295226097107, + "logits/rejected": -0.6597044467926025, + "logps/chosen": -142.32049560546875, + "logps/rejected": -111.5120849609375, + "loss": 0.0553, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.562269687652588, + "rewards/margins": 2.2533538341522217, + "rewards/rejected": 3.308915853500366, + "step": 1370 + }, + { + "epoch": 0.22, + "learning_rate": 9.827924191146561e-06, + "logits/chosen": -1.0887444019317627, + "logits/rejected": -1.008073329925537, + "logps/chosen": -53.61102294921875, + "logps/rejected": -66.77987670898438, + "loss": 0.088, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.343049049377441, + "rewards/margins": 1.915085792541504, + "rewards/rejected": 2.4279632568359375, + "step": 1371 + }, + { + "epoch": 0.22, + "learning_rate": 9.827582202300815e-06, + "logits/chosen": -0.7632893919944763, + "logits/rejected": -0.730602502822876, + "logps/chosen": -95.50846862792969, + "logps/rejected": -44.73748779296875, + "loss": 1.1, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6070587635040283, + "rewards/margins": -0.8453001976013184, + "rewards/rejected": 2.4523589611053467, + "step": 1372 + }, + { + "epoch": 0.22, + "learning_rate": 9.82723987991482e-06, + "logits/chosen": -0.7911158800125122, + "logits/rejected": -0.7202671766281128, + "logps/chosen": -102.11056518554688, + "logps/rejected": -44.676734924316406, + "loss": 0.3887, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3172920942306519, + "rewards/margins": 0.27436375617980957, + "rewards/rejected": 1.0429283380508423, + "step": 1373 + }, + { + "epoch": 0.22, + "learning_rate": 9.826897224012221e-06, + "logits/chosen": -0.7946086525917053, + "logits/rejected": -0.8409423828125, + "logps/chosen": -80.82361602783203, + "logps/rejected": -113.96356964111328, + "loss": 0.2223, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5010887384414673, + "rewards/margins": 0.794599175453186, + "rewards/rejected": 0.7064895629882812, + "step": 1374 + }, + { + "epoch": 0.22, + "learning_rate": 9.8265542346167e-06, + "logits/chosen": -1.1185351610183716, + "logits/rejected": -1.1680803298950195, + "logps/chosen": -219.01791381835938, + "logps/rejected": -192.1514892578125, + "loss": 1.6245, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.179333686828613, + "rewards/margins": -3.1340694427490234, + "rewards/rejected": 8.313403129577637, + "step": 1375 + }, + { + "epoch": 0.22, + "learning_rate": 9.82621091175195e-06, + "logits/chosen": -0.8580518960952759, + "logits/rejected": -0.9646714925765991, + "logps/chosen": -112.27476501464844, + "logps/rejected": -129.45960998535156, + "loss": 0.8663, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7600586414337158, + "rewards/margins": -1.4910781383514404, + "rewards/rejected": 3.2511367797851562, + "step": 1376 + }, + { + "epoch": 0.22, + "learning_rate": 9.82586725544169e-06, + "logits/chosen": -0.873346209526062, + "logits/rejected": -0.7778847813606262, + "logps/chosen": -182.40457153320312, + "logps/rejected": -38.29207229614258, + "loss": 0.6576, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2534987926483154, + "rewards/margins": 0.514045238494873, + "rewards/rejected": 1.7394535541534424, + "step": 1377 + }, + { + "epoch": 0.22, + "learning_rate": 9.825523265709667e-06, + "logits/chosen": -0.40987053513526917, + "logits/rejected": -0.4071345031261444, + "logps/chosen": -59.16648483276367, + "logps/rejected": -76.49638366699219, + "loss": 0.7709, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1159420013427734, + "rewards/margins": 1.1707415580749512, + "rewards/rejected": 0.9452003836631775, + "step": 1378 + }, + { + "epoch": 0.22, + "learning_rate": 9.825178942579646e-06, + "logits/chosen": -0.805410623550415, + "logits/rejected": -0.805410623550415, + "logps/chosen": -116.30300903320312, + "logps/rejected": -116.30300903320312, + "loss": 0.3482, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8408844470977783, + "rewards/margins": 0.0, + "rewards/rejected": 1.8408844470977783, + "step": 1379 + }, + { + "epoch": 0.22, + "learning_rate": 9.824834286075416e-06, + "logits/chosen": -0.864859402179718, + "logits/rejected": -0.8671619296073914, + "logps/chosen": -66.67914581298828, + "logps/rejected": -57.43156814575195, + "loss": 0.9895, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.588910698890686, + "rewards/margins": -0.6199886798858643, + "rewards/rejected": 1.2088993787765503, + "step": 1380 + }, + { + "epoch": 0.22, + "learning_rate": 9.82448929622079e-06, + "logits/chosen": -0.7295399308204651, + "logits/rejected": -0.7802255749702454, + "logps/chosen": -63.92992401123047, + "logps/rejected": -118.19349670410156, + "loss": 0.3967, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8815636038780212, + "rewards/margins": -0.07046276330947876, + "rewards/rejected": 0.9520263671875, + "step": 1381 + }, + { + "epoch": 0.22, + "learning_rate": 9.824143973039602e-06, + "logits/chosen": -0.8779580593109131, + "logits/rejected": -0.7949543595314026, + "logps/chosen": -113.6414794921875, + "logps/rejected": -51.7523078918457, + "loss": 0.4826, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8376556634902954, + "rewards/margins": -0.4567958116531372, + "rewards/rejected": 2.2944514751434326, + "step": 1382 + }, + { + "epoch": 0.22, + "learning_rate": 9.823798316555713e-06, + "logits/chosen": -0.3847478926181793, + "logits/rejected": -0.36551633477211, + "logps/chosen": -27.8013916015625, + "logps/rejected": -9.815948486328125, + "loss": 0.3357, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14936867356300354, + "rewards/margins": 0.12793369591236115, + "rewards/rejected": 0.021434975787997246, + "step": 1383 + }, + { + "epoch": 0.22, + "learning_rate": 9.823452326793004e-06, + "logits/chosen": -0.5953328609466553, + "logits/rejected": -0.5892611145973206, + "logps/chosen": -35.85717010498047, + "logps/rejected": -25.090290069580078, + "loss": 0.463, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.21496239304542542, + "rewards/margins": -0.25890618562698364, + "rewards/rejected": 0.47386857867240906, + "step": 1384 + }, + { + "epoch": 0.22, + "learning_rate": 9.82310600377538e-06, + "logits/chosen": -0.6442485451698303, + "logits/rejected": -0.6571663618087769, + "logps/chosen": -15.610197067260742, + "logps/rejected": -2.929227352142334, + "loss": 0.4905, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.22030238807201385, + "rewards/margins": -0.1020825058221817, + "rewards/rejected": 0.32238489389419556, + "step": 1385 + }, + { + "epoch": 0.22, + "learning_rate": 9.822759347526766e-06, + "logits/chosen": -0.7684510350227356, + "logits/rejected": -0.8009753823280334, + "logps/chosen": -106.59050750732422, + "logps/rejected": -129.41256713867188, + "loss": 0.5108, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8635620474815369, + "rewards/margins": -0.2873534560203552, + "rewards/rejected": 1.150915503501892, + "step": 1386 + }, + { + "epoch": 0.23, + "learning_rate": 9.822412358071114e-06, + "logits/chosen": -1.154624342918396, + "logits/rejected": -1.0832257270812988, + "logps/chosen": -152.51959228515625, + "logps/rejected": -106.39901733398438, + "loss": 0.938, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4275147914886475, + "rewards/margins": -1.1254408359527588, + "rewards/rejected": 3.5529556274414062, + "step": 1387 + }, + { + "epoch": 0.23, + "learning_rate": 9.8220650354324e-06, + "logits/chosen": -0.7764955759048462, + "logits/rejected": -0.8669737577438354, + "logps/chosen": -44.744163513183594, + "logps/rejected": -80.32957458496094, + "loss": 0.3392, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.414595127105713, + "rewards/margins": 0.5093185901641846, + "rewards/rejected": 1.9052765369415283, + "step": 1388 + }, + { + "epoch": 0.23, + "learning_rate": 9.821717379634618e-06, + "logits/chosen": -1.0047129392623901, + "logits/rejected": -0.881036639213562, + "logps/chosen": -49.4752197265625, + "logps/rejected": -63.537818908691406, + "loss": 1.8643, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7878227233886719, + "rewards/margins": 0.7175513505935669, + "rewards/rejected": 1.070271372795105, + "step": 1389 + }, + { + "epoch": 0.23, + "learning_rate": 9.82136939070179e-06, + "logits/chosen": -1.2350071668624878, + "logits/rejected": -1.2201213836669922, + "logps/chosen": -39.1644172668457, + "logps/rejected": -10.111979484558105, + "loss": 0.4931, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1629307270050049, + "rewards/margins": 0.6915910243988037, + "rewards/rejected": 0.47133970260620117, + "step": 1390 + }, + { + "epoch": 0.23, + "learning_rate": 9.821021068657955e-06, + "logits/chosen": -1.0712363719940186, + "logits/rejected": -1.2681516408920288, + "logps/chosen": -74.10456085205078, + "logps/rejected": -35.7151985168457, + "loss": 1.032, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3556268215179443, + "rewards/margins": 2.0637872219085693, + "rewards/rejected": 0.291839599609375, + "step": 1391 + }, + { + "epoch": 0.23, + "learning_rate": 9.820672413527181e-06, + "logits/chosen": -0.782370924949646, + "logits/rejected": -0.7654942870140076, + "logps/chosen": -60.40008544921875, + "logps/rejected": -58.43157196044922, + "loss": 0.1456, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.539914846420288, + "rewards/margins": 1.3461869955062866, + "rewards/rejected": 1.1937278509140015, + "step": 1392 + }, + { + "epoch": 0.23, + "learning_rate": 9.820323425333559e-06, + "logits/chosen": -0.893447995185852, + "logits/rejected": -0.7421493530273438, + "logps/chosen": -109.11514282226562, + "logps/rejected": -30.406665802001953, + "loss": 0.4011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7983947992324829, + "rewards/margins": 0.4678848385810852, + "rewards/rejected": 0.3305099606513977, + "step": 1393 + }, + { + "epoch": 0.23, + "learning_rate": 9.819974104101198e-06, + "logits/chosen": -1.0173767805099487, + "logits/rejected": -0.9304893612861633, + "logps/chosen": -124.17440795898438, + "logps/rejected": -93.28824615478516, + "loss": 0.3495, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.4183197021484375, + "rewards/margins": 0.39681482315063477, + "rewards/rejected": 5.021504878997803, + "step": 1394 + }, + { + "epoch": 0.23, + "learning_rate": 9.819624449854232e-06, + "logits/chosen": -0.48732990026474, + "logits/rejected": -0.48732990026474, + "logps/chosen": -79.50277709960938, + "logps/rejected": -79.50277709960938, + "loss": 1.1796, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9961883425712585, + "rewards/margins": 0.0, + "rewards/rejected": 0.9961883425712585, + "step": 1395 + }, + { + "epoch": 0.23, + "learning_rate": 9.81927446261682e-06, + "logits/chosen": -0.7034106254577637, + "logits/rejected": -0.697645366191864, + "logps/chosen": -11.987288475036621, + "logps/rejected": -21.782182693481445, + "loss": 0.454, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.305315226316452, + "rewards/margins": 0.2177034616470337, + "rewards/rejected": 0.08761177211999893, + "step": 1396 + }, + { + "epoch": 0.23, + "learning_rate": 9.818924142413144e-06, + "logits/chosen": -0.8786627650260925, + "logits/rejected": -0.7876309156417847, + "logps/chosen": -82.3662109375, + "logps/rejected": -44.606685638427734, + "loss": 0.2225, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1913743019104004, + "rewards/margins": 0.6247334480285645, + "rewards/rejected": 1.566640853881836, + "step": 1397 + }, + { + "epoch": 0.23, + "learning_rate": 9.818573489267408e-06, + "logits/chosen": -1.0162168741226196, + "logits/rejected": -1.065256953239441, + "logps/chosen": -57.00323486328125, + "logps/rejected": -66.3293685913086, + "loss": 0.0946, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8719193935394287, + "rewards/margins": 1.573861002922058, + "rewards/rejected": 1.2980583906173706, + "step": 1398 + }, + { + "epoch": 0.23, + "learning_rate": 9.818222503203836e-06, + "logits/chosen": -0.8975590467453003, + "logits/rejected": -0.8464637994766235, + "logps/chosen": -80.99540710449219, + "logps/rejected": -76.98063659667969, + "loss": 0.3007, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.633643388748169, + "rewards/margins": 0.5762871503829956, + "rewards/rejected": 1.0573562383651733, + "step": 1399 + }, + { + "epoch": 0.23, + "learning_rate": 9.817871184246682e-06, + "logits/chosen": -1.3444651365280151, + "logits/rejected": -1.1897860765457153, + "logps/chosen": -133.20664978027344, + "logps/rejected": -73.48467254638672, + "loss": 0.0317, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.172511100769043, + "rewards/margins": 6.315762042999268, + "rewards/rejected": 2.8567490577697754, + "step": 1400 + }, + { + "epoch": 0.23, + "learning_rate": 9.817519532420214e-06, + "logits/chosen": -0.958329439163208, + "logits/rejected": -0.9175311326980591, + "logps/chosen": -86.65948486328125, + "logps/rejected": -48.65095901489258, + "loss": 1.4704, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6143509149551392, + "rewards/margins": 0.347520112991333, + "rewards/rejected": 1.2668308019638062, + "step": 1401 + }, + { + "epoch": 0.23, + "learning_rate": 9.817167547748729e-06, + "logits/chosen": -0.8880851864814758, + "logits/rejected": -0.9172304272651672, + "logps/chosen": -94.00147247314453, + "logps/rejected": -109.9195785522461, + "loss": 0.9667, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.102345943450928, + "rewards/margins": -1.6638259887695312, + "rewards/rejected": 5.766171932220459, + "step": 1402 + }, + { + "epoch": 0.23, + "learning_rate": 9.816815230256549e-06, + "logits/chosen": -0.49469634890556335, + "logits/rejected": -0.49440842866897583, + "logps/chosen": -86.31243133544922, + "logps/rejected": -134.2646942138672, + "loss": 0.8626, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4331191778182983, + "rewards/margins": -0.16278076171875, + "rewards/rejected": 1.5958999395370483, + "step": 1403 + }, + { + "epoch": 0.23, + "learning_rate": 9.816462579968014e-06, + "logits/chosen": -1.0126149654388428, + "logits/rejected": -0.7723380327224731, + "logps/chosen": -100.81062316894531, + "logps/rejected": -112.6800765991211, + "loss": 0.3114, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.843655586242676, + "rewards/margins": 2.1302454471588135, + "rewards/rejected": 2.7134101390838623, + "step": 1404 + }, + { + "epoch": 0.23, + "learning_rate": 9.816109596907488e-06, + "logits/chosen": -1.103757619857788, + "logits/rejected": -0.9232551455497742, + "logps/chosen": -129.41134643554688, + "logps/rejected": -94.15068817138672, + "loss": 0.0246, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.019641399383545, + "rewards/margins": 3.982866048812866, + "rewards/rejected": 3.0367753505706787, + "step": 1405 + }, + { + "epoch": 0.23, + "learning_rate": 9.815756281099357e-06, + "logits/chosen": -0.8568373322486877, + "logits/rejected": -0.8139373660087585, + "logps/chosen": -51.238426208496094, + "logps/rejected": -37.6489143371582, + "loss": 0.2594, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.796331763267517, + "rewards/margins": 0.417293906211853, + "rewards/rejected": 1.379037857055664, + "step": 1406 + }, + { + "epoch": 0.23, + "learning_rate": 9.815402632568037e-06, + "logits/chosen": -1.0226582288742065, + "logits/rejected": -0.9745053052902222, + "logps/chosen": -89.25222778320312, + "logps/rejected": -88.04409790039062, + "loss": 0.9418, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.456124782562256, + "rewards/margins": -0.5616059303283691, + "rewards/rejected": 6.017730712890625, + "step": 1407 + }, + { + "epoch": 0.23, + "learning_rate": 9.815048651337956e-06, + "logits/chosen": -0.8959064483642578, + "logits/rejected": -0.8784207701683044, + "logps/chosen": -90.56304931640625, + "logps/rejected": -50.05887222290039, + "loss": 1.3784, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.022681474685669, + "rewards/margins": -0.8452823162078857, + "rewards/rejected": 1.8679637908935547, + "step": 1408 + }, + { + "epoch": 0.23, + "learning_rate": 9.814694337433577e-06, + "logits/chosen": -0.8741129636764526, + "logits/rejected": -0.8461058139801025, + "logps/chosen": -91.53385925292969, + "logps/rejected": -59.3127555847168, + "loss": 0.4363, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.8462417125701904, + "rewards/margins": -0.21567654609680176, + "rewards/rejected": 4.061918258666992, + "step": 1409 + }, + { + "epoch": 0.23, + "learning_rate": 9.814339690879376e-06, + "logits/chosen": -0.8021464347839355, + "logits/rejected": -0.8443889021873474, + "logps/chosen": -161.234619140625, + "logps/rejected": -51.14176559448242, + "loss": 0.1192, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.83878493309021, + "rewards/margins": 1.5144100189208984, + "rewards/rejected": 2.3243749141693115, + "step": 1410 + }, + { + "epoch": 0.23, + "learning_rate": 9.813984711699855e-06, + "logits/chosen": -0.7671195268630981, + "logits/rejected": -0.7671195268630981, + "logps/chosen": -38.34944534301758, + "logps/rejected": -38.34944534301758, + "loss": 1.0476, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2876869440078735, + "rewards/margins": 0.0, + "rewards/rejected": 1.2876869440078735, + "step": 1411 + }, + { + "epoch": 0.23, + "learning_rate": 9.81362939991954e-06, + "logits/chosen": -0.9007020592689514, + "logits/rejected": -0.8446071147918701, + "logps/chosen": -60.42390823364258, + "logps/rejected": -38.89107131958008, + "loss": 0.8053, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8706508874893188, + "rewards/margins": 1.0456719398498535, + "rewards/rejected": 0.8249790072441101, + "step": 1412 + }, + { + "epoch": 0.23, + "learning_rate": 9.813273755562982e-06, + "logits/chosen": -1.0065408945083618, + "logits/rejected": -0.9380553960800171, + "logps/chosen": -115.17384338378906, + "logps/rejected": -92.37777709960938, + "loss": 0.1071, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.539405822753906, + "rewards/margins": 1.4576551914215088, + "rewards/rejected": 3.0817506313323975, + "step": 1413 + }, + { + "epoch": 0.23, + "learning_rate": 9.81291777865475e-06, + "logits/chosen": -0.8461389541625977, + "logits/rejected": -0.8606702089309692, + "logps/chosen": -86.96293640136719, + "logps/rejected": -53.066749572753906, + "loss": 0.3109, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6240830421447754, + "rewards/margins": 0.33673715591430664, + "rewards/rejected": 2.2873458862304688, + "step": 1414 + }, + { + "epoch": 0.23, + "learning_rate": 9.812561469219439e-06, + "logits/chosen": -0.7007614374160767, + "logits/rejected": -0.6740411520004272, + "logps/chosen": -77.42970275878906, + "logps/rejected": -116.59169006347656, + "loss": 0.4845, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6051788330078125, + "rewards/margins": 0.8618842959403992, + "rewards/rejected": 0.7432945370674133, + "step": 1415 + }, + { + "epoch": 0.23, + "learning_rate": 9.812204827281669e-06, + "logits/chosen": -1.0451372861862183, + "logits/rejected": -0.9622447490692139, + "logps/chosen": -54.78357696533203, + "logps/rejected": -40.650238037109375, + "loss": 0.6268, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0954102277755737, + "rewards/margins": -0.35243868827819824, + "rewards/rejected": 1.447848916053772, + "step": 1416 + }, + { + "epoch": 0.23, + "learning_rate": 9.811847852866079e-06, + "logits/chosen": -0.6018654704093933, + "logits/rejected": -0.6199285984039307, + "logps/chosen": -52.685279846191406, + "logps/rejected": -74.81430053710938, + "loss": 0.6965, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2700774669647217, + "rewards/margins": 0.12839126586914062, + "rewards/rejected": 2.141686201095581, + "step": 1417 + }, + { + "epoch": 0.23, + "learning_rate": 9.811490545997331e-06, + "logits/chosen": -0.7810566425323486, + "logits/rejected": -0.7293164134025574, + "logps/chosen": -61.183204650878906, + "logps/rejected": -29.5632266998291, + "loss": 0.5733, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4466896057128906, + "rewards/margins": -0.08486771583557129, + "rewards/rejected": 1.531557321548462, + "step": 1418 + }, + { + "epoch": 0.23, + "learning_rate": 9.811132906700114e-06, + "logits/chosen": -1.0187021493911743, + "logits/rejected": -0.8900231122970581, + "logps/chosen": -81.40715789794922, + "logps/rejected": -46.89038848876953, + "loss": 0.0684, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1183128356933594, + "rewards/margins": 2.073838472366333, + "rewards/rejected": 0.04447441175580025, + "step": 1419 + }, + { + "epoch": 0.23, + "learning_rate": 9.810774934999136e-06, + "logits/chosen": -0.7780190110206604, + "logits/rejected": -0.8240765929222107, + "logps/chosen": -87.95724487304688, + "logps/rejected": -147.73004150390625, + "loss": 2.0081, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.323751926422119, + "rewards/margins": -1.8876280784606934, + "rewards/rejected": 6.2113800048828125, + "step": 1420 + }, + { + "epoch": 0.23, + "learning_rate": 9.81041663091913e-06, + "logits/chosen": -1.149672031402588, + "logits/rejected": -0.9295374155044556, + "logps/chosen": -95.96258544921875, + "logps/rejected": -79.69905090332031, + "loss": 0.0638, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.060618877410889, + "rewards/margins": 3.9152209758758545, + "rewards/rejected": 3.145397901535034, + "step": 1421 + }, + { + "epoch": 0.23, + "learning_rate": 9.810057994484851e-06, + "logits/chosen": -0.9828371405601501, + "logits/rejected": -0.9801461696624756, + "logps/chosen": -137.978759765625, + "logps/rejected": -103.03160095214844, + "loss": 0.7106, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.608163595199585, + "rewards/margins": -1.0035841464996338, + "rewards/rejected": 4.611747741699219, + "step": 1422 + }, + { + "epoch": 0.23, + "learning_rate": 9.80969902572108e-06, + "logits/chosen": -0.6559385061264038, + "logits/rejected": -0.6559385061264038, + "logps/chosen": -17.293458938598633, + "logps/rejected": -17.293458938598633, + "loss": 1.1406, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.43786296248435974, + "rewards/margins": 0.0, + "rewards/rejected": 0.43786296248435974, + "step": 1423 + }, + { + "epoch": 0.23, + "learning_rate": 9.809339724652613e-06, + "logits/chosen": -1.134367823600769, + "logits/rejected": -1.0153471231460571, + "logps/chosen": -86.58798217773438, + "logps/rejected": -116.6375732421875, + "loss": 0.4765, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4351303577423096, + "rewards/margins": -0.45478355884552, + "rewards/rejected": 1.8899139165878296, + "step": 1424 + }, + { + "epoch": 0.23, + "learning_rate": 9.808980091304279e-06, + "logits/chosen": -0.7300485968589783, + "logits/rejected": -0.7535690665245056, + "logps/chosen": -64.8046646118164, + "logps/rejected": -110.22239685058594, + "loss": 0.9143, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0846909284591675, + "rewards/margins": 0.1353539228439331, + "rewards/rejected": 0.9493370056152344, + "step": 1425 + }, + { + "epoch": 0.23, + "learning_rate": 9.808620125700925e-06, + "logits/chosen": -0.8308464288711548, + "logits/rejected": -0.9401999711990356, + "logps/chosen": -35.388397216796875, + "logps/rejected": -49.288421630859375, + "loss": 3.0836, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1124413013458252, + "rewards/margins": -5.807918548583984, + "rewards/rejected": 6.920360088348389, + "step": 1426 + }, + { + "epoch": 0.23, + "learning_rate": 9.808259827867417e-06, + "logits/chosen": -0.8583246469497681, + "logits/rejected": -0.8340738415718079, + "logps/chosen": -46.09569549560547, + "logps/rejected": -23.286720275878906, + "loss": 0.5963, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6679153442382812, + "rewards/margins": 0.0181390643119812, + "rewards/rejected": 0.6497762799263, + "step": 1427 + }, + { + "epoch": 0.23, + "learning_rate": 9.807899197828655e-06, + "logits/chosen": -0.6748468279838562, + "logits/rejected": -0.6585890054702759, + "logps/chosen": -74.88055419921875, + "logps/rejected": -31.209640502929688, + "loss": 1.0718, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7777938842773438, + "rewards/margins": 1.3828274011611938, + "rewards/rejected": 1.39496648311615, + "step": 1428 + }, + { + "epoch": 0.23, + "learning_rate": 9.807538235609549e-06, + "logits/chosen": -0.6827892065048218, + "logits/rejected": -0.6842381954193115, + "logps/chosen": -2.9786744117736816, + "logps/rejected": -1.8632839918136597, + "loss": 0.9086, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1381085366010666, + "rewards/margins": -0.1728098839521408, + "rewards/rejected": 0.3109184205532074, + "step": 1429 + }, + { + "epoch": 0.23, + "learning_rate": 9.80717694123504e-06, + "logits/chosen": -1.0486284494400024, + "logits/rejected": -1.0280964374542236, + "logps/chosen": -101.23263549804688, + "logps/rejected": -125.01116180419922, + "loss": 2.5833, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5904678106307983, + "rewards/margins": -1.1308997869491577, + "rewards/rejected": 2.721367597579956, + "step": 1430 + }, + { + "epoch": 0.23, + "learning_rate": 9.80681531473009e-06, + "logits/chosen": -1.211956262588501, + "logits/rejected": -1.1919746398925781, + "logps/chosen": -77.03521728515625, + "logps/rejected": -69.95922088623047, + "loss": 0.8263, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8378372192382812, + "rewards/margins": 0.7589668035507202, + "rewards/rejected": 1.078870415687561, + "step": 1431 + }, + { + "epoch": 0.23, + "learning_rate": 9.806453356119685e-06, + "logits/chosen": -0.38448309898376465, + "logits/rejected": -0.38448309898376465, + "logps/chosen": -1.5674383640289307, + "logps/rejected": -1.5674383640289307, + "loss": 0.4896, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.16716642677783966, + "rewards/margins": 0.0, + "rewards/rejected": 0.16716642677783966, + "step": 1432 + }, + { + "epoch": 0.23, + "learning_rate": 9.806091065428831e-06, + "logits/chosen": -0.8162040114402771, + "logits/rejected": -0.6227399706840515, + "logps/chosen": -59.55854034423828, + "logps/rejected": -26.21780776977539, + "loss": 2.1069, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.989154815673828, + "rewards/margins": 2.4655349254608154, + "rewards/rejected": 0.5236198306083679, + "step": 1433 + }, + { + "epoch": 0.23, + "learning_rate": 9.80572844268256e-06, + "logits/chosen": -0.6204530000686646, + "logits/rejected": -0.4831746220588684, + "logps/chosen": -37.76337432861328, + "logps/rejected": -59.56156921386719, + "loss": 0.8691, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2026344537734985, + "rewards/margins": -0.611609697341919, + "rewards/rejected": 1.8142441511154175, + "step": 1434 + }, + { + "epoch": 0.23, + "learning_rate": 9.805365487905926e-06, + "logits/chosen": -1.0175049304962158, + "logits/rejected": -1.232774019241333, + "logps/chosen": -63.176536560058594, + "logps/rejected": -34.49365997314453, + "loss": 0.0603, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3862671852111816, + "rewards/margins": 2.1045830249786377, + "rewards/rejected": 0.28168413043022156, + "step": 1435 + }, + { + "epoch": 0.23, + "learning_rate": 9.805002201124008e-06, + "logits/chosen": -0.9223230481147766, + "logits/rejected": -0.9215749502182007, + "logps/chosen": -32.667564392089844, + "logps/rejected": -53.40961456298828, + "loss": 0.8983, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.209977388381958, + "rewards/margins": -1.611795425415039, + "rewards/rejected": 2.821772813796997, + "step": 1436 + }, + { + "epoch": 0.23, + "learning_rate": 9.8046385823619e-06, + "logits/chosen": -0.8787326812744141, + "logits/rejected": -0.8519056439399719, + "logps/chosen": -68.19935607910156, + "logps/rejected": -51.34136199951172, + "loss": 0.7225, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6263046264648438, + "rewards/margins": -0.11411595344543457, + "rewards/rejected": 1.7404205799102783, + "step": 1437 + }, + { + "epoch": 0.23, + "learning_rate": 9.80427463164473e-06, + "logits/chosen": -0.6002478003501892, + "logits/rejected": -0.6143465042114258, + "logps/chosen": -58.33472442626953, + "logps/rejected": -127.5922622680664, + "loss": 0.2159, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.015143632888794, + "rewards/margins": 0.6728897094726562, + "rewards/rejected": 0.3422538936138153, + "step": 1438 + }, + { + "epoch": 0.23, + "learning_rate": 9.80391034899764e-06, + "logits/chosen": -1.104780912399292, + "logits/rejected": -1.0665876865386963, + "logps/chosen": -109.56723022460938, + "logps/rejected": -75.65263366699219, + "loss": 1.2911, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1157439947128296, + "rewards/margins": -1.4241670370101929, + "rewards/rejected": 2.5399110317230225, + "step": 1439 + }, + { + "epoch": 0.23, + "learning_rate": 9.803545734445802e-06, + "logits/chosen": -0.9700284004211426, + "logits/rejected": -0.8359267115592957, + "logps/chosen": -117.59624481201172, + "logps/rejected": -149.37576293945312, + "loss": 0.0405, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.132181644439697, + "rewards/margins": 2.8196418285369873, + "rewards/rejected": 3.31253981590271, + "step": 1440 + }, + { + "epoch": 0.23, + "learning_rate": 9.803180788014404e-06, + "logits/chosen": -0.9833840727806091, + "logits/rejected": -1.2193892002105713, + "logps/chosen": -68.1070327758789, + "logps/rejected": -191.17481994628906, + "loss": 5.2709, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3138389587402344, + "rewards/margins": -8.611285209655762, + "rewards/rejected": 10.925124168395996, + "step": 1441 + }, + { + "epoch": 0.23, + "learning_rate": 9.802815509728662e-06, + "logits/chosen": -0.29834461212158203, + "logits/rejected": -0.29834461212158203, + "logps/chosen": -1.8741776943206787, + "logps/rejected": -1.8741776943206787, + "loss": 0.6104, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1657334566116333, + "rewards/margins": 0.0, + "rewards/rejected": 0.1657334566116333, + "step": 1442 + }, + { + "epoch": 0.23, + "learning_rate": 9.802449899613814e-06, + "logits/chosen": -0.7757880091667175, + "logits/rejected": -0.9247838258743286, + "logps/chosen": -135.26089477539062, + "logps/rejected": -166.69680786132812, + "loss": 3.3016, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3756226301193237, + "rewards/margins": -6.435421466827393, + "rewards/rejected": 7.811044216156006, + "step": 1443 + }, + { + "epoch": 0.23, + "learning_rate": 9.802083957695116e-06, + "logits/chosen": -1.1242034435272217, + "logits/rejected": -1.0197657346725464, + "logps/chosen": -111.23066711425781, + "logps/rejected": -45.4809455871582, + "loss": 1.3357, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.128787517547607, + "rewards/margins": 3.9069197177886963, + "rewards/rejected": 0.22186775505542755, + "step": 1444 + }, + { + "epoch": 0.23, + "learning_rate": 9.801717683997856e-06, + "logits/chosen": -1.157844066619873, + "logits/rejected": -1.0094822645187378, + "logps/chosen": -104.45458984375, + "logps/rejected": -49.20832061767578, + "loss": 0.0594, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7952210903167725, + "rewards/margins": 2.3190529346466064, + "rewards/rejected": 0.47616806626319885, + "step": 1445 + }, + { + "epoch": 0.23, + "learning_rate": 9.801351078547339e-06, + "logits/chosen": -0.8941054940223694, + "logits/rejected": -0.8941054940223694, + "logps/chosen": -51.71213150024414, + "logps/rejected": -51.71213150024414, + "loss": 0.6376, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.592249631881714, + "rewards/margins": 0.0, + "rewards/rejected": 2.592249631881714, + "step": 1446 + }, + { + "epoch": 0.23, + "learning_rate": 9.800984141368892e-06, + "logits/chosen": -1.075014352798462, + "logits/rejected": -1.081671953201294, + "logps/chosen": -80.20519256591797, + "logps/rejected": -104.7479248046875, + "loss": 0.3059, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4699349403381348, + "rewards/margins": 1.699164628982544, + "rewards/rejected": 0.770770251750946, + "step": 1447 + }, + { + "epoch": 0.24, + "learning_rate": 9.800616872487869e-06, + "logits/chosen": -0.8116558194160461, + "logits/rejected": -0.8116558194160461, + "logps/chosen": -42.8642692565918, + "logps/rejected": -42.8642692565918, + "loss": 1.0611, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7421771883964539, + "rewards/margins": 0.0, + "rewards/rejected": 0.7421771883964539, + "step": 1448 + }, + { + "epoch": 0.24, + "learning_rate": 9.800249271929645e-06, + "logits/chosen": -1.275158166885376, + "logits/rejected": -1.3165044784545898, + "logps/chosen": -246.78121948242188, + "logps/rejected": -47.60959243774414, + "loss": 0.1094, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.164938449859619, + "rewards/margins": 1.4135005474090576, + "rewards/rejected": 3.7514379024505615, + "step": 1449 + }, + { + "epoch": 0.24, + "learning_rate": 9.799881339719615e-06, + "logits/chosen": -0.8786641359329224, + "logits/rejected": -0.9041182398796082, + "logps/chosen": -62.71476745605469, + "logps/rejected": -67.96363830566406, + "loss": 1.0236, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4583427906036377, + "rewards/margins": 0.3144240379333496, + "rewards/rejected": 2.143918752670288, + "step": 1450 + }, + { + "epoch": 0.24, + "learning_rate": 9.799513075883202e-06, + "logits/chosen": -0.7084856629371643, + "logits/rejected": -0.7427960634231567, + "logps/chosen": -128.72286987304688, + "logps/rejected": -69.60809326171875, + "loss": 1.207, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5116485953330994, + "rewards/margins": -2.0195083618164062, + "rewards/rejected": 2.5311570167541504, + "step": 1451 + }, + { + "epoch": 0.24, + "learning_rate": 9.799144480445849e-06, + "logits/chosen": -0.685734212398529, + "logits/rejected": -0.5510356426239014, + "logps/chosen": -65.03904724121094, + "logps/rejected": -14.209437370300293, + "loss": 0.0717, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.028887987136841, + "rewards/margins": 1.9284430742263794, + "rewards/rejected": 1.1004449129104614, + "step": 1452 + }, + { + "epoch": 0.24, + "learning_rate": 9.798775553433022e-06, + "logits/chosen": -0.7403131723403931, + "logits/rejected": -0.7316851615905762, + "logps/chosen": -73.0804443359375, + "logps/rejected": -50.1316032409668, + "loss": 1.2816, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0212318897247314, + "rewards/margins": 1.440877914428711, + "rewards/rejected": 0.5803539156913757, + "step": 1453 + }, + { + "epoch": 0.24, + "learning_rate": 9.79840629487021e-06, + "logits/chosen": -0.8869913816452026, + "logits/rejected": -0.9029797911643982, + "logps/chosen": -3.9779369831085205, + "logps/rejected": -13.560287475585938, + "loss": 0.4136, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36180898547172546, + "rewards/margins": 0.19066105782985687, + "rewards/rejected": 0.1711479276418686, + "step": 1454 + }, + { + "epoch": 0.24, + "learning_rate": 9.798036704782927e-06, + "logits/chosen": -0.8977634906768799, + "logits/rejected": -0.7147353291511536, + "logps/chosen": -82.86138916015625, + "logps/rejected": -32.881839752197266, + "loss": 0.0339, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.351758003234863, + "rewards/margins": 2.7257916927337646, + "rewards/rejected": 1.6259663105010986, + "step": 1455 + }, + { + "epoch": 0.24, + "learning_rate": 9.797666783196707e-06, + "logits/chosen": -1.1054571866989136, + "logits/rejected": -1.1432409286499023, + "logps/chosen": -97.61074829101562, + "logps/rejected": -135.7197265625, + "loss": 0.6771, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9337601065635681, + "rewards/margins": -0.9105057120323181, + "rewards/rejected": 1.8442658185958862, + "step": 1456 + }, + { + "epoch": 0.24, + "learning_rate": 9.797296530137108e-06, + "logits/chosen": -0.46997594833374023, + "logits/rejected": -0.46997594833374023, + "logps/chosen": -40.33574676513672, + "logps/rejected": -40.33574676513672, + "loss": 1.1287, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2794242799282074, + "rewards/margins": 0.0, + "rewards/rejected": 0.2794242799282074, + "step": 1457 + }, + { + "epoch": 0.24, + "learning_rate": 9.796925945629711e-06, + "logits/chosen": -1.046186089515686, + "logits/rejected": -1.0722156763076782, + "logps/chosen": -51.05333709716797, + "logps/rejected": -50.72948455810547, + "loss": 0.5963, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7248542308807373, + "rewards/margins": 0.41455984115600586, + "rewards/rejected": 2.3102943897247314, + "step": 1458 + }, + { + "epoch": 0.24, + "learning_rate": 9.79655502970012e-06, + "logits/chosen": -1.03084397315979, + "logits/rejected": -1.0464955568313599, + "logps/chosen": -42.545326232910156, + "logps/rejected": -77.60543823242188, + "loss": 1.0972, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.33540415763855, + "rewards/margins": 0.6875373125076294, + "rewards/rejected": 1.6478668451309204, + "step": 1459 + }, + { + "epoch": 0.24, + "learning_rate": 9.796183782373962e-06, + "logits/chosen": -1.254162073135376, + "logits/rejected": -1.2705036401748657, + "logps/chosen": -109.88836669921875, + "logps/rejected": -156.557861328125, + "loss": 0.274, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4081573486328125, + "rewards/margins": 0.4401642084121704, + "rewards/rejected": 1.967993140220642, + "step": 1460 + }, + { + "epoch": 0.24, + "learning_rate": 9.795812203676887e-06, + "logits/chosen": -0.7959632277488708, + "logits/rejected": -0.6871716976165771, + "logps/chosen": -70.52734375, + "logps/rejected": -16.718299865722656, + "loss": 0.8291, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.173940420150757, + "rewards/margins": 2.0298516750335693, + "rewards/rejected": 0.1440887451171875, + "step": 1461 + }, + { + "epoch": 0.24, + "learning_rate": 9.795440293634566e-06, + "logits/chosen": -0.629543125629425, + "logits/rejected": -0.629543125629425, + "logps/chosen": -85.54768371582031, + "logps/rejected": -85.54768371582031, + "loss": 0.6476, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7204238772392273, + "rewards/margins": 0.0, + "rewards/rejected": 0.7204238772392273, + "step": 1462 + }, + { + "epoch": 0.24, + "learning_rate": 9.795068052272698e-06, + "logits/chosen": -0.8884097337722778, + "logits/rejected": -0.8048689961433411, + "logps/chosen": -72.62138366699219, + "logps/rejected": -25.90389633178711, + "loss": 0.1187, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7580337524414062, + "rewards/margins": 1.5415313243865967, + "rewards/rejected": 1.2165024280548096, + "step": 1463 + }, + { + "epoch": 0.24, + "learning_rate": 9.794695479616996e-06, + "logits/chosen": -1.235804796218872, + "logits/rejected": -0.8568255305290222, + "logps/chosen": -167.2122802734375, + "logps/rejected": -78.52291870117188, + "loss": 0.0195, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.947262763977051, + "rewards/margins": 6.090553283691406, + "rewards/rejected": 1.856709361076355, + "step": 1464 + }, + { + "epoch": 0.24, + "learning_rate": 9.794322575693207e-06, + "logits/chosen": -0.6395488381385803, + "logits/rejected": -0.6166461110115051, + "logps/chosen": -65.12590789794922, + "logps/rejected": -64.90082550048828, + "loss": 0.2725, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3020179271698, + "rewards/margins": 0.46938014030456543, + "rewards/rejected": 1.8326377868652344, + "step": 1465 + }, + { + "epoch": 0.24, + "learning_rate": 9.793949340527091e-06, + "logits/chosen": -0.8109131455421448, + "logits/rejected": -0.5636789798736572, + "logps/chosen": -116.56999206542969, + "logps/rejected": -43.80475616455078, + "loss": 0.1891, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.7369704246521, + "rewards/margins": 2.7075254917144775, + "rewards/rejected": 2.029444932937622, + "step": 1466 + }, + { + "epoch": 0.24, + "learning_rate": 9.793575774144436e-06, + "logits/chosen": -0.8929963707923889, + "logits/rejected": -0.894034206867218, + "logps/chosen": -74.07551574707031, + "logps/rejected": -56.479698181152344, + "loss": 0.6806, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3722076416015625, + "rewards/margins": -0.07480084896087646, + "rewards/rejected": 1.447008490562439, + "step": 1467 + }, + { + "epoch": 0.24, + "learning_rate": 9.793201876571052e-06, + "logits/chosen": -0.739241898059845, + "logits/rejected": -0.933256208896637, + "logps/chosen": -114.77606201171875, + "logps/rejected": -108.40616607666016, + "loss": 1.9634, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5538910627365112, + "rewards/margins": -3.223212242126465, + "rewards/rejected": 4.777103424072266, + "step": 1468 + }, + { + "epoch": 0.24, + "learning_rate": 9.792827647832773e-06, + "logits/chosen": -0.7325522899627686, + "logits/rejected": -0.7773848176002502, + "logps/chosen": -52.12174606323242, + "logps/rejected": -78.58209228515625, + "loss": 2.0759, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1700531244277954, + "rewards/margins": -3.1141719818115234, + "rewards/rejected": 4.284224987030029, + "step": 1469 + }, + { + "epoch": 0.24, + "learning_rate": 9.792453087955454e-06, + "logits/chosen": -1.1492549180984497, + "logits/rejected": -0.9304440021514893, + "logps/chosen": -111.736328125, + "logps/rejected": -21.989124298095703, + "loss": 0.9172, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.311276435852051, + "rewards/margins": 4.794698238372803, + "rewards/rejected": 0.5165783166885376, + "step": 1470 + }, + { + "epoch": 0.24, + "learning_rate": 9.792078196964971e-06, + "logits/chosen": -1.1405529975891113, + "logits/rejected": -1.1159663200378418, + "logps/chosen": -78.45580291748047, + "logps/rejected": -10.580093383789062, + "loss": 0.2514, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7715301513671875, + "rewards/margins": 0.9688640236854553, + "rewards/rejected": 0.8026661276817322, + "step": 1471 + }, + { + "epoch": 0.24, + "learning_rate": 9.79170297488723e-06, + "logits/chosen": -0.7539321780204773, + "logits/rejected": -0.7407183051109314, + "logps/chosen": -40.863258361816406, + "logps/rejected": -63.45155715942383, + "loss": 0.2495, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.155602216720581, + "rewards/margins": 0.5836611986160278, + "rewards/rejected": 1.5719410181045532, + "step": 1472 + }, + { + "epoch": 0.24, + "learning_rate": 9.791327421748151e-06, + "logits/chosen": -0.9715359210968018, + "logits/rejected": -0.9729205965995789, + "logps/chosen": -98.328857421875, + "logps/rejected": -73.54039001464844, + "loss": 1.2784, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.834173560142517, + "rewards/margins": -0.599418044090271, + "rewards/rejected": 2.433591604232788, + "step": 1473 + }, + { + "epoch": 0.24, + "learning_rate": 9.790951537573686e-06, + "logits/chosen": -0.7488531470298767, + "logits/rejected": -0.7397010922431946, + "logps/chosen": -78.43739318847656, + "logps/rejected": -78.03083801269531, + "loss": 0.6571, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4196197986602783, + "rewards/margins": -0.899956464767456, + "rewards/rejected": 2.3195762634277344, + "step": 1474 + }, + { + "epoch": 0.24, + "learning_rate": 9.790575322389799e-06, + "logits/chosen": -0.8862642049789429, + "logits/rejected": -0.8609411716461182, + "logps/chosen": -36.317840576171875, + "logps/rejected": -7.076122283935547, + "loss": 0.7265, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1592693328857422, + "rewards/margins": 0.32706135511398315, + "rewards/rejected": 0.832207977771759, + "step": 1475 + }, + { + "epoch": 0.24, + "learning_rate": 9.790198776222488e-06, + "logits/chosen": -0.8512038588523865, + "logits/rejected": -0.8346018195152283, + "logps/chosen": -103.883056640625, + "logps/rejected": -137.44183349609375, + "loss": 0.798, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3305389881134033, + "rewards/margins": -1.0242218971252441, + "rewards/rejected": 3.3547608852386475, + "step": 1476 + }, + { + "epoch": 0.24, + "learning_rate": 9.789821899097767e-06, + "logits/chosen": -0.918215274810791, + "logits/rejected": -0.918215274810791, + "logps/chosen": -0.9980454444885254, + "logps/rejected": -0.9980454444885254, + "loss": 0.8816, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2855856120586395, + "rewards/margins": 0.0, + "rewards/rejected": 0.2855856120586395, + "step": 1477 + }, + { + "epoch": 0.24, + "learning_rate": 9.789444691041673e-06, + "logits/chosen": -1.3321831226348877, + "logits/rejected": -1.1826642751693726, + "logps/chosen": -131.3455047607422, + "logps/rejected": -52.24603271484375, + "loss": 0.1385, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.101576328277588, + "rewards/margins": 1.5818696022033691, + "rewards/rejected": 0.5197067260742188, + "step": 1478 + }, + { + "epoch": 0.24, + "learning_rate": 9.78906715208027e-06, + "logits/chosen": -0.978563666343689, + "logits/rejected": -0.9468473196029663, + "logps/chosen": -75.6688003540039, + "logps/rejected": -93.51077270507812, + "loss": 0.4562, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6987686157226562, + "rewards/margins": -0.11068272590637207, + "rewards/rejected": 1.8094513416290283, + "step": 1479 + }, + { + "epoch": 0.24, + "learning_rate": 9.78868928223964e-06, + "logits/chosen": -0.828710675239563, + "logits/rejected": -0.7640289664268494, + "logps/chosen": -34.06134796142578, + "logps/rejected": -17.62076759338379, + "loss": 0.9265, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3328590393066406, + "rewards/margins": 0.002235591411590576, + "rewards/rejected": 0.33062344789505005, + "step": 1480 + }, + { + "epoch": 0.24, + "learning_rate": 9.788311081545894e-06, + "logits/chosen": -0.8587989211082458, + "logits/rejected": -0.872707188129425, + "logps/chosen": -62.867515563964844, + "logps/rejected": -42.3294792175293, + "loss": 1.3288, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5200058221817017, + "rewards/margins": -0.5252643823623657, + "rewards/rejected": 2.0452702045440674, + "step": 1481 + }, + { + "epoch": 0.24, + "learning_rate": 9.787932550025158e-06, + "logits/chosen": -1.1859073638916016, + "logits/rejected": -1.2865477800369263, + "logps/chosen": -182.74395751953125, + "logps/rejected": -162.97677612304688, + "loss": 1.8792, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.838693141937256, + "rewards/margins": -3.295274257659912, + "rewards/rejected": 8.133967399597168, + "step": 1482 + }, + { + "epoch": 0.24, + "learning_rate": 9.787553687703586e-06, + "logits/chosen": -0.6361171007156372, + "logits/rejected": -0.6325987577438354, + "logps/chosen": -2.371062994003296, + "logps/rejected": -6.93922758102417, + "loss": 0.6688, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3186749815940857, + "rewards/margins": -0.04623281955718994, + "rewards/rejected": 0.36490780115127563, + "step": 1483 + }, + { + "epoch": 0.24, + "learning_rate": 9.787174494607357e-06, + "logits/chosen": -0.7655963897705078, + "logits/rejected": -0.7775679230690002, + "logps/chosen": -39.41337203979492, + "logps/rejected": -104.38523864746094, + "loss": 0.557, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.482129693031311, + "rewards/margins": 0.5235252380371094, + "rewards/rejected": 0.9586044549942017, + "step": 1484 + }, + { + "epoch": 0.24, + "learning_rate": 9.786794970762664e-06, + "logits/chosen": -0.7164024710655212, + "logits/rejected": -0.7361848950386047, + "logps/chosen": -61.298919677734375, + "logps/rejected": -44.956626892089844, + "loss": 0.5384, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7001510858535767, + "rewards/margins": 0.7597587704658508, + "rewards/rejected": 0.9403923153877258, + "step": 1485 + }, + { + "epoch": 0.24, + "learning_rate": 9.786415116195733e-06, + "logits/chosen": -1.0295311212539673, + "logits/rejected": -0.9896060824394226, + "logps/chosen": -76.37992858886719, + "logps/rejected": -95.72737121582031, + "loss": 0.3795, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.324893474578857, + "rewards/margins": 0.3238525390625, + "rewards/rejected": 4.001040935516357, + "step": 1486 + }, + { + "epoch": 0.24, + "learning_rate": 9.786034930932808e-06, + "logits/chosen": -1.0916416645050049, + "logits/rejected": -1.0802645683288574, + "logps/chosen": -107.73628234863281, + "logps/rejected": -50.2476921081543, + "loss": 1.4183, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5300537347793579, + "rewards/margins": -1.345828652381897, + "rewards/rejected": 1.8758823871612549, + "step": 1487 + }, + { + "epoch": 0.24, + "learning_rate": 9.785654415000155e-06, + "logits/chosen": -0.8994265794754028, + "logits/rejected": -0.9690906405448914, + "logps/chosen": -79.01080322265625, + "logps/rejected": -139.89129638671875, + "loss": 1.9759, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3843125104904175, + "rewards/margins": -3.144914150238037, + "rewards/rejected": 4.529226779937744, + "step": 1488 + }, + { + "epoch": 0.24, + "learning_rate": 9.785273568424064e-06, + "logits/chosen": -0.6156138777732849, + "logits/rejected": -0.5126557350158691, + "logps/chosen": -34.873924255371094, + "logps/rejected": -57.22180938720703, + "loss": 0.0749, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7968239784240723, + "rewards/margins": 1.8369072675704956, + "rewards/rejected": 0.9599167108535767, + "step": 1489 + }, + { + "epoch": 0.24, + "learning_rate": 9.784892391230847e-06, + "logits/chosen": -0.7088137269020081, + "logits/rejected": -0.7470302581787109, + "logps/chosen": -40.1612434387207, + "logps/rejected": -45.10132598876953, + "loss": 0.5442, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3761730194091797, + "rewards/margins": -0.05841422080993652, + "rewards/rejected": 2.434587240219116, + "step": 1490 + }, + { + "epoch": 0.24, + "learning_rate": 9.784510883446841e-06, + "logits/chosen": -1.0154926776885986, + "logits/rejected": -0.8491749167442322, + "logps/chosen": -87.79063415527344, + "logps/rejected": -27.699199676513672, + "loss": 0.0531, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.87290358543396, + "rewards/margins": 2.207803726196289, + "rewards/rejected": 0.6650997400283813, + "step": 1491 + }, + { + "epoch": 0.24, + "learning_rate": 9.784129045098405e-06, + "logits/chosen": -1.0288196802139282, + "logits/rejected": -0.9158037900924683, + "logps/chosen": -99.84060668945312, + "logps/rejected": -35.74763488769531, + "loss": 0.3841, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7089500427246094, + "rewards/margins": 1.5476043224334717, + "rewards/rejected": 0.1613456755876541, + "step": 1492 + }, + { + "epoch": 0.24, + "learning_rate": 9.78374687621192e-06, + "logits/chosen": -1.295852780342102, + "logits/rejected": -1.1916325092315674, + "logps/chosen": -90.38262939453125, + "logps/rejected": -29.831682205200195, + "loss": 1.7484, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8903213739395142, + "rewards/margins": 1.6716760396957397, + "rewards/rejected": 0.21864528954029083, + "step": 1493 + }, + { + "epoch": 0.24, + "learning_rate": 9.78336437681379e-06, + "logits/chosen": -0.9217774868011475, + "logits/rejected": -1.0484230518341064, + "logps/chosen": -43.075809478759766, + "logps/rejected": -106.26789855957031, + "loss": 1.3192, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.857548236846924, + "rewards/margins": -2.5013113021850586, + "rewards/rejected": 5.358859539031982, + "step": 1494 + }, + { + "epoch": 0.24, + "learning_rate": 9.782981546930442e-06, + "logits/chosen": -1.0443122386932373, + "logits/rejected": -1.021441102027893, + "logps/chosen": -64.56690216064453, + "logps/rejected": -99.23479461669922, + "loss": 0.3501, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1857011318206787, + "rewards/margins": 0.023792266845703125, + "rewards/rejected": 2.1619088649749756, + "step": 1495 + }, + { + "epoch": 0.24, + "learning_rate": 9.782598386588324e-06, + "logits/chosen": -0.6302773952484131, + "logits/rejected": -0.615727961063385, + "logps/chosen": -28.837282180786133, + "logps/rejected": -1.4869908094406128, + "loss": 0.7691, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.04782581329345703, + "rewards/margins": -0.4061306118965149, + "rewards/rejected": 0.4539564251899719, + "step": 1496 + }, + { + "epoch": 0.24, + "learning_rate": 9.782214895813913e-06, + "logits/chosen": -0.8784348368644714, + "logits/rejected": -1.0049906969070435, + "logps/chosen": -109.43766784667969, + "logps/rejected": -136.33212280273438, + "loss": 2.6529, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7570899724960327, + "rewards/margins": -5.275454044342041, + "rewards/rejected": 7.032544136047363, + "step": 1497 + }, + { + "epoch": 0.24, + "learning_rate": 9.781831074633703e-06, + "logits/chosen": -1.1362663507461548, + "logits/rejected": -1.3001550436019897, + "logps/chosen": -80.24903869628906, + "logps/rejected": -37.396080017089844, + "loss": 0.1349, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5363807678222656, + "rewards/margins": 1.250624418258667, + "rewards/rejected": 0.28575631976127625, + "step": 1498 + }, + { + "epoch": 0.24, + "learning_rate": 9.781446923074212e-06, + "logits/chosen": -1.1851532459259033, + "logits/rejected": -1.0254615545272827, + "logps/chosen": -149.96743774414062, + "logps/rejected": -94.13966369628906, + "loss": 0.1285, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.170932292938232, + "rewards/margins": 1.2514269351959229, + "rewards/rejected": 3.9195053577423096, + "step": 1499 + }, + { + "epoch": 0.24, + "learning_rate": 9.781062441161979e-06, + "logits/chosen": -0.8772566318511963, + "logits/rejected": -0.7484308481216431, + "logps/chosen": -57.01393127441406, + "logps/rejected": -16.788806915283203, + "loss": 0.9552, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.524871826171875, + "rewards/margins": 1.2251983880996704, + "rewards/rejected": 0.299673467874527, + "step": 1500 + }, + { + "epoch": 0.24, + "learning_rate": 9.78067762892357e-06, + "logits/chosen": -0.8014318346977234, + "logits/rejected": -0.7520307302474976, + "logps/chosen": -87.68830871582031, + "logps/rejected": -59.382606506347656, + "loss": 0.5395, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7792305946350098, + "rewards/margins": 0.54886794090271, + "rewards/rejected": 2.2303626537323, + "step": 1501 + }, + { + "epoch": 0.24, + "learning_rate": 9.780292486385575e-06, + "logits/chosen": -0.9999932050704956, + "logits/rejected": -0.790783166885376, + "logps/chosen": -103.4359130859375, + "logps/rejected": -43.97731399536133, + "loss": 0.0342, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.68271803855896, + "rewards/margins": 3.1958119869232178, + "rewards/rejected": 0.4869060516357422, + "step": 1502 + }, + { + "epoch": 0.24, + "learning_rate": 9.7799070135746e-06, + "logits/chosen": -0.7102030515670776, + "logits/rejected": -0.7102030515670776, + "logps/chosen": -21.4853572845459, + "logps/rejected": -21.4853572845459, + "loss": 0.5621, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.122844934463501, + "rewards/margins": 0.0, + "rewards/rejected": 1.122844934463501, + "step": 1503 + }, + { + "epoch": 0.24, + "learning_rate": 9.779521210517277e-06, + "logits/chosen": -0.8567198514938354, + "logits/rejected": -0.8620771765708923, + "logps/chosen": -94.81766510009766, + "logps/rejected": -161.9098663330078, + "loss": 0.3338, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8241866827011108, + "rewards/margins": 0.29861748218536377, + "rewards/rejected": 1.525569200515747, + "step": 1504 + }, + { + "epoch": 0.24, + "learning_rate": 9.779135077240262e-06, + "logits/chosen": -0.569701611995697, + "logits/rejected": -0.5213701128959656, + "logps/chosen": -39.269596099853516, + "logps/rejected": -36.72677993774414, + "loss": 0.5893, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1071712970733643, + "rewards/margins": 1.1304851770401, + "rewards/rejected": 0.9766861200332642, + "step": 1505 + }, + { + "epoch": 0.24, + "learning_rate": 9.778748613770234e-06, + "logits/chosen": -0.4286744296550751, + "logits/rejected": -0.4286744296550751, + "logps/chosen": -36.751136779785156, + "logps/rejected": -36.751136779785156, + "loss": 0.4571, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3969825506210327, + "rewards/margins": 0.0, + "rewards/rejected": 1.3969825506210327, + "step": 1506 + }, + { + "epoch": 0.24, + "learning_rate": 9.778361820133896e-06, + "logits/chosen": -0.6461842060089111, + "logits/rejected": -0.6486066579818726, + "logps/chosen": -6.660622596740723, + "logps/rejected": -16.985984802246094, + "loss": 0.7615, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.013492775149643421, + "rewards/margins": 0.15486183762550354, + "rewards/rejected": -0.1413690596818924, + "step": 1507 + }, + { + "epoch": 0.24, + "learning_rate": 9.777974696357969e-06, + "logits/chosen": -0.703579306602478, + "logits/rejected": -0.7045972943305969, + "logps/chosen": -23.123451232910156, + "logps/rejected": -41.15799331665039, + "loss": 0.6087, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.634132981300354, + "rewards/margins": 0.17867910861968994, + "rewards/rejected": 0.45545387268066406, + "step": 1508 + }, + { + "epoch": 0.24, + "learning_rate": 9.777587242469197e-06, + "logits/chosen": -0.6118648648262024, + "logits/rejected": -0.6118648648262024, + "logps/chosen": -29.420019149780273, + "logps/rejected": -29.420019149780273, + "loss": 1.556, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7287306189537048, + "rewards/margins": 0.0, + "rewards/rejected": 0.7287306189537048, + "step": 1509 + }, + { + "epoch": 0.25, + "learning_rate": 9.777199458494356e-06, + "logits/chosen": -1.1590665578842163, + "logits/rejected": -1.0874615907669067, + "logps/chosen": -143.55239868164062, + "logps/rejected": -45.169830322265625, + "loss": 0.0613, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.422094821929932, + "rewards/margins": 5.047575950622559, + "rewards/rejected": 0.3745189607143402, + "step": 1510 + }, + { + "epoch": 0.25, + "learning_rate": 9.776811344460233e-06, + "logits/chosen": -1.009263277053833, + "logits/rejected": -0.7696248888969421, + "logps/chosen": -83.463134765625, + "logps/rejected": -49.6712532043457, + "loss": 0.3543, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7748939394950867, + "rewards/margins": 0.46821439266204834, + "rewards/rejected": 0.30667954683303833, + "step": 1511 + }, + { + "epoch": 0.25, + "learning_rate": 9.776422900393645e-06, + "logits/chosen": -0.614257276058197, + "logits/rejected": -0.9612492322921753, + "logps/chosen": -44.05884552001953, + "logps/rejected": -39.42517852783203, + "loss": 0.1282, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2987053394317627, + "rewards/margins": 1.3329200744628906, + "rewards/rejected": 0.9657852053642273, + "step": 1512 + }, + { + "epoch": 0.25, + "learning_rate": 9.776034126321429e-06, + "logits/chosen": -1.1537976264953613, + "logits/rejected": -0.8985092639923096, + "logps/chosen": -103.85316467285156, + "logps/rejected": -78.32598876953125, + "loss": 0.8463, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3028496503829956, + "rewards/margins": -0.6425079107284546, + "rewards/rejected": 1.9453575611114502, + "step": 1513 + }, + { + "epoch": 0.25, + "learning_rate": 9.775645022270448e-06, + "logits/chosen": -1.3008451461791992, + "logits/rejected": -1.3548409938812256, + "logps/chosen": -106.39700317382812, + "logps/rejected": -139.51651000976562, + "loss": 0.7822, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.7690064907073975, + "rewards/margins": -1.1096618175506592, + "rewards/rejected": 4.878668308258057, + "step": 1514 + }, + { + "epoch": 0.25, + "learning_rate": 9.775255588267582e-06, + "logits/chosen": -0.7387905716896057, + "logits/rejected": -0.7124733924865723, + "logps/chosen": -55.08832550048828, + "logps/rejected": -92.23145294189453, + "loss": 0.2688, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.361461639404297, + "rewards/margins": 0.487762451171875, + "rewards/rejected": 1.8736991882324219, + "step": 1515 + }, + { + "epoch": 0.25, + "learning_rate": 9.774865824339738e-06, + "logits/chosen": -0.37332165241241455, + "logits/rejected": -0.38881218433380127, + "logps/chosen": -23.428855895996094, + "logps/rejected": -29.673091888427734, + "loss": 0.5132, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2142818421125412, + "rewards/margins": -0.42886316776275635, + "rewards/rejected": 0.6431450247764587, + "step": 1516 + }, + { + "epoch": 0.25, + "learning_rate": 9.774475730513847e-06, + "logits/chosen": -0.8013870716094971, + "logits/rejected": -0.7306302785873413, + "logps/chosen": -85.41629028320312, + "logps/rejected": -61.45459747314453, + "loss": 0.9888, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3551911115646362, + "rewards/margins": -0.14698410034179688, + "rewards/rejected": 1.502175211906433, + "step": 1517 + }, + { + "epoch": 0.25, + "learning_rate": 9.774085306816859e-06, + "logits/chosen": -1.1048592329025269, + "logits/rejected": -1.1050572395324707, + "logps/chosen": -104.3596420288086, + "logps/rejected": -67.23062133789062, + "loss": 0.6299, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8220771551132202, + "rewards/margins": -0.7087196111679077, + "rewards/rejected": 2.530796766281128, + "step": 1518 + }, + { + "epoch": 0.25, + "learning_rate": 9.77369455327575e-06, + "logits/chosen": -0.799109160900116, + "logits/rejected": -0.7868516445159912, + "logps/chosen": -82.14404296875, + "logps/rejected": -77.71221160888672, + "loss": 1.9765, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8800086975097656, + "rewards/margins": -0.003650665283203125, + "rewards/rejected": 1.8836593627929688, + "step": 1519 + }, + { + "epoch": 0.25, + "learning_rate": 9.773303469917516e-06, + "logits/chosen": -0.6363345384597778, + "logits/rejected": -0.6188555359840393, + "logps/chosen": -18.55117416381836, + "logps/rejected": -6.244017124176025, + "loss": 0.363, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2596418559551239, + "rewards/margins": -0.0015224814414978027, + "rewards/rejected": 0.2611643373966217, + "step": 1520 + }, + { + "epoch": 0.25, + "learning_rate": 9.772912056769177e-06, + "logits/chosen": -1.1518456935882568, + "logits/rejected": -0.9473250508308411, + "logps/chosen": -167.15835571289062, + "logps/rejected": -32.956634521484375, + "loss": 0.2351, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.838494896888733, + "rewards/margins": 1.9295822381973267, + "rewards/rejected": -0.09108734130859375, + "step": 1521 + }, + { + "epoch": 0.25, + "learning_rate": 9.772520313857777e-06, + "logits/chosen": -0.8909116983413696, + "logits/rejected": -0.8694998621940613, + "logps/chosen": -93.99263000488281, + "logps/rejected": -88.51826477050781, + "loss": 0.4615, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.540963649749756, + "rewards/margins": 1.4312419891357422, + "rewards/rejected": 4.109721660614014, + "step": 1522 + }, + { + "epoch": 0.25, + "learning_rate": 9.772128241210381e-06, + "logits/chosen": -0.8246636390686035, + "logits/rejected": -0.7734432816505432, + "logps/chosen": -80.14016723632812, + "logps/rejected": -56.287200927734375, + "loss": 0.9266, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7965774536132812, + "rewards/margins": -0.6486732959747314, + "rewards/rejected": 1.4452507495880127, + "step": 1523 + }, + { + "epoch": 0.25, + "learning_rate": 9.77173583885408e-06, + "logits/chosen": -0.6054477095603943, + "logits/rejected": -0.626067578792572, + "logps/chosen": -25.218103408813477, + "logps/rejected": -61.72137451171875, + "loss": 1.2669, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4566503763198853, + "rewards/margins": 0.7490713000297546, + "rewards/rejected": 0.7075790762901306, + "step": 1524 + }, + { + "epoch": 0.25, + "learning_rate": 9.771343106815981e-06, + "logits/chosen": -0.6329940557479858, + "logits/rejected": -0.6393745541572571, + "logps/chosen": -8.134262084960938, + "logps/rejected": -1.3756296634674072, + "loss": 0.4491, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.21606293320655823, + "rewards/margins": -0.07929781079292297, + "rewards/rejected": 0.2953607439994812, + "step": 1525 + }, + { + "epoch": 0.25, + "learning_rate": 9.77095004512322e-06, + "logits/chosen": -0.7207462787628174, + "logits/rejected": -0.6347429156303406, + "logps/chosen": -43.38246536254883, + "logps/rejected": -68.98722839355469, + "loss": 0.3651, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9407402276992798, + "rewards/margins": -0.06812679767608643, + "rewards/rejected": 2.008867025375366, + "step": 1526 + }, + { + "epoch": 0.25, + "learning_rate": 9.770556653802954e-06, + "logits/chosen": -0.6907873749732971, + "logits/rejected": -0.4695177376270294, + "logps/chosen": -64.6058349609375, + "logps/rejected": -18.717899322509766, + "loss": 0.6128, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4030685424804688, + "rewards/margins": 1.796091079711914, + "rewards/rejected": 0.6069774627685547, + "step": 1527 + }, + { + "epoch": 0.25, + "learning_rate": 9.770162932882363e-06, + "logits/chosen": -0.7396957874298096, + "logits/rejected": -0.659517765045166, + "logps/chosen": -64.54843139648438, + "logps/rejected": -20.93927764892578, + "loss": 0.3189, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.14593505859375, + "rewards/margins": 0.7477563619613647, + "rewards/rejected": 0.39817866683006287, + "step": 1528 + }, + { + "epoch": 0.25, + "learning_rate": 9.769768882388648e-06, + "logits/chosen": -1.093282699584961, + "logits/rejected": -0.9740983247756958, + "logps/chosen": -119.91680908203125, + "logps/rejected": -41.98080062866211, + "loss": 0.1275, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8826370239257812, + "rewards/margins": 1.6599705219268799, + "rewards/rejected": 0.22266654670238495, + "step": 1529 + }, + { + "epoch": 0.25, + "learning_rate": 9.769374502349038e-06, + "logits/chosen": -1.2068538665771484, + "logits/rejected": -1.0765186548233032, + "logps/chosen": -90.10676574707031, + "logps/rejected": -28.700965881347656, + "loss": 0.2506, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1501785516738892, + "rewards/margins": 0.8955124020576477, + "rewards/rejected": 0.25466614961624146, + "step": 1530 + }, + { + "epoch": 0.25, + "learning_rate": 9.768979792790775e-06, + "logits/chosen": -0.6004800796508789, + "logits/rejected": -0.6004800796508789, + "logps/chosen": -65.07524108886719, + "logps/rejected": -65.07524108886719, + "loss": 0.4309, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8236083984375, + "rewards/margins": 0.0, + "rewards/rejected": 0.8236083984375, + "step": 1531 + }, + { + "epoch": 0.25, + "learning_rate": 9.768584753741134e-06, + "logits/chosen": -1.1482247114181519, + "logits/rejected": -1.0539932250976562, + "logps/chosen": -76.2723388671875, + "logps/rejected": -60.80004119873047, + "loss": 1.9572, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.176458716392517, + "rewards/margins": -2.0076308250427246, + "rewards/rejected": 3.1840896606445312, + "step": 1532 + }, + { + "epoch": 0.25, + "learning_rate": 9.768189385227409e-06, + "logits/chosen": -0.7643141150474548, + "logits/rejected": -0.631623387336731, + "logps/chosen": -94.15735626220703, + "logps/rejected": -37.480987548828125, + "loss": 0.3505, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.014127492904663, + "rewards/margins": 2.0024807453155518, + "rewards/rejected": 0.011646652594208717, + "step": 1533 + }, + { + "epoch": 0.25, + "learning_rate": 9.767793687276913e-06, + "logits/chosen": -1.0611690282821655, + "logits/rejected": -1.0601450204849243, + "logps/chosen": -84.52611541748047, + "logps/rejected": -50.576210021972656, + "loss": 0.8788, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9049522280693054, + "rewards/margins": -1.1150307655334473, + "rewards/rejected": 2.0199830532073975, + "step": 1534 + }, + { + "epoch": 0.25, + "learning_rate": 9.767397659916987e-06, + "logits/chosen": -1.0620158910751343, + "logits/rejected": -1.017946481704712, + "logps/chosen": -247.92678833007812, + "logps/rejected": -108.98605346679688, + "loss": 0.0346, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.191150188446045, + "rewards/margins": 3.814448833465576, + "rewards/rejected": 2.3767013549804688, + "step": 1535 + }, + { + "epoch": 0.25, + "learning_rate": 9.767001303174992e-06, + "logits/chosen": -0.8650248050689697, + "logits/rejected": -0.7995830178260803, + "logps/chosen": -109.01349639892578, + "logps/rejected": -47.02525329589844, + "loss": 0.5654, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6220116019248962, + "rewards/margins": -0.31773412227630615, + "rewards/rejected": 0.9397457242012024, + "step": 1536 + }, + { + "epoch": 0.25, + "learning_rate": 9.766604617078313e-06, + "logits/chosen": -0.6697421669960022, + "logits/rejected": -0.6228575706481934, + "logps/chosen": -90.65202331542969, + "logps/rejected": -84.33976745605469, + "loss": 0.5123, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7878693342208862, + "rewards/margins": -0.49566733837127686, + "rewards/rejected": 2.283536672592163, + "step": 1537 + }, + { + "epoch": 0.25, + "learning_rate": 9.766207601654356e-06, + "logits/chosen": -0.822087287902832, + "logits/rejected": -0.9174224138259888, + "logps/chosen": -64.65902709960938, + "logps/rejected": -34.15340042114258, + "loss": 1.7697, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4501054286956787, + "rewards/margins": 0.5778756141662598, + "rewards/rejected": 1.872229814529419, + "step": 1538 + }, + { + "epoch": 0.25, + "learning_rate": 9.765810256930553e-06, + "logits/chosen": -0.7535549998283386, + "logits/rejected": -0.7535549998283386, + "logps/chosen": -85.61831665039062, + "logps/rejected": -85.61831665039062, + "loss": 1.3642, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5641846656799316, + "rewards/margins": 0.0, + "rewards/rejected": 2.5641846656799316, + "step": 1539 + }, + { + "epoch": 0.25, + "learning_rate": 9.765412582934355e-06, + "logits/chosen": -0.9576118588447571, + "logits/rejected": -1.1206048727035522, + "logps/chosen": -164.63418579101562, + "logps/rejected": -124.20091247558594, + "loss": 0.1913, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.450963020324707, + "rewards/margins": 1.0073332786560059, + "rewards/rejected": 4.443629741668701, + "step": 1540 + }, + { + "epoch": 0.25, + "learning_rate": 9.765014579693239e-06, + "logits/chosen": -0.8472582101821899, + "logits/rejected": -0.6613426208496094, + "logps/chosen": -203.99075317382812, + "logps/rejected": -86.34550476074219, + "loss": 1.0945, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2088820934295654, + "rewards/margins": -1.5418121814727783, + "rewards/rejected": 4.750694274902344, + "step": 1541 + }, + { + "epoch": 0.25, + "learning_rate": 9.764616247234702e-06, + "logits/chosen": -0.5928990244865417, + "logits/rejected": -0.5506259202957153, + "logps/chosen": -133.88636779785156, + "logps/rejected": -66.66432189941406, + "loss": 0.6326, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.20559082925319672, + "rewards/margins": -0.608111560344696, + "rewards/rejected": 0.813702404499054, + "step": 1542 + }, + { + "epoch": 0.25, + "learning_rate": 9.764217585586266e-06, + "logits/chosen": -0.7920228838920593, + "logits/rejected": -0.8120933771133423, + "logps/chosen": -66.6800537109375, + "logps/rejected": -111.32777404785156, + "loss": 0.669, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4718148708343506, + "rewards/margins": -1.0174446105957031, + "rewards/rejected": 3.4892594814300537, + "step": 1543 + }, + { + "epoch": 0.25, + "learning_rate": 9.763818594775474e-06, + "logits/chosen": -0.8374782800674438, + "logits/rejected": -0.7094562649726868, + "logps/chosen": -44.3544807434082, + "logps/rejected": -48.314231872558594, + "loss": 0.3201, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5992634296417236, + "rewards/margins": 0.8735783100128174, + "rewards/rejected": 1.7256851196289062, + "step": 1544 + }, + { + "epoch": 0.25, + "learning_rate": 9.763419274829893e-06, + "logits/chosen": -0.9456902146339417, + "logits/rejected": -0.8271272778511047, + "logps/chosen": -88.41757202148438, + "logps/rejected": -54.54177474975586, + "loss": 0.2953, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.314250946044922, + "rewards/margins": 1.7677359580993652, + "rewards/rejected": 0.5465149283409119, + "step": 1545 + }, + { + "epoch": 0.25, + "learning_rate": 9.763019625777111e-06, + "logits/chosen": -0.9582834243774414, + "logits/rejected": -1.0150564908981323, + "logps/chosen": -111.08485412597656, + "logps/rejected": -74.17312622070312, + "loss": 1.0238, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2842353582382202, + "rewards/margins": -1.056036353111267, + "rewards/rejected": 2.3402717113494873, + "step": 1546 + }, + { + "epoch": 0.25, + "learning_rate": 9.762619647644741e-06, + "logits/chosen": -0.7879843711853027, + "logits/rejected": -0.957749605178833, + "logps/chosen": -122.66407012939453, + "logps/rejected": -100.66280364990234, + "loss": 1.369, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6534432172775269, + "rewards/margins": -2.6199307441711426, + "rewards/rejected": 4.273374080657959, + "step": 1547 + }, + { + "epoch": 0.25, + "learning_rate": 9.762219340460419e-06, + "logits/chosen": -0.8912383317947388, + "logits/rejected": -0.9708846211433411, + "logps/chosen": -115.08518981933594, + "logps/rejected": -145.54473876953125, + "loss": 1.8597, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6770645380020142, + "rewards/margins": -3.6160888671875, + "rewards/rejected": 5.293153285980225, + "step": 1548 + }, + { + "epoch": 0.25, + "learning_rate": 9.761818704251801e-06, + "logits/chosen": -0.9550876021385193, + "logits/rejected": -0.8458188772201538, + "logps/chosen": -51.37961959838867, + "logps/rejected": -17.166006088256836, + "loss": 0.099, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9791996479034424, + "rewards/margins": 1.5443075895309448, + "rewards/rejected": 0.43489208817481995, + "step": 1549 + }, + { + "epoch": 0.25, + "learning_rate": 9.761417739046566e-06, + "logits/chosen": -0.8613227605819702, + "logits/rejected": -0.8938164710998535, + "logps/chosen": -92.69650268554688, + "logps/rejected": -90.66990661621094, + "loss": 2.7444, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6307846307754517, + "rewards/margins": -1.543073296546936, + "rewards/rejected": 3.1738579273223877, + "step": 1550 + }, + { + "epoch": 0.25, + "learning_rate": 9.76101644487242e-06, + "logits/chosen": -0.7563945651054382, + "logits/rejected": -0.844914972782135, + "logps/chosen": -31.72283172607422, + "logps/rejected": -114.03783416748047, + "loss": 0.9934, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9277175664901733, + "rewards/margins": -1.3128091096878052, + "rewards/rejected": 3.2405266761779785, + "step": 1551 + }, + { + "epoch": 0.25, + "learning_rate": 9.760614821757085e-06, + "logits/chosen": -0.497600257396698, + "logits/rejected": -0.5602139830589294, + "logps/chosen": -36.77250289916992, + "logps/rejected": -89.76832580566406, + "loss": 0.7976, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0765308141708374, + "rewards/margins": 0.33060336112976074, + "rewards/rejected": 0.7459274530410767, + "step": 1552 + }, + { + "epoch": 0.25, + "learning_rate": 9.760212869728312e-06, + "logits/chosen": -0.4499908983707428, + "logits/rejected": -0.8728981018066406, + "logps/chosen": -40.25693130493164, + "logps/rejected": -80.5810317993164, + "loss": 0.4181, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4735149145126343, + "rewards/margins": 1.872520089149475, + "rewards/rejected": -0.39900514483451843, + "step": 1553 + }, + { + "epoch": 0.25, + "learning_rate": 9.759810588813872e-06, + "logits/chosen": -1.0601156949996948, + "logits/rejected": -0.9918279647827148, + "logps/chosen": -115.47982788085938, + "logps/rejected": -50.70769500732422, + "loss": 0.0263, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.059587001800537, + "rewards/margins": 3.0411086082458496, + "rewards/rejected": 1.0184783935546875, + "step": 1554 + }, + { + "epoch": 0.25, + "learning_rate": 9.759407979041557e-06, + "logits/chosen": -1.0879672765731812, + "logits/rejected": -0.8962595462799072, + "logps/chosen": -91.63517761230469, + "logps/rejected": -68.21867370605469, + "loss": 0.0282, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.988307476043701, + "rewards/margins": 4.370065689086914, + "rewards/rejected": 1.6182419061660767, + "step": 1555 + }, + { + "epoch": 0.25, + "learning_rate": 9.759005040439184e-06, + "logits/chosen": -0.8594478368759155, + "logits/rejected": -0.8641479015350342, + "logps/chosen": -65.11029052734375, + "logps/rejected": -73.53900146484375, + "loss": 2.6784, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.565847873687744, + "rewards/margins": -3.7786865234375, + "rewards/rejected": 6.344534397125244, + "step": 1556 + }, + { + "epoch": 0.25, + "learning_rate": 9.758601773034595e-06, + "logits/chosen": -1.1453397274017334, + "logits/rejected": -0.9387674927711487, + "logps/chosen": -200.2457275390625, + "logps/rejected": -86.6767578125, + "loss": 0.3186, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2938950061798096, + "rewards/margins": 0.5826125144958496, + "rewards/rejected": 2.71128249168396, + "step": 1557 + }, + { + "epoch": 0.25, + "learning_rate": 9.758198176855648e-06, + "logits/chosen": -0.8307804465293884, + "logits/rejected": -0.7969781160354614, + "logps/chosen": -100.63296508789062, + "logps/rejected": -87.46907043457031, + "loss": 0.1655, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.427956461906433, + "rewards/margins": 0.982617974281311, + "rewards/rejected": 0.4453384578227997, + "step": 1558 + }, + { + "epoch": 0.25, + "learning_rate": 9.75779425193023e-06, + "logits/chosen": -0.8829380869865417, + "logits/rejected": -0.8548539876937866, + "logps/chosen": -177.61624145507812, + "logps/rejected": -139.75030517578125, + "loss": 1.7409, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.061755657196045, + "rewards/margins": -3.4479947090148926, + "rewards/rejected": 8.509750366210938, + "step": 1559 + }, + { + "epoch": 0.25, + "learning_rate": 9.757389998286247e-06, + "logits/chosen": -0.6152467727661133, + "logits/rejected": -0.5425959229469299, + "logps/chosen": -67.2274169921875, + "logps/rejected": -40.8168830871582, + "loss": 0.1799, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3959640264511108, + "rewards/margins": 1.3319636583328247, + "rewards/rejected": 0.06400032341480255, + "step": 1560 + }, + { + "epoch": 0.25, + "learning_rate": 9.756985415951631e-06, + "logits/chosen": -0.8766472935676575, + "logits/rejected": -0.8563424348831177, + "logps/chosen": -95.82886505126953, + "logps/rejected": -152.0905303955078, + "loss": 2.677, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7826111316680908, + "rewards/margins": -4.990147590637207, + "rewards/rejected": 6.772758483886719, + "step": 1561 + }, + { + "epoch": 0.25, + "learning_rate": 9.756580504954334e-06, + "logits/chosen": -0.7485910058021545, + "logits/rejected": -0.7839797735214233, + "logps/chosen": -85.64591979980469, + "logps/rejected": -87.22586059570312, + "loss": 1.0972, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5746673345565796, + "rewards/margins": -2.0278382301330566, + "rewards/rejected": 3.6025054454803467, + "step": 1562 + }, + { + "epoch": 0.25, + "learning_rate": 9.75617526532233e-06, + "logits/chosen": -1.4319944381713867, + "logits/rejected": -1.2956300973892212, + "logps/chosen": -102.3450927734375, + "logps/rejected": -54.6036262512207, + "loss": 1.6983, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.432612657546997, + "rewards/margins": -0.650169849395752, + "rewards/rejected": 2.082782506942749, + "step": 1563 + }, + { + "epoch": 0.25, + "learning_rate": 9.75576969708362e-06, + "logits/chosen": -0.7163271903991699, + "logits/rejected": -0.8008231520652771, + "logps/chosen": -59.83356475830078, + "logps/rejected": -99.14427947998047, + "loss": 1.2652, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6727731227874756, + "rewards/margins": -1.6371915340423584, + "rewards/rejected": 4.309964656829834, + "step": 1564 + }, + { + "epoch": 0.25, + "learning_rate": 9.755363800266224e-06, + "logits/chosen": -0.7214904427528381, + "logits/rejected": -0.7214904427528381, + "logps/chosen": -40.01137161254883, + "logps/rejected": -40.01137161254883, + "loss": 0.3471, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.447479009628296, + "rewards/margins": 0.0, + "rewards/rejected": 2.447479009628296, + "step": 1565 + }, + { + "epoch": 0.25, + "learning_rate": 9.754957574898183e-06, + "logits/chosen": -1.0151939392089844, + "logits/rejected": -1.0402888059616089, + "logps/chosen": -63.02825927734375, + "logps/rejected": -134.1728515625, + "loss": 0.2923, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2542632818222046, + "rewards/margins": 0.2991454601287842, + "rewards/rejected": 0.9551178216934204, + "step": 1566 + }, + { + "epoch": 0.25, + "learning_rate": 9.754551021007566e-06, + "logits/chosen": -1.2890992164611816, + "logits/rejected": -1.219489574432373, + "logps/chosen": -41.19635009765625, + "logps/rejected": -105.586669921875, + "loss": 0.3697, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3317208290100098, + "rewards/margins": 0.6316063404083252, + "rewards/rejected": 1.7001144886016846, + "step": 1567 + }, + { + "epoch": 0.25, + "learning_rate": 9.754144138622462e-06, + "logits/chosen": -0.8730078935623169, + "logits/rejected": -0.8730078935623169, + "logps/chosen": -76.00432586669922, + "logps/rejected": -76.00432586669922, + "loss": 0.4068, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0088722705841064, + "rewards/margins": 0.0, + "rewards/rejected": 3.0088722705841064, + "step": 1568 + }, + { + "epoch": 0.25, + "learning_rate": 9.753736927770982e-06, + "logits/chosen": -1.0128253698349, + "logits/rejected": -0.9361553192138672, + "logps/chosen": -63.919403076171875, + "logps/rejected": -13.796531677246094, + "loss": 0.356, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0155930519104004, + "rewards/margins": 0.8212794065475464, + "rewards/rejected": 1.194313645362854, + "step": 1569 + }, + { + "epoch": 0.25, + "learning_rate": 9.753329388481261e-06, + "logits/chosen": -0.9773816466331482, + "logits/rejected": -0.8475926518440247, + "logps/chosen": -117.6794204711914, + "logps/rejected": -106.35659790039062, + "loss": 0.2661, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.6200690269470215, + "rewards/margins": 1.2589926719665527, + "rewards/rejected": 4.361076354980469, + "step": 1570 + }, + { + "epoch": 0.25, + "learning_rate": 9.752921520781454e-06, + "logits/chosen": -0.49605417251586914, + "logits/rejected": -0.49605417251586914, + "logps/chosen": -46.648624420166016, + "logps/rejected": -46.648624420166016, + "loss": 0.3531, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.233904242515564, + "rewards/margins": 0.0, + "rewards/rejected": 1.233904242515564, + "step": 1571 + }, + { + "epoch": 0.26, + "learning_rate": 9.752513324699744e-06, + "logits/chosen": -1.1143786907196045, + "logits/rejected": -1.0143641233444214, + "logps/chosen": -149.43089294433594, + "logps/rejected": -107.3111572265625, + "loss": 1.2169, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.676823616027832, + "rewards/margins": -0.8488583564758301, + "rewards/rejected": 5.525681972503662, + "step": 1572 + }, + { + "epoch": 0.26, + "learning_rate": 9.752104800264332e-06, + "logits/chosen": -0.4432949721813202, + "logits/rejected": -0.47710350155830383, + "logps/chosen": -56.616172790527344, + "logps/rejected": -81.50431060791016, + "loss": 0.757, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0286788940429688, + "rewards/margins": 0.06438660621643066, + "rewards/rejected": 2.964292287826538, + "step": 1573 + }, + { + "epoch": 0.26, + "learning_rate": 9.751695947503442e-06, + "logits/chosen": -1.2652380466461182, + "logits/rejected": -1.2157803773880005, + "logps/chosen": -111.36924743652344, + "logps/rejected": -148.1958465576172, + "loss": 0.8382, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3476760387420654, + "rewards/margins": -1.4680497646331787, + "rewards/rejected": 3.815725803375244, + "step": 1574 + }, + { + "epoch": 0.26, + "learning_rate": 9.751286766445324e-06, + "logits/chosen": -0.525383472442627, + "logits/rejected": -0.525383472442627, + "logps/chosen": -25.04330825805664, + "logps/rejected": -25.04330825805664, + "loss": 0.7185, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.35928401350975037, + "rewards/margins": 0.0, + "rewards/rejected": 0.35928401350975037, + "step": 1575 + }, + { + "epoch": 0.26, + "learning_rate": 9.750877257118248e-06, + "logits/chosen": -0.293612539768219, + "logits/rejected": -0.3493364453315735, + "logps/chosen": -66.80130767822266, + "logps/rejected": -99.2037582397461, + "loss": 0.6023, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4007110595703125, + "rewards/margins": -0.25369346141815186, + "rewards/rejected": 1.6544045209884644, + "step": 1576 + }, + { + "epoch": 0.26, + "learning_rate": 9.750467419550505e-06, + "logits/chosen": -0.452625036239624, + "logits/rejected": -0.45064979791641235, + "logps/chosen": -1.1397844552993774, + "logps/rejected": -2.811145305633545, + "loss": 0.4789, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1919923722743988, + "rewards/margins": -0.1546606719493866, + "rewards/rejected": 0.3466530442237854, + "step": 1577 + }, + { + "epoch": 0.26, + "learning_rate": 9.750057253770413e-06, + "logits/chosen": -0.9919837117195129, + "logits/rejected": -0.9919837117195129, + "logps/chosen": -34.07444763183594, + "logps/rejected": -34.07444763183594, + "loss": 0.4206, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5548591613769531, + "rewards/margins": 0.0, + "rewards/rejected": 1.5548591613769531, + "step": 1578 + }, + { + "epoch": 0.26, + "learning_rate": 9.749646759806312e-06, + "logits/chosen": -0.8270817399024963, + "logits/rejected": -0.8014642596244812, + "logps/chosen": -78.8277587890625, + "logps/rejected": -49.269134521484375, + "loss": 0.6338, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8324737548828125, + "rewards/margins": -0.9190337657928467, + "rewards/rejected": 2.751507520675659, + "step": 1579 + }, + { + "epoch": 0.26, + "learning_rate": 9.74923593768656e-06, + "logits/chosen": -0.7924162149429321, + "logits/rejected": -0.8707142472267151, + "logps/chosen": -69.96746826171875, + "logps/rejected": -55.735382080078125, + "loss": 0.7636, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6664505004882812, + "rewards/margins": -0.49826669692993164, + "rewards/rejected": 2.164717197418213, + "step": 1580 + }, + { + "epoch": 0.26, + "learning_rate": 9.748824787439542e-06, + "logits/chosen": -0.8470433950424194, + "logits/rejected": -0.8184989094734192, + "logps/chosen": -70.91143035888672, + "logps/rejected": -68.04782104492188, + "loss": 1.0986, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9622238874435425, + "rewards/margins": 0.21932220458984375, + "rewards/rejected": 1.7429016828536987, + "step": 1581 + }, + { + "epoch": 0.26, + "learning_rate": 9.748413309093666e-06, + "logits/chosen": -0.6846061944961548, + "logits/rejected": -0.6469038724899292, + "logps/chosen": -96.03414154052734, + "logps/rejected": -95.18020629882812, + "loss": 2.211, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8859398365020752, + "rewards/margins": 0.21853411197662354, + "rewards/rejected": 1.6674057245254517, + "step": 1582 + }, + { + "epoch": 0.26, + "learning_rate": 9.74800150267736e-06, + "logits/chosen": -0.8527392148971558, + "logits/rejected": -0.8777011632919312, + "logps/chosen": -122.61749267578125, + "logps/rejected": -136.37448120117188, + "loss": 0.3435, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.568856954574585, + "rewards/margins": 0.5873414278030396, + "rewards/rejected": 1.9815155267715454, + "step": 1583 + }, + { + "epoch": 0.26, + "learning_rate": 9.747589368219076e-06, + "logits/chosen": -0.6725403070449829, + "logits/rejected": -0.6832598447799683, + "logps/chosen": -39.5610466003418, + "logps/rejected": -46.713985443115234, + "loss": 0.338, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.840117335319519, + "rewards/margins": 0.17907226085662842, + "rewards/rejected": 1.6610450744628906, + "step": 1584 + }, + { + "epoch": 0.26, + "learning_rate": 9.747176905747289e-06, + "logits/chosen": -1.3383049964904785, + "logits/rejected": -1.3036521673202515, + "logps/chosen": -79.157958984375, + "logps/rejected": -56.202667236328125, + "loss": 1.1249, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7014236450195312, + "rewards/margins": -1.9997506141662598, + "rewards/rejected": 3.701174259185791, + "step": 1585 + }, + { + "epoch": 0.26, + "learning_rate": 9.746764115290496e-06, + "logits/chosen": -0.9213434457778931, + "logits/rejected": -0.936772882938385, + "logps/chosen": -127.64969635009766, + "logps/rejected": -133.9488067626953, + "loss": 0.986, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4517250061035156, + "rewards/margins": -1.1974709033966064, + "rewards/rejected": 1.649195909500122, + "step": 1586 + }, + { + "epoch": 0.26, + "learning_rate": 9.746350996877216e-06, + "logits/chosen": -1.0243076086044312, + "logits/rejected": -0.9971172213554382, + "logps/chosen": -141.94923400878906, + "logps/rejected": -21.214229583740234, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.129422187805176, + "rewards/margins": 4.610063552856445, + "rewards/rejected": 0.5193588137626648, + "step": 1587 + }, + { + "epoch": 0.26, + "learning_rate": 9.745937550535993e-06, + "logits/chosen": -1.0107979774475098, + "logits/rejected": -1.0106385946273804, + "logps/chosen": -54.22154998779297, + "logps/rejected": -56.80864715576172, + "loss": 0.6973, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.514540910720825, + "rewards/margins": 0.18562769889831543, + "rewards/rejected": 2.3289132118225098, + "step": 1588 + }, + { + "epoch": 0.26, + "learning_rate": 9.745523776295394e-06, + "logits/chosen": -1.4105491638183594, + "logits/rejected": -1.4385181665420532, + "logps/chosen": -139.4488525390625, + "logps/rejected": -158.94808959960938, + "loss": 0.6784, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.987127780914307, + "rewards/margins": -1.0461335182189941, + "rewards/rejected": 6.033261299133301, + "step": 1589 + }, + { + "epoch": 0.26, + "learning_rate": 9.745109674184001e-06, + "logits/chosen": -0.7779066562652588, + "logits/rejected": -0.8011404275894165, + "logps/chosen": -123.05272674560547, + "logps/rejected": -58.350399017333984, + "loss": 0.6428, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8625892996788025, + "rewards/margins": -0.6705135703086853, + "rewards/rejected": 1.5331028699874878, + "step": 1590 + }, + { + "epoch": 0.26, + "learning_rate": 9.74469524423043e-06, + "logits/chosen": -0.7482442259788513, + "logits/rejected": -0.7302361726760864, + "logps/chosen": -44.635860443115234, + "logps/rejected": -18.815954208374023, + "loss": 0.3939, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6695449948310852, + "rewards/margins": 0.3318811357021332, + "rewards/rejected": 0.337663859128952, + "step": 1591 + }, + { + "epoch": 0.26, + "learning_rate": 9.744280486463313e-06, + "logits/chosen": -1.076098918914795, + "logits/rejected": -1.1589100360870361, + "logps/chosen": -177.0880126953125, + "logps/rejected": -140.5937957763672, + "loss": 0.9808, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.008520603179932, + "rewards/margins": -1.7696332931518555, + "rewards/rejected": 5.778153896331787, + "step": 1592 + }, + { + "epoch": 0.26, + "learning_rate": 9.743865400911305e-06, + "logits/chosen": -0.845852255821228, + "logits/rejected": -0.8144164681434631, + "logps/chosen": -49.2834358215332, + "logps/rejected": -86.25836944580078, + "loss": 1.0472, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5595226287841797, + "rewards/margins": -1.2636516094207764, + "rewards/rejected": 2.823174238204956, + "step": 1593 + }, + { + "epoch": 0.26, + "learning_rate": 9.743449987603082e-06, + "logits/chosen": -0.6783756017684937, + "logits/rejected": -0.6696720719337463, + "logps/chosen": -46.03837203979492, + "logps/rejected": -41.75438690185547, + "loss": 0.9915, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9931080341339111, + "rewards/margins": 0.1805175542831421, + "rewards/rejected": 1.812590479850769, + "step": 1594 + }, + { + "epoch": 0.26, + "learning_rate": 9.743034246567352e-06, + "logits/chosen": -0.5972929000854492, + "logits/rejected": -0.5628927946090698, + "logps/chosen": -83.65449523925781, + "logps/rejected": -68.58109283447266, + "loss": 0.668, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9741432666778564, + "rewards/margins": 0.9493111371994019, + "rewards/rejected": 1.0248321294784546, + "step": 1595 + }, + { + "epoch": 0.26, + "learning_rate": 9.74261817783283e-06, + "logits/chosen": -1.121185302734375, + "logits/rejected": -0.8383376598358154, + "logps/chosen": -141.20925903320312, + "logps/rejected": -38.3886604309082, + "loss": 0.4282, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.8215179443359375, + "rewards/margins": 5.630642890930176, + "rewards/rejected": 0.19087524712085724, + "step": 1596 + }, + { + "epoch": 0.26, + "learning_rate": 9.742201781428271e-06, + "logits/chosen": -1.0512135028839111, + "logits/rejected": -1.279177188873291, + "logps/chosen": -101.947998046875, + "logps/rejected": -36.445533752441406, + "loss": 0.1148, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7682831287384033, + "rewards/margins": 1.4430427551269531, + "rewards/rejected": 0.3252403438091278, + "step": 1597 + }, + { + "epoch": 0.26, + "learning_rate": 9.741785057382438e-06, + "logits/chosen": -0.8500184416770935, + "logits/rejected": -0.8007656335830688, + "logps/chosen": -82.48796844482422, + "logps/rejected": -94.10728454589844, + "loss": 0.4149, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5281150341033936, + "rewards/margins": 0.3433349132537842, + "rewards/rejected": 2.1847801208496094, + "step": 1598 + }, + { + "epoch": 0.26, + "learning_rate": 9.741368005724125e-06, + "logits/chosen": -0.9787752628326416, + "logits/rejected": -0.9129173159599304, + "logps/chosen": -100.63878631591797, + "logps/rejected": -139.81100463867188, + "loss": 0.0258, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.236396312713623, + "rewards/margins": 3.0630991458892822, + "rewards/rejected": 3.173297166824341, + "step": 1599 + }, + { + "epoch": 0.26, + "learning_rate": 9.740950626482146e-06, + "logits/chosen": -0.9612816572189331, + "logits/rejected": -0.9612816572189331, + "logps/chosen": -56.993858337402344, + "logps/rejected": -56.993858337402344, + "loss": 1.1859, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4934204816818237, + "rewards/margins": 0.0, + "rewards/rejected": 1.4934204816818237, + "step": 1600 + }, + { + "epoch": 0.26, + "learning_rate": 9.74053291968534e-06, + "logits/chosen": -0.36599865555763245, + "logits/rejected": -0.3633827567100525, + "logps/chosen": -8.468952178955078, + "logps/rejected": -1.4021530151367188, + "loss": 0.4181, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29067906737327576, + "rewards/margins": 0.01042988896369934, + "rewards/rejected": 0.2802491784095764, + "step": 1601 + }, + { + "epoch": 0.26, + "learning_rate": 9.740114885362562e-06, + "logits/chosen": -0.655707061290741, + "logits/rejected": -0.5985234975814819, + "logps/chosen": -58.2877311706543, + "logps/rejected": -64.26860809326172, + "loss": 0.5684, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4797489643096924, + "rewards/margins": 0.420088529586792, + "rewards/rejected": 2.0596604347229004, + "step": 1602 + }, + { + "epoch": 0.26, + "learning_rate": 9.739696523542696e-06, + "logits/chosen": -0.8247426748275757, + "logits/rejected": -0.8247426748275757, + "logps/chosen": -27.6624755859375, + "logps/rejected": -27.6624755859375, + "loss": 0.3482, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8489028811454773, + "rewards/margins": 0.0, + "rewards/rejected": 0.8489028811454773, + "step": 1603 + }, + { + "epoch": 0.26, + "learning_rate": 9.73927783425465e-06, + "logits/chosen": -0.9573824405670166, + "logits/rejected": -0.9662458896636963, + "logps/chosen": -54.79924774169922, + "logps/rejected": -91.23528289794922, + "loss": 0.3544, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.672228217124939, + "rewards/margins": 0.8392059206962585, + "rewards/rejected": 0.8330222964286804, + "step": 1604 + }, + { + "epoch": 0.26, + "learning_rate": 9.738858817527348e-06, + "logits/chosen": -0.9177906513214111, + "logits/rejected": -0.9230226874351501, + "logps/chosen": -31.46869659423828, + "logps/rejected": -44.7666130065918, + "loss": 1.1658, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6639354825019836, + "rewards/margins": 0.02524489164352417, + "rewards/rejected": 0.6386905908584595, + "step": 1605 + }, + { + "epoch": 0.26, + "learning_rate": 9.738439473389743e-06, + "logits/chosen": -0.8638821840286255, + "logits/rejected": -0.8316061496734619, + "logps/chosen": -40.252357482910156, + "logps/rejected": -27.999975204467773, + "loss": 0.7137, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.101461410522461, + "rewards/margins": 0.7792434692382812, + "rewards/rejected": 0.3222179412841797, + "step": 1606 + }, + { + "epoch": 0.26, + "learning_rate": 9.738019801870803e-06, + "logits/chosen": -0.663977861404419, + "logits/rejected": -0.7231000661849976, + "logps/chosen": -20.195079803466797, + "logps/rejected": -67.15304565429688, + "loss": 0.8423, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6900760531425476, + "rewards/margins": -1.4561245441436768, + "rewards/rejected": 2.146200656890869, + "step": 1607 + }, + { + "epoch": 0.26, + "learning_rate": 9.737599802999528e-06, + "logits/chosen": -1.0801801681518555, + "logits/rejected": -0.9845268726348877, + "logps/chosen": -47.11150360107422, + "logps/rejected": -38.35026168823242, + "loss": 0.3343, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3546669483184814, + "rewards/margins": 1.1750659942626953, + "rewards/rejected": 2.179600954055786, + "step": 1608 + }, + { + "epoch": 0.26, + "learning_rate": 9.737179476804934e-06, + "logits/chosen": -0.7876718044281006, + "logits/rejected": -0.9091630578041077, + "logps/chosen": -40.93431854248047, + "logps/rejected": -110.20883178710938, + "loss": 2.4357, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1291122436523438, + "rewards/margins": -4.322920322418213, + "rewards/rejected": 6.452032566070557, + "step": 1609 + }, + { + "epoch": 0.26, + "learning_rate": 9.736758823316062e-06, + "logits/chosen": -0.9313628077507019, + "logits/rejected": -0.9314727187156677, + "logps/chosen": -48.74034881591797, + "logps/rejected": -27.00078582763672, + "loss": 1.0684, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4192780256271362, + "rewards/margins": -0.092559814453125, + "rewards/rejected": 1.5118378400802612, + "step": 1610 + }, + { + "epoch": 0.26, + "learning_rate": 9.736337842561973e-06, + "logits/chosen": -0.5557165741920471, + "logits/rejected": -0.5557165741920471, + "logps/chosen": -51.89435958862305, + "logps/rejected": -51.89435958862305, + "loss": 1.5049, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1852749586105347, + "rewards/margins": 0.0, + "rewards/rejected": 1.1852749586105347, + "step": 1611 + }, + { + "epoch": 0.26, + "learning_rate": 9.735916534571758e-06, + "logits/chosen": -0.7596368193626404, + "logits/rejected": -0.7434749007225037, + "logps/chosen": -100.12310791015625, + "logps/rejected": -77.8052978515625, + "loss": 1.7299, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.407110571861267, + "rewards/margins": -0.10291755199432373, + "rewards/rejected": 1.5100281238555908, + "step": 1612 + }, + { + "epoch": 0.26, + "learning_rate": 9.73549489937452e-06, + "logits/chosen": -0.6181449890136719, + "logits/rejected": -0.6783877611160278, + "logps/chosen": -7.502152919769287, + "logps/rejected": -45.599266052246094, + "loss": 0.8728, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.38032346963882446, + "rewards/margins": -0.6229724287986755, + "rewards/rejected": 1.0032958984375, + "step": 1613 + }, + { + "epoch": 0.26, + "learning_rate": 9.735072936999392e-06, + "logits/chosen": -0.6771383881568909, + "logits/rejected": -0.8328522443771362, + "logps/chosen": -53.86743927001953, + "logps/rejected": -90.45957946777344, + "loss": 1.1035, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6303550601005554, + "rewards/margins": -1.2654213905334473, + "rewards/rejected": 1.895776391029358, + "step": 1614 + }, + { + "epoch": 0.26, + "learning_rate": 9.73465064747553e-06, + "logits/chosen": -0.9046515226364136, + "logits/rejected": -0.7928937673568726, + "logps/chosen": -78.537353515625, + "logps/rejected": -17.771251678466797, + "loss": 2.775, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.906323194503784, + "rewards/margins": 2.8149619102478027, + "rewards/rejected": 1.0913612842559814, + "step": 1615 + }, + { + "epoch": 0.26, + "learning_rate": 9.734228030832105e-06, + "logits/chosen": -1.0229747295379639, + "logits/rejected": -0.9759488701820374, + "logps/chosen": -87.01264953613281, + "logps/rejected": -89.62339782714844, + "loss": 3.2186, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1208771467208862, + "rewards/margins": -5.995008945465088, + "rewards/rejected": 7.115886211395264, + "step": 1616 + }, + { + "epoch": 0.26, + "learning_rate": 9.73380508709832e-06, + "logits/chosen": -0.5648363828659058, + "logits/rejected": -0.554724931716919, + "logps/chosen": -1.1817619800567627, + "logps/rejected": -19.13730812072754, + "loss": 1.6294, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2552330791950226, + "rewards/margins": -0.04577761888504028, + "rewards/rejected": 0.30101069808006287, + "step": 1617 + }, + { + "epoch": 0.26, + "learning_rate": 9.733381816303395e-06, + "logits/chosen": -0.8555432558059692, + "logits/rejected": -0.9434633851051331, + "logps/chosen": -60.441558837890625, + "logps/rejected": -93.47305297851562, + "loss": 1.2332, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0976028442382812, + "rewards/margins": 0.945940375328064, + "rewards/rejected": 0.1516624540090561, + "step": 1618 + }, + { + "epoch": 0.26, + "learning_rate": 9.732958218476575e-06, + "logits/chosen": -0.5466951131820679, + "logits/rejected": -0.4407002925872803, + "logps/chosen": -60.7979621887207, + "logps/rejected": -71.02523803710938, + "loss": 0.3243, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5653560161590576, + "rewards/margins": 0.27326011657714844, + "rewards/rejected": 2.292095899581909, + "step": 1619 + }, + { + "epoch": 0.26, + "learning_rate": 9.732534293647124e-06, + "logits/chosen": -0.4857442378997803, + "logits/rejected": -0.4857442378997803, + "logps/chosen": -0.9707862138748169, + "logps/rejected": -0.9707862138748169, + "loss": 0.7094, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.15028797090053558, + "rewards/margins": 0.0, + "rewards/rejected": 0.15028797090053558, + "step": 1620 + }, + { + "epoch": 0.26, + "learning_rate": 9.732110041844334e-06, + "logits/chosen": -0.952619731426239, + "logits/rejected": -0.9719576239585876, + "logps/chosen": -42.02957534790039, + "logps/rejected": -28.48978042602539, + "loss": 0.7183, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5089733004570007, + "rewards/margins": -0.8795974850654602, + "rewards/rejected": 1.388570785522461, + "step": 1621 + }, + { + "epoch": 0.26, + "learning_rate": 9.731685463097518e-06, + "logits/chosen": -0.8280695080757141, + "logits/rejected": -0.6072193384170532, + "logps/chosen": -44.114036560058594, + "logps/rejected": -36.31378173828125, + "loss": 0.073, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.295602560043335, + "rewards/margins": 2.137449026107788, + "rewards/rejected": 0.15815353393554688, + "step": 1622 + }, + { + "epoch": 0.26, + "learning_rate": 9.731260557436005e-06, + "logits/chosen": -0.97084641456604, + "logits/rejected": -0.9192570447921753, + "logps/chosen": -173.97940063476562, + "logps/rejected": -78.10986328125, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.728000164031982, + "rewards/margins": 4.503483772277832, + "rewards/rejected": 2.2245163917541504, + "step": 1623 + }, + { + "epoch": 0.26, + "learning_rate": 9.730835324889156e-06, + "logits/chosen": -0.5801512002944946, + "logits/rejected": -0.5972006916999817, + "logps/chosen": -64.3446044921875, + "logps/rejected": -90.29328155517578, + "loss": 1.0881, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7242180109024048, + "rewards/margins": -0.8751603364944458, + "rewards/rejected": 2.5993783473968506, + "step": 1624 + }, + { + "epoch": 0.26, + "learning_rate": 9.73040976548635e-06, + "logits/chosen": -0.8928337097167969, + "logits/rejected": -0.9272525310516357, + "logps/chosen": -88.78579711914062, + "logps/rejected": -101.06590270996094, + "loss": 2.2993, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9350380301475525, + "rewards/margins": -0.1689552664756775, + "rewards/rejected": 1.10399329662323, + "step": 1625 + }, + { + "epoch": 0.26, + "learning_rate": 9.729983879256988e-06, + "logits/chosen": -0.6969841122627258, + "logits/rejected": -0.6984758377075195, + "logps/chosen": -38.08106994628906, + "logps/rejected": -54.39515686035156, + "loss": 1.452, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5564861297607422, + "rewards/margins": -0.9661252498626709, + "rewards/rejected": 2.522611379623413, + "step": 1626 + }, + { + "epoch": 0.26, + "learning_rate": 9.729557666230495e-06, + "logits/chosen": -0.8874641060829163, + "logits/rejected": -1.005944013595581, + "logps/chosen": -47.165931701660156, + "logps/rejected": -80.69473266601562, + "loss": 1.6411, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.455673933029175, + "rewards/margins": -2.5619513988494873, + "rewards/rejected": 5.017625331878662, + "step": 1627 + }, + { + "epoch": 0.26, + "learning_rate": 9.72913112643632e-06, + "logits/chosen": -0.6233780384063721, + "logits/rejected": -0.6231354475021362, + "logps/chosen": -2.6319448947906494, + "logps/rejected": -1.6549471616744995, + "loss": 0.6644, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34921640157699585, + "rewards/margins": 0.07335087656974792, + "rewards/rejected": 0.2758655250072479, + "step": 1628 + }, + { + "epoch": 0.26, + "learning_rate": 9.728704259903932e-06, + "logits/chosen": -1.1382951736450195, + "logits/rejected": -0.9982436299324036, + "logps/chosen": -165.1367950439453, + "logps/rejected": -52.42869567871094, + "loss": 0.085, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.928706645965576, + "rewards/margins": 2.864692211151123, + "rewards/rejected": 3.064014434814453, + "step": 1629 + }, + { + "epoch": 0.26, + "learning_rate": 9.728277066662821e-06, + "logits/chosen": -1.008827805519104, + "logits/rejected": -1.117358922958374, + "logps/chosen": -84.1634521484375, + "logps/rejected": -103.95552062988281, + "loss": 0.8609, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8019866943359375, + "rewards/margins": -1.408787727355957, + "rewards/rejected": 4.2107744216918945, + "step": 1630 + }, + { + "epoch": 0.26, + "learning_rate": 9.727849546742508e-06, + "logits/chosen": -1.3967018127441406, + "logits/rejected": -1.2712750434875488, + "logps/chosen": -85.9502944946289, + "logps/rejected": -63.66679382324219, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.849547863006592, + "rewards/margins": 4.087154388427734, + "rewards/rejected": 2.7623932361602783, + "step": 1631 + }, + { + "epoch": 0.26, + "learning_rate": 9.727421700172522e-06, + "logits/chosen": -1.1453970670700073, + "logits/rejected": -1.0538769960403442, + "logps/chosen": -69.47137451171875, + "logps/rejected": -53.22967529296875, + "loss": 0.4029, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.763555884361267, + "rewards/margins": 0.02608335018157959, + "rewards/rejected": 1.7374725341796875, + "step": 1632 + }, + { + "epoch": 0.27, + "learning_rate": 9.72699352698243e-06, + "logits/chosen": -0.7691099047660828, + "logits/rejected": -0.7691099047660828, + "logps/chosen": -138.25180053710938, + "logps/rejected": -138.25180053710938, + "loss": 0.6998, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3548706769943237, + "rewards/margins": 0.0, + "rewards/rejected": 1.3548706769943237, + "step": 1633 + }, + { + "epoch": 0.27, + "learning_rate": 9.726565027201813e-06, + "logits/chosen": -0.9912304878234863, + "logits/rejected": -0.997003972530365, + "logps/chosen": -106.06758117675781, + "logps/rejected": -98.97696685791016, + "loss": 0.6024, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8230079412460327, + "rewards/margins": -0.8402000665664673, + "rewards/rejected": 2.6632080078125, + "step": 1634 + }, + { + "epoch": 0.27, + "learning_rate": 9.726136200860274e-06, + "logits/chosen": -0.6757778525352478, + "logits/rejected": -0.629298210144043, + "logps/chosen": -105.72482299804688, + "logps/rejected": -74.75985717773438, + "loss": 0.4309, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.256213426589966, + "rewards/margins": -0.23034429550170898, + "rewards/rejected": 2.486557722091675, + "step": 1635 + }, + { + "epoch": 0.27, + "learning_rate": 9.725707047987445e-06, + "logits/chosen": -0.871485710144043, + "logits/rejected": -0.8751155734062195, + "logps/chosen": -70.68727111816406, + "logps/rejected": -61.37486267089844, + "loss": 0.4524, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5622650384902954, + "rewards/margins": -0.3540046215057373, + "rewards/rejected": 1.9162696599960327, + "step": 1636 + }, + { + "epoch": 0.27, + "learning_rate": 9.725277568612972e-06, + "logits/chosen": -0.9016479849815369, + "logits/rejected": -0.9311793446540833, + "logps/chosen": -54.20912170410156, + "logps/rejected": -50.34520721435547, + "loss": 0.7792, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6850494146347046, + "rewards/margins": 0.15061569213867188, + "rewards/rejected": 1.5344337224960327, + "step": 1637 + }, + { + "epoch": 0.27, + "learning_rate": 9.72484776276653e-06, + "logits/chosen": -0.8454973101615906, + "logits/rejected": -0.8510939478874207, + "logps/chosen": -56.091529846191406, + "logps/rejected": -82.06635284423828, + "loss": 0.7191, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3363288640975952, + "rewards/margins": -0.10122537612915039, + "rewards/rejected": 1.4375542402267456, + "step": 1638 + }, + { + "epoch": 0.27, + "learning_rate": 9.724417630477817e-06, + "logits/chosen": -1.1342840194702148, + "logits/rejected": -0.9590166807174683, + "logps/chosen": -72.83255004882812, + "logps/rejected": -45.10564422607422, + "loss": 0.2818, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.083091735839844, + "rewards/margins": 0.7058660984039307, + "rewards/rejected": 3.377225637435913, + "step": 1639 + }, + { + "epoch": 0.27, + "learning_rate": 9.723987171776547e-06, + "logits/chosen": -0.37269094586372375, + "logits/rejected": -0.37269094586372375, + "logps/chosen": -39.97599792480469, + "logps/rejected": -39.97599792480469, + "loss": 0.404, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4388946294784546, + "rewards/margins": 0.0, + "rewards/rejected": 1.4388946294784546, + "step": 1640 + }, + { + "epoch": 0.27, + "learning_rate": 9.72355638669246e-06, + "logits/chosen": -1.543432593345642, + "logits/rejected": -1.4431369304656982, + "logps/chosen": -88.66400146484375, + "logps/rejected": -20.26972007751465, + "loss": 0.2487, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.362063884735107, + "rewards/margins": 5.913020610809326, + "rewards/rejected": 0.4490430951118469, + "step": 1641 + }, + { + "epoch": 0.27, + "learning_rate": 9.723125275255325e-06, + "logits/chosen": -1.0919796228408813, + "logits/rejected": -1.0566701889038086, + "logps/chosen": -59.61237716674805, + "logps/rejected": -132.10894775390625, + "loss": 0.8441, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9292988777160645, + "rewards/margins": 0.389357328414917, + "rewards/rejected": 2.5399415493011475, + "step": 1642 + }, + { + "epoch": 0.27, + "learning_rate": 9.722693837494923e-06, + "logits/chosen": -0.9065614342689514, + "logits/rejected": -0.7789393663406372, + "logps/chosen": -39.44621658325195, + "logps/rejected": -28.39596176147461, + "loss": 0.2628, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4297428131103516, + "rewards/margins": 0.48969727754592896, + "rewards/rejected": 0.9400455355644226, + "step": 1643 + }, + { + "epoch": 0.27, + "learning_rate": 9.722262073441063e-06, + "logits/chosen": -0.9460118412971497, + "logits/rejected": -0.9210851788520813, + "logps/chosen": -75.38957214355469, + "logps/rejected": -52.50541687011719, + "loss": 0.3972, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.195501685142517, + "rewards/margins": -0.14893198013305664, + "rewards/rejected": 1.3444336652755737, + "step": 1644 + }, + { + "epoch": 0.27, + "learning_rate": 9.721829983123576e-06, + "logits/chosen": -0.9419646859169006, + "logits/rejected": -0.9300976395606995, + "logps/chosen": -129.89688110351562, + "logps/rejected": -58.54460144042969, + "loss": 0.9567, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5933822393417358, + "rewards/margins": -0.21449899673461914, + "rewards/rejected": 1.807881236076355, + "step": 1645 + }, + { + "epoch": 0.27, + "learning_rate": 9.721397566572315e-06, + "logits/chosen": -0.6698580980300903, + "logits/rejected": -0.6506485342979431, + "logps/chosen": -79.88578796386719, + "logps/rejected": -50.06488037109375, + "loss": 0.2732, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5085281133651733, + "rewards/margins": 0.8089599013328552, + "rewards/rejected": 0.6995682120323181, + "step": 1646 + }, + { + "epoch": 0.27, + "learning_rate": 9.720964823817159e-06, + "logits/chosen": -0.5249444246292114, + "logits/rejected": -0.5249444246292114, + "logps/chosen": -69.70592498779297, + "logps/rejected": -69.70592498779297, + "loss": 0.4902, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.886267900466919, + "rewards/margins": 0.0, + "rewards/rejected": 1.886267900466919, + "step": 1647 + }, + { + "epoch": 0.27, + "learning_rate": 9.720531754888002e-06, + "logits/chosen": -0.9771392345428467, + "logits/rejected": -0.9993767738342285, + "logps/chosen": -45.55455780029297, + "logps/rejected": -135.16734313964844, + "loss": 1.0146, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4010941982269287, + "rewards/margins": -1.8876798152923584, + "rewards/rejected": 4.288774013519287, + "step": 1648 + }, + { + "epoch": 0.27, + "learning_rate": 9.720098359814764e-06, + "logits/chosen": -0.6327038407325745, + "logits/rejected": -0.5941966772079468, + "logps/chosen": -44.546485900878906, + "logps/rejected": -65.31394958496094, + "loss": 1.0821, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.896463394165039, + "rewards/margins": 0.47951698303222656, + "rewards/rejected": 1.4169464111328125, + "step": 1649 + }, + { + "epoch": 0.27, + "learning_rate": 9.719664638627395e-06, + "logits/chosen": -0.7666700482368469, + "logits/rejected": -0.7331311106681824, + "logps/chosen": -57.143882751464844, + "logps/rejected": -53.52925109863281, + "loss": 1.0324, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.331427812576294, + "rewards/margins": -0.4313368797302246, + "rewards/rejected": 2.7627646923065186, + "step": 1650 + }, + { + "epoch": 0.27, + "learning_rate": 9.719230591355858e-06, + "logits/chosen": -0.8134192228317261, + "logits/rejected": -0.7600957155227661, + "logps/chosen": -46.889312744140625, + "logps/rejected": -46.006736755371094, + "loss": 0.564, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2797222137451172, + "rewards/margins": -0.6642436981201172, + "rewards/rejected": 1.9439659118652344, + "step": 1651 + }, + { + "epoch": 0.27, + "learning_rate": 9.718796218030138e-06, + "logits/chosen": -1.2919648885726929, + "logits/rejected": -1.3957288265228271, + "logps/chosen": -128.06192016601562, + "logps/rejected": -35.6125373840332, + "loss": 0.3499, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.34454345703125, + "rewards/margins": 2.097838878631592, + "rewards/rejected": 0.24670448899269104, + "step": 1652 + }, + { + "epoch": 0.27, + "learning_rate": 9.71836151868025e-06, + "logits/chosen": -0.638134241104126, + "logits/rejected": -0.6180558204650879, + "logps/chosen": -60.604427337646484, + "logps/rejected": -56.44905090332031, + "loss": 0.2757, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5086231231689453, + "rewards/margins": 0.32059359550476074, + "rewards/rejected": 2.1880295276641846, + "step": 1653 + }, + { + "epoch": 0.27, + "learning_rate": 9.717926493336227e-06, + "logits/chosen": -1.1559404134750366, + "logits/rejected": -1.1357072591781616, + "logps/chosen": -74.37635803222656, + "logps/rejected": -121.91363525390625, + "loss": 1.4975, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.10787887871265411, + "rewards/margins": -0.9734291434288025, + "rewards/rejected": 1.0813080072402954, + "step": 1654 + }, + { + "epoch": 0.27, + "learning_rate": 9.717491142028127e-06, + "logits/chosen": -0.832703709602356, + "logits/rejected": -1.0117336511611938, + "logps/chosen": -63.656028747558594, + "logps/rejected": -125.29832458496094, + "loss": 1.827, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7876968383789062, + "rewards/margins": -3.5610718727111816, + "rewards/rejected": 5.348768711090088, + "step": 1655 + }, + { + "epoch": 0.27, + "learning_rate": 9.717055464786022e-06, + "logits/chosen": -0.6709114909172058, + "logits/rejected": -0.6709114909172058, + "logps/chosen": -55.175743103027344, + "logps/rejected": -55.175743103027344, + "loss": 1.1878, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7202370166778564, + "rewards/margins": 0.0, + "rewards/rejected": 1.7202370166778564, + "step": 1656 + }, + { + "epoch": 0.27, + "learning_rate": 9.71661946164002e-06, + "logits/chosen": -1.242214560508728, + "logits/rejected": -1.2390697002410889, + "logps/chosen": -138.90066528320312, + "logps/rejected": -87.27920532226562, + "loss": 0.7095, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.84476637840271, + "rewards/margins": -1.136289119720459, + "rewards/rejected": 3.981055498123169, + "step": 1657 + }, + { + "epoch": 0.27, + "learning_rate": 9.716183132620242e-06, + "logits/chosen": -0.8154971599578857, + "logits/rejected": -0.8296703696250916, + "logps/chosen": -78.16974639892578, + "logps/rejected": -92.41105651855469, + "loss": 0.1372, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.521003007888794, + "rewards/margins": 1.325385332107544, + "rewards/rejected": 1.19561767578125, + "step": 1658 + }, + { + "epoch": 0.27, + "learning_rate": 9.715746477756835e-06, + "logits/chosen": -0.9878309369087219, + "logits/rejected": -0.9751707315444946, + "logps/chosen": -59.27143096923828, + "logps/rejected": -57.936866760253906, + "loss": 1.0296, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.733945608139038, + "rewards/margins": 0.18206262588500977, + "rewards/rejected": 2.5518829822540283, + "step": 1659 + }, + { + "epoch": 0.27, + "learning_rate": 9.715309497079967e-06, + "logits/chosen": -1.1895537376403809, + "logits/rejected": -1.0721749067306519, + "logps/chosen": -89.9420166015625, + "logps/rejected": -38.49736785888672, + "loss": 1.0837, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4476044178009033, + "rewards/margins": 2.332617998123169, + "rewards/rejected": 0.11498641967773438, + "step": 1660 + }, + { + "epoch": 0.27, + "learning_rate": 9.714872190619829e-06, + "logits/chosen": -0.43790268898010254, + "logits/rejected": -0.5478114485740662, + "logps/chosen": -55.963993072509766, + "logps/rejected": -72.70000457763672, + "loss": 0.6134, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4659992456436157, + "rewards/margins": -0.7670918703079224, + "rewards/rejected": 2.233091115951538, + "step": 1661 + }, + { + "epoch": 0.27, + "learning_rate": 9.714434558406636e-06, + "logits/chosen": -0.6371414661407471, + "logits/rejected": -0.7186761498451233, + "logps/chosen": -69.84736633300781, + "logps/rejected": -106.61116027832031, + "loss": 1.2872, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3685439825057983, + "rewards/margins": -2.4422531127929688, + "rewards/rejected": 3.8107972145080566, + "step": 1662 + }, + { + "epoch": 0.27, + "learning_rate": 9.713996600470623e-06, + "logits/chosen": -1.0465764999389648, + "logits/rejected": -0.8532571196556091, + "logps/chosen": -110.19407653808594, + "logps/rejected": -61.804283142089844, + "loss": 0.2423, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2537002563476562, + "rewards/margins": 2.0838751792907715, + "rewards/rejected": 1.1698249578475952, + "step": 1663 + }, + { + "epoch": 0.27, + "learning_rate": 9.713558316842047e-06, + "logits/chosen": -0.9477590918540955, + "logits/rejected": -0.8617927432060242, + "logps/chosen": -79.17022705078125, + "logps/rejected": -67.6786880493164, + "loss": 0.2619, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7642128467559814, + "rewards/margins": 0.3966583013534546, + "rewards/rejected": 1.3675545454025269, + "step": 1664 + }, + { + "epoch": 0.27, + "learning_rate": 9.713119707551194e-06, + "logits/chosen": -0.9811197519302368, + "logits/rejected": -1.0711610317230225, + "logps/chosen": -149.79461669921875, + "logps/rejected": -144.0433349609375, + "loss": 1.2578, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.679037570953369, + "rewards/margins": -0.3682403564453125, + "rewards/rejected": 6.047277927398682, + "step": 1665 + }, + { + "epoch": 0.27, + "learning_rate": 9.712680772628365e-06, + "logits/chosen": -0.5900610685348511, + "logits/rejected": -0.6661834716796875, + "logps/chosen": -51.82836151123047, + "logps/rejected": -79.6153335571289, + "loss": 2.258, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5766677856445312, + "rewards/margins": 0.4554695188999176, + "rewards/rejected": 0.12119827419519424, + "step": 1666 + }, + { + "epoch": 0.27, + "learning_rate": 9.712241512103884e-06, + "logits/chosen": -1.0310877561569214, + "logits/rejected": -1.008668065071106, + "logps/chosen": -133.9656982421875, + "logps/rejected": -232.0059051513672, + "loss": 1.3944, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.01654052734375, + "rewards/margins": -2.4255905151367188, + "rewards/rejected": 7.442131042480469, + "step": 1667 + }, + { + "epoch": 0.27, + "learning_rate": 9.711801926008106e-06, + "logits/chosen": -1.1834990978240967, + "logits/rejected": -1.228440761566162, + "logps/chosen": -52.49518585205078, + "logps/rejected": -111.30745697021484, + "loss": 1.6939, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.065065860748291, + "rewards/margins": 0.8274567127227783, + "rewards/rejected": 2.2376091480255127, + "step": 1668 + }, + { + "epoch": 0.27, + "learning_rate": 9.711362014371396e-06, + "logits/chosen": -1.1537351608276367, + "logits/rejected": -1.1137492656707764, + "logps/chosen": -45.527976989746094, + "logps/rejected": -81.06146240234375, + "loss": 0.6284, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5309035778045654, + "rewards/margins": -0.2610161304473877, + "rewards/rejected": 2.791919708251953, + "step": 1669 + }, + { + "epoch": 0.27, + "learning_rate": 9.710921777224149e-06, + "logits/chosen": -0.9488211274147034, + "logits/rejected": -0.8733764290809631, + "logps/chosen": -49.10052490234375, + "logps/rejected": -63.5797233581543, + "loss": 0.9221, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5836013555526733, + "rewards/margins": -0.3201909065246582, + "rewards/rejected": 1.9037922620773315, + "step": 1670 + }, + { + "epoch": 0.27, + "learning_rate": 9.710481214596786e-06, + "logits/chosen": -1.136435866355896, + "logits/rejected": -1.0614001750946045, + "logps/chosen": -53.82422637939453, + "logps/rejected": -69.58690643310547, + "loss": 0.487, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.79156494140625, + "rewards/margins": -0.48265767097473145, + "rewards/rejected": 2.2742226123809814, + "step": 1671 + }, + { + "epoch": 0.27, + "learning_rate": 9.710040326519739e-06, + "logits/chosen": -0.9574711322784424, + "logits/rejected": -0.8929277062416077, + "logps/chosen": -37.38068389892578, + "logps/rejected": -12.923404693603516, + "loss": 1.4326, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3754318356513977, + "rewards/margins": -0.7037234902381897, + "rewards/rejected": 1.0791553258895874, + "step": 1672 + }, + { + "epoch": 0.27, + "learning_rate": 9.709599113023474e-06, + "logits/chosen": -0.5302965044975281, + "logits/rejected": -0.5302965044975281, + "logps/chosen": -56.39841842651367, + "logps/rejected": -56.39841842651367, + "loss": 0.3651, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.27174264192581177, + "rewards/margins": 0.0, + "rewards/rejected": 0.27174264192581177, + "step": 1673 + }, + { + "epoch": 0.27, + "learning_rate": 9.70915757413847e-06, + "logits/chosen": -1.1164308786392212, + "logits/rejected": -1.113893747329712, + "logps/chosen": -75.18094635009766, + "logps/rejected": -146.94174194335938, + "loss": 0.9017, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9600143432617188, + "rewards/margins": -1.3376832008361816, + "rewards/rejected": 3.2976975440979004, + "step": 1674 + }, + { + "epoch": 0.27, + "learning_rate": 9.708715709895238e-06, + "logits/chosen": -0.8405650854110718, + "logits/rejected": -1.1760666370391846, + "logps/chosen": -80.46092987060547, + "logps/rejected": -70.44232940673828, + "loss": 4.4516, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.069443464279175, + "rewards/margins": -8.365877151489258, + "rewards/rejected": 11.435320854187012, + "step": 1675 + }, + { + "epoch": 0.27, + "learning_rate": 9.708273520324306e-06, + "logits/chosen": -0.9467191696166992, + "logits/rejected": -0.906328022480011, + "logps/chosen": -152.18397521972656, + "logps/rejected": -74.8508071899414, + "loss": 0.4014, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9727203845977783, + "rewards/margins": 0.5808953046798706, + "rewards/rejected": 1.3918250799179077, + "step": 1676 + }, + { + "epoch": 0.27, + "learning_rate": 9.707831005456222e-06, + "logits/chosen": -0.9408655762672424, + "logits/rejected": -0.97587651014328, + "logps/chosen": -68.42672729492188, + "logps/rejected": -88.93324279785156, + "loss": 1.3839, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3928741216659546, + "rewards/margins": -2.687289237976074, + "rewards/rejected": 4.080163478851318, + "step": 1677 + }, + { + "epoch": 0.27, + "learning_rate": 9.707388165321563e-06, + "logits/chosen": -1.1194140911102295, + "logits/rejected": -1.1074882745742798, + "logps/chosen": -68.91102600097656, + "logps/rejected": -119.48181915283203, + "loss": 1.4652, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.531114339828491, + "rewards/margins": -1.7573297023773193, + "rewards/rejected": 4.2884440422058105, + "step": 1678 + }, + { + "epoch": 0.27, + "learning_rate": 9.706944999950923e-06, + "logits/chosen": -1.0310771465301514, + "logits/rejected": -0.9284949898719788, + "logps/chosen": -172.0347900390625, + "logps/rejected": -197.04100036621094, + "loss": 1.236, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6140244007110596, + "rewards/margins": -0.09795224666595459, + "rewards/rejected": 1.7119766473770142, + "step": 1679 + }, + { + "epoch": 0.27, + "learning_rate": 9.70650150937492e-06, + "logits/chosen": -1.1424683332443237, + "logits/rejected": -0.98702073097229, + "logps/chosen": -99.23051452636719, + "logps/rejected": -116.93694305419922, + "loss": 0.6496, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.575618267059326, + "rewards/margins": 0.1748971939086914, + "rewards/rejected": 4.400721073150635, + "step": 1680 + }, + { + "epoch": 0.27, + "learning_rate": 9.706057693624197e-06, + "logits/chosen": -1.3610347509384155, + "logits/rejected": -1.3887938261032104, + "logps/chosen": -63.91719436645508, + "logps/rejected": -63.29286575317383, + "loss": 0.1678, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.48339581489563, + "rewards/margins": 0.9236259460449219, + "rewards/rejected": 1.559769868850708, + "step": 1681 + }, + { + "epoch": 0.27, + "learning_rate": 9.705613552729416e-06, + "logits/chosen": -0.6546732783317566, + "logits/rejected": -0.6182530522346497, + "logps/chosen": -87.3560562133789, + "logps/rejected": -59.236534118652344, + "loss": 0.1991, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5960724353790283, + "rewards/margins": 0.7246086597442627, + "rewards/rejected": 1.8714637756347656, + "step": 1682 + }, + { + "epoch": 0.27, + "learning_rate": 9.705169086721264e-06, + "logits/chosen": -0.7332617044448853, + "logits/rejected": -0.7911154627799988, + "logps/chosen": -101.43513488769531, + "logps/rejected": -86.70977783203125, + "loss": 0.7575, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8461349606513977, + "rewards/margins": -0.7545319199562073, + "rewards/rejected": 1.600666880607605, + "step": 1683 + }, + { + "epoch": 0.27, + "learning_rate": 9.70472429563045e-06, + "logits/chosen": -0.8495456576347351, + "logits/rejected": -0.7485259771347046, + "logps/chosen": -57.696964263916016, + "logps/rejected": -36.72919464111328, + "loss": 0.5514, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0282604694366455, + "rewards/margins": 0.7307083606719971, + "rewards/rejected": 0.29755210876464844, + "step": 1684 + }, + { + "epoch": 0.27, + "learning_rate": 9.704279179487702e-06, + "logits/chosen": -0.8100026249885559, + "logits/rejected": -0.7395893931388855, + "logps/chosen": -42.67278289794922, + "logps/rejected": -65.43802642822266, + "loss": 0.8474, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1536338329315186, + "rewards/margins": -1.0735282897949219, + "rewards/rejected": 3.2271621227264404, + "step": 1685 + }, + { + "epoch": 0.27, + "learning_rate": 9.703833738323774e-06, + "logits/chosen": -0.7310615181922913, + "logits/rejected": -0.7310615181922913, + "logps/chosen": -51.7850456237793, + "logps/rejected": -51.7850456237793, + "loss": 0.3633, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6974937319755554, + "rewards/margins": 0.0, + "rewards/rejected": 0.6974937319755554, + "step": 1686 + }, + { + "epoch": 0.27, + "learning_rate": 9.703387972169445e-06, + "logits/chosen": -0.6777904629707336, + "logits/rejected": -0.6777904629707336, + "logps/chosen": -95.67791748046875, + "logps/rejected": -95.67791748046875, + "loss": 0.434, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0853424072265625, + "rewards/margins": 0.0, + "rewards/rejected": 1.0853424072265625, + "step": 1687 + }, + { + "epoch": 0.27, + "learning_rate": 9.70294188105551e-06, + "logits/chosen": -0.4187976121902466, + "logits/rejected": -0.4192441701889038, + "logps/chosen": -8.487876892089844, + "logps/rejected": -11.62686538696289, + "loss": 0.6106, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11835756152868271, + "rewards/margins": 0.23771724104881287, + "rewards/rejected": -0.11935968697071075, + "step": 1688 + }, + { + "epoch": 0.27, + "learning_rate": 9.70249546501279e-06, + "logits/chosen": -0.8815351128578186, + "logits/rejected": -0.8815351128578186, + "logps/chosen": -30.08085060119629, + "logps/rejected": -30.08085060119629, + "loss": 0.7395, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9198957681655884, + "rewards/margins": 0.0, + "rewards/rejected": 1.9198957681655884, + "step": 1689 + }, + { + "epoch": 0.27, + "learning_rate": 9.702048724072128e-06, + "logits/chosen": -0.48121413588523865, + "logits/rejected": -0.4264490604400635, + "logps/chosen": -40.80633544921875, + "logps/rejected": -1.573288083076477, + "loss": 0.4534, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48608970642089844, + "rewards/margins": 0.10787001252174377, + "rewards/rejected": 0.37821969389915466, + "step": 1690 + }, + { + "epoch": 0.27, + "learning_rate": 9.701601658264392e-06, + "logits/chosen": -0.8094394207000732, + "logits/rejected": -0.7442309260368347, + "logps/chosen": -45.799720764160156, + "logps/rejected": -104.27230834960938, + "loss": 0.4596, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.307868242263794, + "rewards/margins": 0.306668758392334, + "rewards/rejected": 2.00119948387146, + "step": 1691 + }, + { + "epoch": 0.27, + "learning_rate": 9.70115426762047e-06, + "logits/chosen": -1.1185963153839111, + "logits/rejected": -1.0579807758331299, + "logps/chosen": -170.60951232910156, + "logps/rejected": -137.18934631347656, + "loss": 2.2056, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.040565490722656, + "rewards/margins": -2.4764022827148438, + "rewards/rejected": 6.5169677734375, + "step": 1692 + }, + { + "epoch": 0.27, + "learning_rate": 9.700706552171268e-06, + "logits/chosen": -1.0342872142791748, + "logits/rejected": -0.7102354764938354, + "logps/chosen": -256.093017578125, + "logps/rejected": -53.58993911743164, + "loss": 0.0608, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0742034912109375, + "rewards/margins": 2.1455585956573486, + "rewards/rejected": -0.07135505974292755, + "step": 1693 + }, + { + "epoch": 0.27, + "learning_rate": 9.700258511947722e-06, + "logits/chosen": -1.0616118907928467, + "logits/rejected": -1.270716667175293, + "logps/chosen": -67.04133605957031, + "logps/rejected": -102.93049621582031, + "loss": 1.2005, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9759773015975952, + "rewards/margins": -0.3365844488143921, + "rewards/rejected": 2.3125617504119873, + "step": 1694 + }, + { + "epoch": 0.28, + "learning_rate": 9.699810146980788e-06, + "logits/chosen": -1.2585761547088623, + "logits/rejected": -1.2289025783538818, + "logps/chosen": -100.42800903320312, + "logps/rejected": -82.79660034179688, + "loss": 0.2106, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.972143530845642, + "rewards/margins": 0.7758102416992188, + "rewards/rejected": 1.1963332891464233, + "step": 1695 + }, + { + "epoch": 0.28, + "learning_rate": 9.699361457301444e-06, + "logits/chosen": -0.8225021362304688, + "logits/rejected": -0.7764816880226135, + "logps/chosen": -50.744834899902344, + "logps/rejected": -10.481632232666016, + "loss": 0.66, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24715271592140198, + "rewards/margins": 0.030888468027114868, + "rewards/rejected": 0.2162642478942871, + "step": 1696 + }, + { + "epoch": 0.28, + "learning_rate": 9.698912442940686e-06, + "logits/chosen": -0.6584029197692871, + "logits/rejected": -0.6584029197692871, + "logps/chosen": -18.670692443847656, + "logps/rejected": -18.670692443847656, + "loss": 0.5917, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5123451352119446, + "rewards/margins": 0.0, + "rewards/rejected": 0.5123451352119446, + "step": 1697 + }, + { + "epoch": 0.28, + "learning_rate": 9.698463103929542e-06, + "logits/chosen": -1.2442615032196045, + "logits/rejected": -1.1135544776916504, + "logps/chosen": -82.223876953125, + "logps/rejected": -49.334041595458984, + "loss": 0.9028, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.849198818206787, + "rewards/margins": 2.6273698806762695, + "rewards/rejected": 3.2218289375305176, + "step": 1698 + }, + { + "epoch": 0.28, + "learning_rate": 9.698013440299054e-06, + "logits/chosen": -0.8905263543128967, + "logits/rejected": -0.8543751835823059, + "logps/chosen": -169.01229858398438, + "logps/rejected": -64.37177276611328, + "loss": 0.1237, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1867034435272217, + "rewards/margins": 1.4258780479431152, + "rewards/rejected": 1.7608253955841064, + "step": 1699 + }, + { + "epoch": 0.28, + "learning_rate": 9.697563452080292e-06, + "logits/chosen": -0.9939031600952148, + "logits/rejected": -0.9619969725608826, + "logps/chosen": -81.1126708984375, + "logps/rejected": -62.88368225097656, + "loss": 0.2245, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.899774074554443, + "rewards/margins": 3.017141580581665, + "rewards/rejected": 2.8826324939727783, + "step": 1700 + }, + { + "epoch": 0.28, + "learning_rate": 9.69711313930434e-06, + "logits/chosen": -0.8554395437240601, + "logits/rejected": -0.7744202017784119, + "logps/chosen": -79.59451293945312, + "logps/rejected": -16.47321128845215, + "loss": 1.1122, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0682167038321495, + "rewards/margins": -0.3085514008998871, + "rewards/rejected": 0.3767681121826172, + "step": 1701 + }, + { + "epoch": 0.28, + "learning_rate": 9.69666250200232e-06, + "logits/chosen": -1.343073844909668, + "logits/rejected": -1.3472309112548828, + "logps/chosen": -117.51721954345703, + "logps/rejected": -144.4639892578125, + "loss": 1.3344, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.530422210693359, + "rewards/margins": -2.3408942222595215, + "rewards/rejected": 6.871316432952881, + "step": 1702 + }, + { + "epoch": 0.28, + "learning_rate": 9.696211540205358e-06, + "logits/chosen": -0.8228052854537964, + "logits/rejected": -0.8176960349082947, + "logps/chosen": -16.59566879272461, + "logps/rejected": -24.699995040893555, + "loss": 0.4022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5569448471069336, + "rewards/margins": 0.08269444108009338, + "rewards/rejected": 0.4742504060268402, + "step": 1703 + }, + { + "epoch": 0.28, + "learning_rate": 9.695760253944615e-06, + "logits/chosen": -0.7759831547737122, + "logits/rejected": -0.5424543619155884, + "logps/chosen": -43.47441864013672, + "logps/rejected": -45.15251159667969, + "loss": 0.9054, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.639319658279419, + "rewards/margins": 1.1976165771484375, + "rewards/rejected": 1.4417030811309814, + "step": 1704 + }, + { + "epoch": 0.28, + "learning_rate": 9.695308643251271e-06, + "logits/chosen": -0.8286646008491516, + "logits/rejected": -0.5853683948516846, + "logps/chosen": -100.20143127441406, + "logps/rejected": -43.024330139160156, + "loss": 0.4027, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6261703968048096, + "rewards/margins": 2.1310410499572754, + "rewards/rejected": 0.49512940645217896, + "step": 1705 + }, + { + "epoch": 0.28, + "learning_rate": 9.694856708156526e-06, + "logits/chosen": -1.0937397480010986, + "logits/rejected": -1.1562657356262207, + "logps/chosen": -244.80026245117188, + "logps/rejected": -100.24864196777344, + "loss": 0.1493, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.853421211242676, + "rewards/margins": 1.4376649856567383, + "rewards/rejected": 5.4157562255859375, + "step": 1706 + }, + { + "epoch": 0.28, + "learning_rate": 9.694404448691607e-06, + "logits/chosen": -1.1548770666122437, + "logits/rejected": -1.1535497903823853, + "logps/chosen": -67.36003112792969, + "logps/rejected": -77.36068725585938, + "loss": 1.1378, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8907158374786377, + "rewards/margins": 0.047118425369262695, + "rewards/rejected": 2.843597412109375, + "step": 1707 + }, + { + "epoch": 0.28, + "learning_rate": 9.693951864887758e-06, + "logits/chosen": -1.230910062789917, + "logits/rejected": -1.153550386428833, + "logps/chosen": -63.45036697387695, + "logps/rejected": -49.86290740966797, + "loss": 0.1685, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9024441242218018, + "rewards/margins": 0.9832748770713806, + "rewards/rejected": 0.9191692471504211, + "step": 1708 + }, + { + "epoch": 0.28, + "learning_rate": 9.693498956776251e-06, + "logits/chosen": -1.0427381992340088, + "logits/rejected": -1.0610365867614746, + "logps/chosen": -51.85653305053711, + "logps/rejected": -56.82170867919922, + "loss": 0.8804, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.166814088821411, + "rewards/margins": 0.8598850965499878, + "rewards/rejected": 1.3069289922714233, + "step": 1709 + }, + { + "epoch": 0.28, + "learning_rate": 9.693045724388375e-06, + "logits/chosen": -1.2131903171539307, + "logits/rejected": -1.2290937900543213, + "logps/chosen": -84.25696563720703, + "logps/rejected": -48.870853424072266, + "loss": 0.8404, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2527215480804443, + "rewards/margins": -1.448251724243164, + "rewards/rejected": 3.7009732723236084, + "step": 1710 + }, + { + "epoch": 0.28, + "learning_rate": 9.692592167755447e-06, + "logits/chosen": -0.6420490145683289, + "logits/rejected": -0.6868348121643066, + "logps/chosen": -18.964391708374023, + "logps/rejected": -47.23072052001953, + "loss": 2.2926, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6596729159355164, + "rewards/margins": -1.8326694965362549, + "rewards/rejected": 2.492342472076416, + "step": 1711 + }, + { + "epoch": 0.28, + "learning_rate": 9.692138286908801e-06, + "logits/chosen": -1.3173881769180298, + "logits/rejected": -1.1551480293273926, + "logps/chosen": -153.7313232421875, + "logps/rejected": -72.39678192138672, + "loss": 1.3592, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.629977703094482, + "rewards/margins": -2.1184849739074707, + "rewards/rejected": 6.748462677001953, + "step": 1712 + }, + { + "epoch": 0.28, + "learning_rate": 9.691684081879797e-06, + "logits/chosen": -1.4348609447479248, + "logits/rejected": -1.3411749601364136, + "logps/chosen": -105.80516052246094, + "logps/rejected": -61.10645294189453, + "loss": 0.1079, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.801849365234375, + "rewards/margins": 1.5969605445861816, + "rewards/rejected": 4.204888820648193, + "step": 1713 + }, + { + "epoch": 0.28, + "learning_rate": 9.691229552699817e-06, + "logits/chosen": -0.8953448534011841, + "logits/rejected": -0.8569142818450928, + "logps/chosen": -16.322240829467773, + "logps/rejected": -24.241348266601562, + "loss": 0.8086, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5861459970474243, + "rewards/margins": -0.25327277183532715, + "rewards/rejected": 1.8394187688827515, + "step": 1714 + }, + { + "epoch": 0.28, + "learning_rate": 9.690774699400262e-06, + "logits/chosen": -0.9466654658317566, + "logits/rejected": -0.868459939956665, + "logps/chosen": -72.71440124511719, + "logps/rejected": -60.63514709472656, + "loss": 0.7493, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.491101026535034, + "rewards/margins": -1.2035601139068604, + "rewards/rejected": 4.6946611404418945, + "step": 1715 + }, + { + "epoch": 0.28, + "learning_rate": 9.69031952201256e-06, + "logits/chosen": -0.7600445747375488, + "logits/rejected": -0.7389906644821167, + "logps/chosen": -33.00837326049805, + "logps/rejected": -14.796272277832031, + "loss": 0.5867, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.2665557861328125, + "rewards/margins": -0.5460455417633057, + "rewards/rejected": 0.2794897258281708, + "step": 1716 + }, + { + "epoch": 0.28, + "learning_rate": 9.68986402056816e-06, + "logits/chosen": -0.9913755059242249, + "logits/rejected": -1.0177758932113647, + "logps/chosen": -36.9869270324707, + "logps/rejected": -75.89659118652344, + "loss": 0.2616, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.428997039794922, + "rewards/margins": 0.3821983337402344, + "rewards/rejected": 2.0467987060546875, + "step": 1717 + }, + { + "epoch": 0.28, + "learning_rate": 9.689408195098531e-06, + "logits/chosen": -0.9035868048667908, + "logits/rejected": -0.8741334080696106, + "logps/chosen": -24.91727638244629, + "logps/rejected": -7.763073921203613, + "loss": 0.4308, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7820550799369812, + "rewards/margins": 0.14267003536224365, + "rewards/rejected": 0.6393850445747375, + "step": 1718 + }, + { + "epoch": 0.28, + "learning_rate": 9.68895204563517e-06, + "logits/chosen": -0.7707456350326538, + "logits/rejected": -0.7324963212013245, + "logps/chosen": -72.13597106933594, + "logps/rejected": -99.87371826171875, + "loss": 0.4566, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1620659828186035, + "rewards/margins": 1.3187737464904785, + "rewards/rejected": 0.843292236328125, + "step": 1719 + }, + { + "epoch": 0.28, + "learning_rate": 9.688495572209587e-06, + "logits/chosen": -1.4454596042633057, + "logits/rejected": -1.3367724418640137, + "logps/chosen": -89.8187255859375, + "logps/rejected": -64.03828430175781, + "loss": 0.0768, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.865001201629639, + "rewards/margins": 5.328678131103516, + "rewards/rejected": 2.536322832107544, + "step": 1720 + }, + { + "epoch": 0.28, + "learning_rate": 9.688038774853324e-06, + "logits/chosen": -1.1237401962280273, + "logits/rejected": -1.0714185237884521, + "logps/chosen": -59.702789306640625, + "logps/rejected": -113.56497192382812, + "loss": 2.1597, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.964442491531372, + "rewards/margins": -3.1689956188201904, + "rewards/rejected": 7.1334381103515625, + "step": 1721 + }, + { + "epoch": 0.28, + "learning_rate": 9.68758165359794e-06, + "logits/chosen": -0.9214227199554443, + "logits/rejected": -0.8185003399848938, + "logps/chosen": -89.25630950927734, + "logps/rejected": -38.321285247802734, + "loss": 0.0233, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.204261302947998, + "rewards/margins": 3.3617467880249023, + "rewards/rejected": 1.8425143957138062, + "step": 1722 + }, + { + "epoch": 0.28, + "learning_rate": 9.687124208475018e-06, + "logits/chosen": -1.0859711170196533, + "logits/rejected": -1.0370080471038818, + "logps/chosen": -61.321868896484375, + "logps/rejected": -67.44328308105469, + "loss": 0.2717, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1511123180389404, + "rewards/margins": 0.3437347412109375, + "rewards/rejected": 2.807377576828003, + "step": 1723 + }, + { + "epoch": 0.28, + "learning_rate": 9.686666439516164e-06, + "logits/chosen": -0.9110777378082275, + "logits/rejected": -0.7944095134735107, + "logps/chosen": -133.77127075195312, + "logps/rejected": -69.26487731933594, + "loss": 0.311, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4378204345703125, + "rewards/margins": 0.6025344133377075, + "rewards/rejected": 1.835286021232605, + "step": 1724 + }, + { + "epoch": 0.28, + "learning_rate": 9.686208346753006e-06, + "logits/chosen": -1.3233938217163086, + "logits/rejected": -1.356248140335083, + "logps/chosen": -132.8731689453125, + "logps/rejected": -177.78086853027344, + "loss": 2.1542, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.3972930908203125, + "rewards/margins": -2.3459181785583496, + "rewards/rejected": 7.743211269378662, + "step": 1725 + }, + { + "epoch": 0.28, + "learning_rate": 9.68574993021719e-06, + "logits/chosen": -0.8731946349143982, + "logits/rejected": -0.804038941860199, + "logps/chosen": -118.3192138671875, + "logps/rejected": -71.8333511352539, + "loss": 0.4571, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2091033458709717, + "rewards/margins": 0.7630133628845215, + "rewards/rejected": 2.44608998298645, + "step": 1726 + }, + { + "epoch": 0.28, + "learning_rate": 9.685291189940392e-06, + "logits/chosen": -0.8062441945075989, + "logits/rejected": -0.7118551135063171, + "logps/chosen": -59.15666198730469, + "logps/rejected": -20.871580123901367, + "loss": 1.0908, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6474961042404175, + "rewards/margins": 0.01848316192626953, + "rewards/rejected": 1.629012942314148, + "step": 1727 + }, + { + "epoch": 0.28, + "learning_rate": 9.684832125954304e-06, + "logits/chosen": -0.9073737263679504, + "logits/rejected": -0.8934383392333984, + "logps/chosen": -93.04403686523438, + "logps/rejected": -48.49944305419922, + "loss": 0.9904, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.48980942368507385, + "rewards/margins": -1.064215898513794, + "rewards/rejected": 1.5540252923965454, + "step": 1728 + }, + { + "epoch": 0.28, + "learning_rate": 9.684372738290646e-06, + "logits/chosen": -1.3234776258468628, + "logits/rejected": -1.3711999654769897, + "logps/chosen": -172.16464233398438, + "logps/rejected": -109.91865539550781, + "loss": 0.3208, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.725271701812744, + "rewards/margins": 0.1527724266052246, + "rewards/rejected": 7.5724992752075195, + "step": 1729 + }, + { + "epoch": 0.28, + "learning_rate": 9.683913026981155e-06, + "logits/chosen": -0.990552544593811, + "logits/rejected": -0.9734674096107483, + "logps/chosen": -77.69800567626953, + "logps/rejected": -75.89631652832031, + "loss": 0.4082, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1338889598846436, + "rewards/margins": -0.026950836181640625, + "rewards/rejected": 2.160839796066284, + "step": 1730 + }, + { + "epoch": 0.28, + "learning_rate": 9.683452992057593e-06, + "logits/chosen": -0.9425305724143982, + "logits/rejected": -0.8109439015388489, + "logps/chosen": -149.4727783203125, + "logps/rejected": -78.85724639892578, + "loss": 0.4701, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.78042459487915, + "rewards/margins": 2.6661698818206787, + "rewards/rejected": 3.1142547130584717, + "step": 1731 + }, + { + "epoch": 0.28, + "learning_rate": 9.682992633551744e-06, + "logits/chosen": -0.8328917622566223, + "logits/rejected": -0.44626277685165405, + "logps/chosen": -32.6007194519043, + "logps/rejected": -59.67024612426758, + "loss": 0.2069, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2892425060272217, + "rewards/margins": 1.4460612535476685, + "rewards/rejected": 0.8431812524795532, + "step": 1732 + }, + { + "epoch": 0.28, + "learning_rate": 9.682531951495417e-06, + "logits/chosen": -0.9667430520057678, + "logits/rejected": -1.012528657913208, + "logps/chosen": -54.48997497558594, + "logps/rejected": -92.56471252441406, + "loss": 0.2987, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.259629011154175, + "rewards/margins": 0.45302653312683105, + "rewards/rejected": 1.8066024780273438, + "step": 1733 + }, + { + "epoch": 0.28, + "learning_rate": 9.682070945920437e-06, + "logits/chosen": -1.0854061841964722, + "logits/rejected": -1.0205543041229248, + "logps/chosen": -90.40280151367188, + "logps/rejected": -11.247068405151367, + "loss": 0.7137, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0823516845703125, + "rewards/margins": 1.1905627250671387, + "rewards/rejected": 0.891788899898529, + "step": 1734 + }, + { + "epoch": 0.28, + "learning_rate": 9.681609616858658e-06, + "logits/chosen": -0.7203047871589661, + "logits/rejected": -0.6972097754478455, + "logps/chosen": -77.34278869628906, + "logps/rejected": -54.0023193359375, + "loss": 0.7652, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6644607782363892, + "rewards/margins": 0.44647061824798584, + "rewards/rejected": 1.2179901599884033, + "step": 1735 + }, + { + "epoch": 0.28, + "learning_rate": 9.681147964341953e-06, + "logits/chosen": -1.292087435722351, + "logits/rejected": -1.2539421319961548, + "logps/chosen": -53.99613571166992, + "logps/rejected": -26.32499885559082, + "loss": 0.9899, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2133251428604126, + "rewards/margins": 1.0903949737548828, + "rewards/rejected": 0.122930146753788, + "step": 1736 + }, + { + "epoch": 0.28, + "learning_rate": 9.680685988402213e-06, + "logits/chosen": -1.0545392036437988, + "logits/rejected": -0.9609458446502686, + "logps/chosen": -55.47343826293945, + "logps/rejected": -41.106170654296875, + "loss": 0.5257, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8526500463485718, + "rewards/margins": 0.5149314403533936, + "rewards/rejected": 1.3377186059951782, + "step": 1737 + }, + { + "epoch": 0.28, + "learning_rate": 9.680223689071364e-06, + "logits/chosen": -0.8741396069526672, + "logits/rejected": -0.8741396069526672, + "logps/chosen": -49.24864959716797, + "logps/rejected": -49.24864959716797, + "loss": 0.3503, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3835827112197876, + "rewards/margins": 0.0, + "rewards/rejected": 1.3835827112197876, + "step": 1738 + }, + { + "epoch": 0.28, + "learning_rate": 9.679761066381342e-06, + "logits/chosen": -1.0147852897644043, + "logits/rejected": -0.9917601943016052, + "logps/chosen": -65.74002838134766, + "logps/rejected": -93.83070373535156, + "loss": 0.704, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1933724880218506, + "rewards/margins": 1.1638169288635254, + "rewards/rejected": 2.029555559158325, + "step": 1739 + }, + { + "epoch": 0.28, + "learning_rate": 9.67929812036411e-06, + "logits/chosen": -1.3067489862442017, + "logits/rejected": -1.088804841041565, + "logps/chosen": -190.16746520996094, + "logps/rejected": -138.10549926757812, + "loss": 0.1777, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.228858947753906, + "rewards/margins": 0.8618149757385254, + "rewards/rejected": 4.367043972015381, + "step": 1740 + }, + { + "epoch": 0.28, + "learning_rate": 9.678834851051655e-06, + "logits/chosen": -0.6186333298683167, + "logits/rejected": -0.6152617931365967, + "logps/chosen": -98.92994689941406, + "logps/rejected": -128.4024200439453, + "loss": 0.3896, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.106697916984558, + "rewards/margins": 0.06989061832427979, + "rewards/rejected": 1.0368072986602783, + "step": 1741 + }, + { + "epoch": 0.28, + "learning_rate": 9.678371258475983e-06, + "logits/chosen": -1.1631648540496826, + "logits/rejected": -1.159402847290039, + "logps/chosen": -110.59996795654297, + "logps/rejected": -202.22291564941406, + "loss": 1.4747, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.276120662689209, + "rewards/margins": -2.2028727531433105, + "rewards/rejected": 8.47899341583252, + "step": 1742 + }, + { + "epoch": 0.28, + "learning_rate": 9.677907342669124e-06, + "logits/chosen": -0.9237238764762878, + "logits/rejected": -0.8741740584373474, + "logps/chosen": -105.14000701904297, + "logps/rejected": -53.29264450073242, + "loss": 0.321, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.316800594329834, + "rewards/margins": 0.7568094730377197, + "rewards/rejected": 3.5599911212921143, + "step": 1743 + }, + { + "epoch": 0.28, + "learning_rate": 9.67744310366313e-06, + "logits/chosen": -1.3513438701629639, + "logits/rejected": -1.3985600471496582, + "logps/chosen": -66.71817016601562, + "logps/rejected": -145.70806884765625, + "loss": 2.6125, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8185348510742188, + "rewards/margins": -4.331332683563232, + "rewards/rejected": 7.149867534637451, + "step": 1744 + }, + { + "epoch": 0.28, + "learning_rate": 9.676978541490076e-06, + "logits/chosen": -0.8151617050170898, + "logits/rejected": -0.6173003911972046, + "logps/chosen": -113.65798950195312, + "logps/rejected": -51.863250732421875, + "loss": 0.4399, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.345977783203125, + "rewards/margins": 0.7895157337188721, + "rewards/rejected": 3.556462049484253, + "step": 1745 + }, + { + "epoch": 0.28, + "learning_rate": 9.676513656182059e-06, + "logits/chosen": -0.7303411364555359, + "logits/rejected": -0.795904815196991, + "logps/chosen": -57.5570068359375, + "logps/rejected": -84.46067810058594, + "loss": 0.5045, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5357986688613892, + "rewards/margins": 0.11429142951965332, + "rewards/rejected": 1.4215072393417358, + "step": 1746 + }, + { + "epoch": 0.28, + "learning_rate": 9.676048447771198e-06, + "logits/chosen": -1.0385023355484009, + "logits/rejected": -1.134467363357544, + "logps/chosen": -21.153047561645508, + "logps/rejected": -59.61344909667969, + "loss": 0.7236, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1054301261901855, + "rewards/margins": -0.14614391326904297, + "rewards/rejected": 2.2515740394592285, + "step": 1747 + }, + { + "epoch": 0.28, + "learning_rate": 9.675582916289634e-06, + "logits/chosen": -0.5725478529930115, + "logits/rejected": -0.5437507033348083, + "logps/chosen": -19.07560157775879, + "logps/rejected": -1.0356314182281494, + "loss": 1.1165, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.07800274342298508, + "rewards/margins": -0.2355131208896637, + "rewards/rejected": 0.15751038491725922, + "step": 1748 + }, + { + "epoch": 0.28, + "learning_rate": 9.675117061769532e-06, + "logits/chosen": -1.1378434896469116, + "logits/rejected": -1.1575666666030884, + "logps/chosen": -194.16400146484375, + "logps/rejected": -132.56581115722656, + "loss": 1.7958, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.959509372711182, + "rewards/margins": -2.5917649269104004, + "rewards/rejected": 8.551274299621582, + "step": 1749 + }, + { + "epoch": 0.28, + "learning_rate": 9.674650884243076e-06, + "logits/chosen": -0.8404715657234192, + "logits/rejected": -0.8397570252418518, + "logps/chosen": -78.75608825683594, + "logps/rejected": -50.199546813964844, + "loss": 0.4799, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5643523931503296, + "rewards/margins": -0.15030908584594727, + "rewards/rejected": 1.7146614789962769, + "step": 1750 + }, + { + "epoch": 0.28, + "learning_rate": 9.674184383742477e-06, + "logits/chosen": -0.6984073519706726, + "logits/rejected": -0.7023375034332275, + "logps/chosen": -56.4448127746582, + "logps/rejected": -51.96257400512695, + "loss": 1.0002, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9794902801513672, + "rewards/margins": -0.46198201179504395, + "rewards/rejected": 1.4414722919464111, + "step": 1751 + }, + { + "epoch": 0.28, + "learning_rate": 9.673717560299965e-06, + "logits/chosen": -1.1276390552520752, + "logits/rejected": -1.0073628425598145, + "logps/chosen": -130.92977905273438, + "logps/rejected": -90.30699157714844, + "loss": 0.3618, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.259314060211182, + "rewards/margins": 0.4471116065979004, + "rewards/rejected": 5.812202453613281, + "step": 1752 + }, + { + "epoch": 0.28, + "learning_rate": 9.673250413947792e-06, + "logits/chosen": -1.053903579711914, + "logits/rejected": -1.053903579711914, + "logps/chosen": -107.39170837402344, + "logps/rejected": -107.39170837402344, + "loss": 1.4931, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.624546766281128, + "rewards/margins": 0.0, + "rewards/rejected": 2.624546766281128, + "step": 1753 + }, + { + "epoch": 0.28, + "learning_rate": 9.672782944718234e-06, + "logits/chosen": -1.145015835762024, + "logits/rejected": -1.0406283140182495, + "logps/chosen": -123.00724792480469, + "logps/rejected": -42.63998794555664, + "loss": 0.2578, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9875900745391846, + "rewards/margins": 2.0767433643341064, + "rewards/rejected": 1.9108467102050781, + "step": 1754 + }, + { + "epoch": 0.28, + "learning_rate": 9.672315152643589e-06, + "logits/chosen": -0.9602285027503967, + "logits/rejected": -0.8228534460067749, + "logps/chosen": -103.83885192871094, + "logps/rejected": -179.21014404296875, + "loss": 0.1666, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.9971513748168945, + "rewards/margins": 3.042668342590332, + "rewards/rejected": 2.9544830322265625, + "step": 1755 + }, + { + "epoch": 0.29, + "learning_rate": 9.671847037756177e-06, + "logits/chosen": -1.1989856958389282, + "logits/rejected": -1.2610381841659546, + "logps/chosen": -123.05725860595703, + "logps/rejected": -270.36663818359375, + "loss": 1.3866, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.585259437561035, + "rewards/margins": -1.0224189758300781, + "rewards/rejected": 7.607678413391113, + "step": 1756 + }, + { + "epoch": 0.29, + "learning_rate": 9.671378600088338e-06, + "logits/chosen": -0.5643450617790222, + "logits/rejected": -0.5018764734268188, + "logps/chosen": -71.87810516357422, + "logps/rejected": -75.56023406982422, + "loss": 0.182, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1879119873046875, + "rewards/margins": 0.8524482250213623, + "rewards/rejected": 1.3354637622833252, + "step": 1757 + }, + { + "epoch": 0.29, + "learning_rate": 9.670909839672441e-06, + "logits/chosen": -1.2694584131240845, + "logits/rejected": -0.9292668104171753, + "logps/chosen": -131.30157470703125, + "logps/rejected": -33.939369201660156, + "loss": 0.0466, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.944593906402588, + "rewards/margins": 5.350363254547119, + "rewards/rejected": 0.5942306518554688, + "step": 1758 + }, + { + "epoch": 0.29, + "learning_rate": 9.670440756540873e-06, + "logits/chosen": -1.2897027730941772, + "logits/rejected": -1.502551794052124, + "logps/chosen": -179.7909698486328, + "logps/rejected": -139.561767578125, + "loss": 1.1897, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.420109748840332, + "rewards/margins": -1.4182190895080566, + "rewards/rejected": 5.838328838348389, + "step": 1759 + }, + { + "epoch": 0.29, + "learning_rate": 9.669971350726038e-06, + "logits/chosen": -1.1502641439437866, + "logits/rejected": -1.0675435066223145, + "logps/chosen": -245.50245666503906, + "logps/rejected": -88.48363494873047, + "loss": 0.1894, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.3725786209106445, + "rewards/margins": 2.8129830360412598, + "rewards/rejected": 3.5595955848693848, + "step": 1760 + }, + { + "epoch": 0.29, + "learning_rate": 9.669501622260368e-06, + "logits/chosen": -1.0781587362289429, + "logits/rejected": -0.9022756814956665, + "logps/chosen": -74.56631469726562, + "logps/rejected": -42.81385803222656, + "loss": 1.3623, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.115574598312378, + "rewards/margins": 1.1892176866531372, + "rewards/rejected": 0.9263569116592407, + "step": 1761 + }, + { + "epoch": 0.29, + "learning_rate": 9.669031571176322e-06, + "logits/chosen": -0.8804160952568054, + "logits/rejected": -0.9457318782806396, + "logps/chosen": -76.93775177001953, + "logps/rejected": -51.40789031982422, + "loss": 0.6295, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.553635358810425, + "rewards/margins": -0.024178504943847656, + "rewards/rejected": 2.5778138637542725, + "step": 1762 + }, + { + "epoch": 0.29, + "learning_rate": 9.668561197506375e-06, + "logits/chosen": -1.33604896068573, + "logits/rejected": -1.226754903793335, + "logps/chosen": -79.24174499511719, + "logps/rejected": -16.963336944580078, + "loss": 0.5726, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3577194213867188, + "rewards/margins": 3.1861119270324707, + "rewards/rejected": 0.1716075986623764, + "step": 1763 + }, + { + "epoch": 0.29, + "learning_rate": 9.66809050128302e-06, + "logits/chosen": -1.3653615713119507, + "logits/rejected": -1.3311342000961304, + "logps/chosen": -120.84020233154297, + "logps/rejected": -67.99239349365234, + "loss": 0.1681, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.886573076248169, + "rewards/margins": 1.084355115890503, + "rewards/rejected": 0.8022179007530212, + "step": 1764 + }, + { + "epoch": 0.29, + "learning_rate": 9.667619482538784e-06, + "logits/chosen": -1.2324495315551758, + "logits/rejected": -1.200801968574524, + "logps/chosen": -65.56658935546875, + "logps/rejected": -101.02423858642578, + "loss": 1.578, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.358831882476807, + "rewards/margins": -3.1045265197753906, + "rewards/rejected": 7.463358402252197, + "step": 1765 + }, + { + "epoch": 0.29, + "learning_rate": 9.667148141306206e-06, + "logits/chosen": -1.2876745462417603, + "logits/rejected": -1.1849724054336548, + "logps/chosen": -105.20655822753906, + "logps/rejected": -81.33110046386719, + "loss": 2.2963, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.130262851715088, + "rewards/margins": 2.135284423828125, + "rewards/rejected": 4.994978427886963, + "step": 1766 + }, + { + "epoch": 0.29, + "learning_rate": 9.666676477617851e-06, + "logits/chosen": -1.1533925533294678, + "logits/rejected": -1.1463494300842285, + "logps/chosen": -43.76234436035156, + "logps/rejected": -111.27594757080078, + "loss": 1.0432, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9675018787384033, + "rewards/margins": -1.1347649097442627, + "rewards/rejected": 3.102266788482666, + "step": 1767 + }, + { + "epoch": 0.29, + "learning_rate": 9.666204491506309e-06, + "logits/chosen": -0.9399962425231934, + "logits/rejected": -0.9281315803527832, + "logps/chosen": -16.01548194885254, + "logps/rejected": -4.735600471496582, + "loss": 0.6777, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8423997759819031, + "rewards/margins": 0.0628625750541687, + "rewards/rejected": 0.7795372009277344, + "step": 1768 + }, + { + "epoch": 0.29, + "learning_rate": 9.66573218300419e-06, + "logits/chosen": -1.1194090843200684, + "logits/rejected": -1.195311188697815, + "logps/chosen": -67.43115997314453, + "logps/rejected": -109.42561340332031, + "loss": 1.0077, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4178123474121094, + "rewards/margins": -0.458559513092041, + "rewards/rejected": 2.8763718605041504, + "step": 1769 + }, + { + "epoch": 0.29, + "learning_rate": 9.665259552144122e-06, + "logits/chosen": -0.9480810761451721, + "logits/rejected": -1.0251116752624512, + "logps/chosen": -28.257705688476562, + "logps/rejected": -81.128662109375, + "loss": 1.8597, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8740043640136719, + "rewards/margins": -0.8253799676895142, + "rewards/rejected": 1.699384331703186, + "step": 1770 + }, + { + "epoch": 0.29, + "learning_rate": 9.664786598958763e-06, + "logits/chosen": -0.9498385190963745, + "logits/rejected": -1.0023159980773926, + "logps/chosen": -73.01162719726562, + "logps/rejected": -61.37923049926758, + "loss": 0.5583, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5286178588867188, + "rewards/margins": -0.10387015342712402, + "rewards/rejected": 2.6324880123138428, + "step": 1771 + }, + { + "epoch": 0.29, + "learning_rate": 9.66431332348079e-06, + "logits/chosen": -0.9616889357566833, + "logits/rejected": -0.820960283279419, + "logps/chosen": -72.54866027832031, + "logps/rejected": -24.246410369873047, + "loss": 0.0461, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.815990447998047, + "rewards/margins": 4.035778999328613, + "rewards/rejected": -0.2197883576154709, + "step": 1772 + }, + { + "epoch": 0.29, + "learning_rate": 9.6638397257429e-06, + "logits/chosen": -1.2841383218765259, + "logits/rejected": -1.3117071390151978, + "logps/chosen": -42.157386779785156, + "logps/rejected": -62.10678482055664, + "loss": 0.3973, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8378254175186157, + "rewards/margins": -0.1881035566329956, + "rewards/rejected": 2.0259289741516113, + "step": 1773 + }, + { + "epoch": 0.29, + "learning_rate": 9.663365805777815e-06, + "logits/chosen": -0.7588172554969788, + "logits/rejected": -0.754450798034668, + "logps/chosen": -3.177931547164917, + "logps/rejected": -1.81235933303833, + "loss": 0.4203, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.17846396565437317, + "rewards/margins": -0.1870095431804657, + "rewards/rejected": 0.36547350883483887, + "step": 1774 + }, + { + "epoch": 0.29, + "learning_rate": 9.662891563618277e-06, + "logits/chosen": -1.0453089475631714, + "logits/rejected": -1.0579462051391602, + "logps/chosen": -103.63105773925781, + "logps/rejected": -98.97615051269531, + "loss": 0.5538, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.788037896156311, + "rewards/margins": -0.7028297185897827, + "rewards/rejected": 2.4908676147460938, + "step": 1775 + }, + { + "epoch": 0.29, + "learning_rate": 9.662416999297053e-06, + "logits/chosen": -1.045376181602478, + "logits/rejected": -0.8457748293876648, + "logps/chosen": -122.37843322753906, + "logps/rejected": -45.36525344848633, + "loss": 0.0368, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.054100036621094, + "rewards/margins": 2.7346699237823486, + "rewards/rejected": 3.319430112838745, + "step": 1776 + }, + { + "epoch": 0.29, + "learning_rate": 9.66194211284693e-06, + "logits/chosen": -0.7168751358985901, + "logits/rejected": -0.7168751358985901, + "logps/chosen": -53.024681091308594, + "logps/rejected": -53.024681091308594, + "loss": 0.3492, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.175395965576172, + "rewards/margins": 0.0, + "rewards/rejected": 2.175395965576172, + "step": 1777 + }, + { + "epoch": 0.29, + "learning_rate": 9.66146690430072e-06, + "logits/chosen": -0.6907904148101807, + "logits/rejected": -0.693775475025177, + "logps/chosen": -59.86546325683594, + "logps/rejected": -62.849002838134766, + "loss": 0.6948, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3721466064453125, + "rewards/margins": -0.2573361396789551, + "rewards/rejected": 3.6294827461242676, + "step": 1778 + }, + { + "epoch": 0.29, + "learning_rate": 9.660991373691253e-06, + "logits/chosen": -1.41627037525177, + "logits/rejected": -1.2035690546035767, + "logps/chosen": -95.61832427978516, + "logps/rejected": -84.49459838867188, + "loss": 1.866, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.752437591552734, + "rewards/margins": 4.1254754066467285, + "rewards/rejected": 1.6269623041152954, + "step": 1779 + }, + { + "epoch": 0.29, + "learning_rate": 9.660515521051385e-06, + "logits/chosen": -1.0209993124008179, + "logits/rejected": -0.9717954993247986, + "logps/chosen": -49.58678436279297, + "logps/rejected": -44.241485595703125, + "loss": 0.7589, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7715095281600952, + "rewards/margins": 0.004029035568237305, + "rewards/rejected": 1.767480492591858, + "step": 1780 + }, + { + "epoch": 0.29, + "learning_rate": 9.660039346413994e-06, + "logits/chosen": -1.183956503868103, + "logits/rejected": -1.1977126598358154, + "logps/chosen": -98.83565521240234, + "logps/rejected": -62.94049835205078, + "loss": 0.2762, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2817100286483765, + "rewards/margins": 0.4191596508026123, + "rewards/rejected": 0.8625503778457642, + "step": 1781 + }, + { + "epoch": 0.29, + "learning_rate": 9.659562849811977e-06, + "logits/chosen": -1.226771593093872, + "logits/rejected": -1.2607426643371582, + "logps/chosen": -47.246307373046875, + "logps/rejected": -117.17097473144531, + "loss": 0.9626, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8151230812072754, + "rewards/margins": -1.6913375854492188, + "rewards/rejected": 4.506460666656494, + "step": 1782 + }, + { + "epoch": 0.29, + "learning_rate": 9.659086031278256e-06, + "logits/chosen": -0.708894670009613, + "logits/rejected": -0.7042490839958191, + "logps/chosen": -48.51356506347656, + "logps/rejected": -44.658565521240234, + "loss": 0.8686, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.888372778892517, + "rewards/margins": -0.7208560705184937, + "rewards/rejected": 2.6092288494110107, + "step": 1783 + }, + { + "epoch": 0.29, + "learning_rate": 9.658608890845773e-06, + "logits/chosen": -1.1999351978302002, + "logits/rejected": -0.9316763281822205, + "logps/chosen": -165.2587890625, + "logps/rejected": -54.21940231323242, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.637648105621338, + "rewards/margins": 6.274096488952637, + "rewards/rejected": 0.36355170607566833, + "step": 1784 + }, + { + "epoch": 0.29, + "learning_rate": 9.658131428547498e-06, + "logits/chosen": -1.5745580196380615, + "logits/rejected": -1.3994381427764893, + "logps/chosen": -127.27336120605469, + "logps/rejected": -82.23971557617188, + "loss": 0.0297, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.845481872558594, + "rewards/margins": 3.1330978870391846, + "rewards/rejected": 2.712383985519409, + "step": 1785 + }, + { + "epoch": 0.29, + "learning_rate": 9.657653644416417e-06, + "logits/chosen": -1.0376309156417847, + "logits/rejected": -0.7717315554618835, + "logps/chosen": -133.2635040283203, + "logps/rejected": -42.32769775390625, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.561200141906738, + "rewards/margins": 5.6536712646484375, + "rewards/rejected": -0.0924709364771843, + "step": 1786 + }, + { + "epoch": 0.29, + "learning_rate": 9.657175538485541e-06, + "logits/chosen": -1.1228301525115967, + "logits/rejected": -1.307485818862915, + "logps/chosen": -42.16577911376953, + "logps/rejected": -34.69904708862305, + "loss": 0.1235, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.202634572982788, + "rewards/margins": 1.9672292470932007, + "rewards/rejected": 0.2354053556919098, + "step": 1787 + }, + { + "epoch": 0.29, + "learning_rate": 9.6566971107879e-06, + "logits/chosen": -1.1639646291732788, + "logits/rejected": -1.1646393537521362, + "logps/chosen": -100.18034362792969, + "logps/rejected": -91.12977600097656, + "loss": 1.0418, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0791122913360596, + "rewards/margins": 0.07312536239624023, + "rewards/rejected": 2.0059869289398193, + "step": 1788 + }, + { + "epoch": 0.29, + "learning_rate": 9.656218361356553e-06, + "logits/chosen": -1.011210322380066, + "logits/rejected": -0.7748307585716248, + "logps/chosen": -94.44027709960938, + "logps/rejected": -17.659191131591797, + "loss": 0.8879, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3311798572540283, + "rewards/margins": 3.0652287006378174, + "rewards/rejected": 0.26595115661621094, + "step": 1789 + }, + { + "epoch": 0.29, + "learning_rate": 9.655739290224574e-06, + "logits/chosen": -0.8122765421867371, + "logits/rejected": -0.9116800427436829, + "logps/chosen": -64.94231414794922, + "logps/rejected": -102.96739196777344, + "loss": 1.9624, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2385529279708862, + "rewards/margins": -3.6971755027770996, + "rewards/rejected": 4.935728549957275, + "step": 1790 + }, + { + "epoch": 0.29, + "learning_rate": 9.65525989742506e-06, + "logits/chosen": -0.768588662147522, + "logits/rejected": -0.7363895177841187, + "logps/chosen": -78.84333038330078, + "logps/rejected": -52.07868194580078, + "loss": 0.5691, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1735466718673706, + "rewards/margins": -0.02110743522644043, + "rewards/rejected": 1.194654107093811, + "step": 1791 + }, + { + "epoch": 0.29, + "learning_rate": 9.65478018299114e-06, + "logits/chosen": -1.1720067262649536, + "logits/rejected": -1.184792160987854, + "logps/chosen": -60.28200912475586, + "logps/rejected": -62.17378234863281, + "loss": 0.7097, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2605175971984863, + "rewards/margins": -0.44012022018432617, + "rewards/rejected": 2.7006378173828125, + "step": 1792 + }, + { + "epoch": 0.29, + "learning_rate": 9.65430014695595e-06, + "logits/chosen": -0.7048981785774231, + "logits/rejected": -0.6718086004257202, + "logps/chosen": -47.05683135986328, + "logps/rejected": -18.955894470214844, + "loss": 0.6135, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5430084466934204, + "rewards/margins": 0.258465975522995, + "rewards/rejected": 0.2845424711704254, + "step": 1793 + }, + { + "epoch": 0.29, + "learning_rate": 9.65381978935266e-06, + "logits/chosen": -1.5566928386688232, + "logits/rejected": -1.4890596866607666, + "logps/chosen": -66.52304077148438, + "logps/rejected": -29.63580894470215, + "loss": 0.4346, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7483291625976562, + "rewards/margins": 1.9253259897232056, + "rewards/rejected": -0.17699681222438812, + "step": 1794 + }, + { + "epoch": 0.29, + "learning_rate": 9.65333911021446e-06, + "logits/chosen": -1.1029826402664185, + "logits/rejected": -0.958783745765686, + "logps/chosen": -98.71916198730469, + "logps/rejected": -78.45511627197266, + "loss": 0.047, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.862179756164551, + "rewards/margins": 2.952796220779419, + "rewards/rejected": 2.909383535385132, + "step": 1795 + }, + { + "epoch": 0.29, + "learning_rate": 9.652858109574554e-06, + "logits/chosen": -0.6014845967292786, + "logits/rejected": -0.6695224642753601, + "logps/chosen": -24.467981338500977, + "logps/rejected": -97.41072082519531, + "loss": 1.0476, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.07017574459314346, + "rewards/margins": -0.862392246723175, + "rewards/rejected": 0.7922164797782898, + "step": 1796 + }, + { + "epoch": 0.29, + "learning_rate": 9.65237678746618e-06, + "logits/chosen": -0.8494687080383301, + "logits/rejected": -0.8946436643600464, + "logps/chosen": -67.30419921875, + "logps/rejected": -85.663818359375, + "loss": 1.1775, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1845078468322754, + "rewards/margins": 1.5131226778030396, + "rewards/rejected": 1.6713851690292358, + "step": 1797 + }, + { + "epoch": 0.29, + "learning_rate": 9.651895143922591e-06, + "logits/chosen": -0.9147045016288757, + "logits/rejected": -0.6573607921600342, + "logps/chosen": -75.95756530761719, + "logps/rejected": -39.65557098388672, + "loss": 0.5287, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.266899824142456, + "rewards/margins": 2.1756980419158936, + "rewards/rejected": 0.0912017822265625, + "step": 1798 + }, + { + "epoch": 0.29, + "learning_rate": 9.651413178977065e-06, + "logits/chosen": -0.8047342300415039, + "logits/rejected": -0.8047342300415039, + "logps/chosen": -45.13993453979492, + "logps/rejected": -45.13993453979492, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5704464316368103, + "rewards/margins": 0.0, + "rewards/rejected": 0.5704464316368103, + "step": 1799 + }, + { + "epoch": 0.29, + "learning_rate": 9.6509308926629e-06, + "logits/chosen": -0.9505744576454163, + "logits/rejected": -1.0362614393234253, + "logps/chosen": -62.25169372558594, + "logps/rejected": -73.4705810546875, + "loss": 1.2391, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0980141162872314, + "rewards/margins": -2.225942373275757, + "rewards/rejected": 4.323956489562988, + "step": 1800 + }, + { + "epoch": 0.29, + "learning_rate": 9.650448285013417e-06, + "logits/chosen": -0.7762270569801331, + "logits/rejected": -0.7856634855270386, + "logps/chosen": -70.76458740234375, + "logps/rejected": -43.960811614990234, + "loss": 0.7468, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2360282987356186, + "rewards/margins": -1.2248947620391846, + "rewards/rejected": 1.4609230756759644, + "step": 1801 + }, + { + "epoch": 0.29, + "learning_rate": 9.649965356061961e-06, + "logits/chosen": -1.1794493198394775, + "logits/rejected": -1.3605856895446777, + "logps/chosen": -75.46855163574219, + "logps/rejected": -36.818058013916016, + "loss": 0.0709, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1107239723205566, + "rewards/margins": 1.944498896598816, + "rewards/rejected": 0.16622506082057953, + "step": 1802 + }, + { + "epoch": 0.29, + "learning_rate": 9.649482105841899e-06, + "logits/chosen": -0.7418447136878967, + "logits/rejected": -0.7650952935218811, + "logps/chosen": -15.654671669006348, + "logps/rejected": -22.237653732299805, + "loss": 0.9441, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3969782888889313, + "rewards/margins": 0.2163800299167633, + "rewards/rejected": 0.18059825897216797, + "step": 1803 + }, + { + "epoch": 0.29, + "learning_rate": 9.648998534386615e-06, + "logits/chosen": -0.5632386803627014, + "logits/rejected": -0.5144709348678589, + "logps/chosen": -48.53093719482422, + "logps/rejected": -57.8464241027832, + "loss": 1.6231, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.408246636390686, + "rewards/margins": -1.195963740348816, + "rewards/rejected": 2.604210376739502, + "step": 1804 + }, + { + "epoch": 0.29, + "learning_rate": 9.648514641729522e-06, + "logits/chosen": -0.6467804312705994, + "logits/rejected": -0.6467804312705994, + "logps/chosen": -51.49900817871094, + "logps/rejected": -51.49900817871094, + "loss": 0.6676, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5636109113693237, + "rewards/margins": 0.0, + "rewards/rejected": 1.5636109113693237, + "step": 1805 + }, + { + "epoch": 0.29, + "learning_rate": 9.648030427904052e-06, + "logits/chosen": -1.08444344997406, + "logits/rejected": -1.0131337642669678, + "logps/chosen": -95.57733154296875, + "logps/rejected": -64.26339721679688, + "loss": 0.1841, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.063559055328369, + "rewards/margins": 1.7493195533752441, + "rewards/rejected": 3.314239501953125, + "step": 1806 + }, + { + "epoch": 0.29, + "learning_rate": 9.64754589294366e-06, + "logits/chosen": -0.9366406798362732, + "logits/rejected": -0.8944382071495056, + "logps/chosen": -75.23420715332031, + "logps/rejected": -47.55654525756836, + "loss": 0.7872, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5041779279708862, + "rewards/margins": 0.19592177867889404, + "rewards/rejected": 1.3082561492919922, + "step": 1807 + }, + { + "epoch": 0.29, + "learning_rate": 9.647061036881821e-06, + "logits/chosen": -0.8384121656417847, + "logits/rejected": -0.8900523781776428, + "logps/chosen": -84.4356460571289, + "logps/rejected": -92.45333862304688, + "loss": 0.2696, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.758124589920044, + "rewards/margins": 0.42644500732421875, + "rewards/rejected": 1.3316795825958252, + "step": 1808 + }, + { + "epoch": 0.29, + "learning_rate": 9.646575859752036e-06, + "logits/chosen": -0.9506332278251648, + "logits/rejected": -0.8019829988479614, + "logps/chosen": -92.51031494140625, + "logps/rejected": -78.54263305664062, + "loss": 0.0903, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.407710552215576, + "rewards/margins": 1.6251649856567383, + "rewards/rejected": 3.782545566558838, + "step": 1809 + }, + { + "epoch": 0.29, + "learning_rate": 9.646090361587828e-06, + "logits/chosen": -1.1511225700378418, + "logits/rejected": -0.8769118785858154, + "logps/chosen": -141.361328125, + "logps/rejected": -97.58281707763672, + "loss": 0.1645, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.758087158203125, + "rewards/margins": 0.9528279304504395, + "rewards/rejected": 4.8052592277526855, + "step": 1810 + }, + { + "epoch": 0.29, + "learning_rate": 9.645604542422734e-06, + "logits/chosen": -1.105326771736145, + "logits/rejected": -1.0036097764968872, + "logps/chosen": -109.87396240234375, + "logps/rejected": -79.16781616210938, + "loss": 0.9193, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3796677589416504, + "rewards/margins": -1.659764289855957, + "rewards/rejected": 5.039432048797607, + "step": 1811 + }, + { + "epoch": 0.29, + "learning_rate": 9.645118402290325e-06, + "logits/chosen": -0.9094854593276978, + "logits/rejected": -0.8843289613723755, + "logps/chosen": -78.06861877441406, + "logps/rejected": -66.91275024414062, + "loss": 0.5497, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0983903408050537, + "rewards/margins": -0.02602839469909668, + "rewards/rejected": 3.1244187355041504, + "step": 1812 + }, + { + "epoch": 0.29, + "learning_rate": 9.644631941224186e-06, + "logits/chosen": -0.9643981456756592, + "logits/rejected": -0.8661947846412659, + "logps/chosen": -210.96788024902344, + "logps/rejected": -58.08285140991211, + "loss": 0.0843, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.283384799957275, + "rewards/margins": 2.544933795928955, + "rewards/rejected": 2.7384510040283203, + "step": 1813 + }, + { + "epoch": 0.29, + "learning_rate": 9.644145159257928e-06, + "logits/chosen": -0.9816256165504456, + "logits/rejected": -0.9213964343070984, + "logps/chosen": -69.8913345336914, + "logps/rejected": -63.26865005493164, + "loss": 0.7582, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0188026428222656, + "rewards/margins": 0.4902377128601074, + "rewards/rejected": 2.528564929962158, + "step": 1814 + }, + { + "epoch": 0.29, + "learning_rate": 9.643658056425183e-06, + "logits/chosen": -1.003422498703003, + "logits/rejected": -1.0160106420516968, + "logps/chosen": -50.78092956542969, + "logps/rejected": -83.01156616210938, + "loss": 0.2737, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.559678792953491, + "rewards/margins": 0.9419381618499756, + "rewards/rejected": 1.6177406311035156, + "step": 1815 + }, + { + "epoch": 0.29, + "learning_rate": 9.643170632759606e-06, + "logits/chosen": -1.1166719198226929, + "logits/rejected": -1.124597430229187, + "logps/chosen": -84.53256225585938, + "logps/rejected": -116.90072631835938, + "loss": 0.0998, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.814822673797607, + "rewards/margins": 2.7789552211761475, + "rewards/rejected": 3.03586745262146, + "step": 1816 + }, + { + "epoch": 0.29, + "learning_rate": 9.642682888294872e-06, + "logits/chosen": -0.8403193354606628, + "logits/rejected": -0.710052490234375, + "logps/chosen": -119.05094146728516, + "logps/rejected": -17.397743225097656, + "loss": 1.1345, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9249565005302429, + "rewards/margins": 0.24813860654830933, + "rewards/rejected": 0.6768178939819336, + "step": 1817 + }, + { + "epoch": 0.3, + "learning_rate": 9.642194823064679e-06, + "logits/chosen": -0.8784768581390381, + "logits/rejected": -0.8280547857284546, + "logps/chosen": -113.28883361816406, + "logps/rejected": -81.17533874511719, + "loss": 0.4591, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6009812355041504, + "rewards/margins": -0.3941948413848877, + "rewards/rejected": 2.995176076889038, + "step": 1818 + }, + { + "epoch": 0.3, + "learning_rate": 9.64170643710275e-06, + "logits/chosen": -1.0433257818222046, + "logits/rejected": -0.9680414199829102, + "logps/chosen": -85.56208038330078, + "logps/rejected": -57.70830535888672, + "loss": 0.8912, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9733543395996094, + "rewards/margins": -0.6791092157363892, + "rewards/rejected": 1.6524635553359985, + "step": 1819 + }, + { + "epoch": 0.3, + "learning_rate": 9.641217730442824e-06, + "logits/chosen": -0.731417715549469, + "logits/rejected": -0.731417715549469, + "logps/chosen": -69.16378784179688, + "logps/rejected": -69.16378784179688, + "loss": 0.5851, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.828448534011841, + "rewards/margins": 0.0, + "rewards/rejected": 2.828448534011841, + "step": 1820 + }, + { + "epoch": 0.3, + "learning_rate": 9.640728703118669e-06, + "logits/chosen": -1.1029462814331055, + "logits/rejected": -1.0914095640182495, + "logps/chosen": -72.04342651367188, + "logps/rejected": -75.54275512695312, + "loss": 0.8676, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8144501447677612, + "rewards/margins": -0.8042587041854858, + "rewards/rejected": 2.618708848953247, + "step": 1821 + }, + { + "epoch": 0.3, + "learning_rate": 9.640239355164074e-06, + "logits/chosen": -1.0495545864105225, + "logits/rejected": -0.9429094791412354, + "logps/chosen": -98.86288452148438, + "logps/rejected": -54.33306884765625, + "loss": 0.8302, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.142662048339844, + "rewards/margins": 4.434937477111816, + "rewards/rejected": 0.7077247500419617, + "step": 1822 + }, + { + "epoch": 0.3, + "learning_rate": 9.639749686612843e-06, + "logits/chosen": -0.6199988722801208, + "logits/rejected": -0.6265919804573059, + "logps/chosen": -60.56932830810547, + "logps/rejected": -69.51996612548828, + "loss": 1.8991, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.618090033531189, + "rewards/margins": -0.03627634048461914, + "rewards/rejected": 1.654366374015808, + "step": 1823 + }, + { + "epoch": 0.3, + "learning_rate": 9.639259697498813e-06, + "logits/chosen": -0.967108428478241, + "logits/rejected": -0.9674409627914429, + "logps/chosen": -12.049579620361328, + "logps/rejected": -2.136889696121216, + "loss": 0.598, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.07292290031909943, + "rewards/margins": -0.41288357973098755, + "rewards/rejected": 0.33996066451072693, + "step": 1824 + }, + { + "epoch": 0.3, + "learning_rate": 9.638769387855833e-06, + "logits/chosen": -1.0841952562332153, + "logits/rejected": -1.0634733438491821, + "logps/chosen": -85.32832336425781, + "logps/rejected": -133.6729278564453, + "loss": 1.2656, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.49917459487915, + "rewards/margins": -2.427670478820801, + "rewards/rejected": 6.926845073699951, + "step": 1825 + }, + { + "epoch": 0.3, + "learning_rate": 9.63827875771778e-06, + "logits/chosen": -1.0861002206802368, + "logits/rejected": -1.0904990434646606, + "logps/chosen": -95.59671020507812, + "logps/rejected": -105.16109466552734, + "loss": 1.5039, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4291741847991943, + "rewards/margins": 0.15139245986938477, + "rewards/rejected": 2.2777817249298096, + "step": 1826 + }, + { + "epoch": 0.3, + "learning_rate": 9.637787807118555e-06, + "logits/chosen": -0.9970555901527405, + "logits/rejected": -0.7424265742301941, + "logps/chosen": -68.012939453125, + "logps/rejected": -18.785179138183594, + "loss": 0.3217, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5747621059417725, + "rewards/margins": 3.4556570053100586, + "rewards/rejected": 0.11910515278577805, + "step": 1827 + }, + { + "epoch": 0.3, + "learning_rate": 9.637296536092076e-06, + "logits/chosen": -1.171716570854187, + "logits/rejected": -1.1573163270950317, + "logps/chosen": -80.78604125976562, + "logps/rejected": -57.03402328491211, + "loss": 0.5729, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2653931379318237, + "rewards/margins": -0.4552501440048218, + "rewards/rejected": 1.7206432819366455, + "step": 1828 + }, + { + "epoch": 0.3, + "learning_rate": 9.636804944672282e-06, + "logits/chosen": -1.2178113460540771, + "logits/rejected": -1.1073969602584839, + "logps/chosen": -30.60940933227539, + "logps/rejected": -39.260040283203125, + "loss": 0.0565, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.111609697341919, + "rewards/margins": 2.171597719192505, + "rewards/rejected": -0.05998802185058594, + "step": 1829 + }, + { + "epoch": 0.3, + "learning_rate": 9.636313032893142e-06, + "logits/chosen": -0.6021499037742615, + "logits/rejected": -0.6242101788520813, + "logps/chosen": -27.615461349487305, + "logps/rejected": -26.06548309326172, + "loss": 0.9647, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7603891491889954, + "rewards/margins": 0.17900735139846802, + "rewards/rejected": 0.5813817977905273, + "step": 1830 + }, + { + "epoch": 0.3, + "learning_rate": 9.63582080078864e-06, + "logits/chosen": -1.307858943939209, + "logits/rejected": -1.2549257278442383, + "logps/chosen": -72.82664489746094, + "logps/rejected": -11.623239517211914, + "loss": 0.7863, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.706823706626892, + "rewards/margins": 1.7226864099502563, + "rewards/rejected": -0.015862656757235527, + "step": 1831 + }, + { + "epoch": 0.3, + "learning_rate": 9.635328248392785e-06, + "logits/chosen": -0.9436384439468384, + "logits/rejected": -0.878287136554718, + "logps/chosen": -50.99189758300781, + "logps/rejected": -76.26042175292969, + "loss": 0.4322, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.112112522125244, + "rewards/margins": 0.1386711597442627, + "rewards/rejected": 1.9734413623809814, + "step": 1832 + }, + { + "epoch": 0.3, + "learning_rate": 9.634835375739611e-06, + "logits/chosen": -0.8765555024147034, + "logits/rejected": -0.6825057864189148, + "logps/chosen": -67.70100402832031, + "logps/rejected": -34.85027313232422, + "loss": 1.657, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0305449962615967, + "rewards/margins": -0.08375978469848633, + "rewards/rejected": 2.114304780960083, + "step": 1833 + }, + { + "epoch": 0.3, + "learning_rate": 9.634342182863163e-06, + "logits/chosen": -0.9225662350654602, + "logits/rejected": -0.8760212063789368, + "logps/chosen": -87.09072875976562, + "logps/rejected": -88.41144561767578, + "loss": 1.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5183351039886475, + "rewards/margins": 0.1897423267364502, + "rewards/rejected": 2.3285927772521973, + "step": 1834 + }, + { + "epoch": 0.3, + "learning_rate": 9.633848669797524e-06, + "logits/chosen": -0.8281547427177429, + "logits/rejected": -0.8181195855140686, + "logps/chosen": -41.09572219848633, + "logps/rejected": -87.89376831054688, + "loss": 0.6194, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2537167072296143, + "rewards/margins": 0.372849702835083, + "rewards/rejected": 0.8808670043945312, + "step": 1835 + }, + { + "epoch": 0.3, + "learning_rate": 9.633354836576787e-06, + "logits/chosen": -1.1721243858337402, + "logits/rejected": -1.098585844039917, + "logps/chosen": -136.2566375732422, + "logps/rejected": -105.6405258178711, + "loss": 1.2081, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.01779317855835, + "rewards/margins": -0.5719718933105469, + "rewards/rejected": 6.5897650718688965, + "step": 1836 + }, + { + "epoch": 0.3, + "learning_rate": 9.632860683235072e-06, + "logits/chosen": -0.9580867290496826, + "logits/rejected": -0.9522863030433655, + "logps/chosen": -48.30051803588867, + "logps/rejected": -95.99837493896484, + "loss": 0.4992, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3385429382324219, + "rewards/margins": -0.37262654304504395, + "rewards/rejected": 1.7111694812774658, + "step": 1837 + }, + { + "epoch": 0.3, + "learning_rate": 9.63236620980652e-06, + "logits/chosen": -0.9587228894233704, + "logits/rejected": -0.9105318784713745, + "logps/chosen": -66.17439270019531, + "logps/rejected": -77.17416381835938, + "loss": 1.2707, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.373478651046753, + "rewards/margins": -1.109445333480835, + "rewards/rejected": 3.482923984527588, + "step": 1838 + }, + { + "epoch": 0.3, + "learning_rate": 9.631871416325295e-06, + "logits/chosen": -0.9982432126998901, + "logits/rejected": -1.0341758728027344, + "logps/chosen": -74.85069274902344, + "logps/rejected": -37.90330505371094, + "loss": 0.5979, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2734482288360596, + "rewards/margins": -0.7581086158752441, + "rewards/rejected": 2.0315568447113037, + "step": 1839 + }, + { + "epoch": 0.3, + "learning_rate": 9.631376302825581e-06, + "logits/chosen": -1.152169108390808, + "logits/rejected": -1.0144740343093872, + "logps/chosen": -85.70866394042969, + "logps/rejected": -74.96035766601562, + "loss": 0.0432, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.421072483062744, + "rewards/margins": 2.571791172027588, + "rewards/rejected": 1.8492813110351562, + "step": 1840 + }, + { + "epoch": 0.3, + "learning_rate": 9.63088086934159e-06, + "logits/chosen": -1.0380079746246338, + "logits/rejected": -1.0389304161071777, + "logps/chosen": -2.9087975025177, + "logps/rejected": -1.3864785432815552, + "loss": 1.5389, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.13575060665607452, + "rewards/margins": -0.17454339563846588, + "rewards/rejected": 0.3102940022945404, + "step": 1841 + }, + { + "epoch": 0.3, + "learning_rate": 9.630385115907545e-06, + "logits/chosen": -1.098930835723877, + "logits/rejected": -1.0051050186157227, + "logps/chosen": -105.13497924804688, + "logps/rejected": -46.24329376220703, + "loss": 0.3426, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7845093011856079, + "rewards/margins": 0.6531215906143188, + "rewards/rejected": 0.13138771057128906, + "step": 1842 + }, + { + "epoch": 0.3, + "learning_rate": 9.629889042557706e-06, + "logits/chosen": -1.0591915845870972, + "logits/rejected": -1.100460410118103, + "logps/chosen": -71.69743347167969, + "logps/rejected": -105.61981201171875, + "loss": 0.8386, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9331451654434204, + "rewards/margins": -0.8929992914199829, + "rewards/rejected": 2.8261444568634033, + "step": 1843 + }, + { + "epoch": 0.3, + "learning_rate": 9.62939264932634e-06, + "logits/chosen": -0.33867180347442627, + "logits/rejected": -0.33867180347442627, + "logps/chosen": -51.90278244018555, + "logps/rejected": -51.90278244018555, + "loss": 0.395, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.46411019563674927, + "rewards/margins": 0.0, + "rewards/rejected": 0.46411019563674927, + "step": 1844 + }, + { + "epoch": 0.3, + "learning_rate": 9.628895936247744e-06, + "logits/chosen": -0.9876172542572021, + "logits/rejected": -0.9022176861763, + "logps/chosen": -46.63355255126953, + "logps/rejected": -27.446800231933594, + "loss": 0.7883, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4635505676269531, + "rewards/margins": -0.46146321296691895, + "rewards/rejected": 1.925013780593872, + "step": 1845 + }, + { + "epoch": 0.3, + "learning_rate": 9.628398903356239e-06, + "logits/chosen": -1.2415319681167603, + "logits/rejected": -1.2415319681167603, + "logps/chosen": -37.86091232299805, + "logps/rejected": -37.86091232299805, + "loss": 0.7807, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6369526386260986, + "rewards/margins": 0.0, + "rewards/rejected": 2.6369526386260986, + "step": 1846 + }, + { + "epoch": 0.3, + "learning_rate": 9.627901550686166e-06, + "logits/chosen": -1.027929663658142, + "logits/rejected": -1.0744746923446655, + "logps/chosen": -91.64942932128906, + "logps/rejected": -106.1601791381836, + "loss": 2.2002, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4116761684417725, + "rewards/margins": -2.633207082748413, + "rewards/rejected": 5.0448832511901855, + "step": 1847 + }, + { + "epoch": 0.3, + "learning_rate": 9.627403878271883e-06, + "logits/chosen": -1.5066051483154297, + "logits/rejected": -1.5500046014785767, + "logps/chosen": -43.31195068359375, + "logps/rejected": -89.37191009521484, + "loss": 1.4216, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2929160594940186, + "rewards/margins": -0.09717416763305664, + "rewards/rejected": 2.390090227127075, + "step": 1848 + }, + { + "epoch": 0.3, + "learning_rate": 9.62690588614778e-06, + "logits/chosen": -0.7710950374603271, + "logits/rejected": -0.7340584993362427, + "logps/chosen": -23.64618682861328, + "logps/rejected": -5.712128639221191, + "loss": 0.4175, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5739232897758484, + "rewards/margins": 0.13329467177391052, + "rewards/rejected": 0.44062861800193787, + "step": 1849 + }, + { + "epoch": 0.3, + "learning_rate": 9.626407574348258e-06, + "logits/chosen": -1.27455735206604, + "logits/rejected": -1.1637212038040161, + "logps/chosen": -65.73963928222656, + "logps/rejected": -37.24396514892578, + "loss": 0.5329, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9671753644943237, + "rewards/margins": 1.8162175416946411, + "rewards/rejected": 0.1509578675031662, + "step": 1850 + }, + { + "epoch": 0.3, + "learning_rate": 9.625908942907748e-06, + "logits/chosen": -1.5113646984100342, + "logits/rejected": -1.4401795864105225, + "logps/chosen": -58.928184509277344, + "logps/rejected": -69.73104858398438, + "loss": 1.7205, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8295845985412598, + "rewards/margins": -2.780646324157715, + "rewards/rejected": 5.610230922698975, + "step": 1851 + }, + { + "epoch": 0.3, + "learning_rate": 9.625409991860701e-06, + "logits/chosen": -0.7786346673965454, + "logits/rejected": -0.7888972759246826, + "logps/chosen": -34.433067321777344, + "logps/rejected": -35.07779312133789, + "loss": 0.6377, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.26863938570022583, + "rewards/margins": -0.7624068856239319, + "rewards/rejected": 1.0310462713241577, + "step": 1852 + }, + { + "epoch": 0.3, + "learning_rate": 9.62491072124159e-06, + "logits/chosen": -0.6966411471366882, + "logits/rejected": -0.6966411471366882, + "logps/chosen": -63.24000549316406, + "logps/rejected": -63.24000549316406, + "loss": 0.7322, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1844924688339233, + "rewards/margins": 0.0, + "rewards/rejected": 1.1844924688339233, + "step": 1853 + }, + { + "epoch": 0.3, + "learning_rate": 9.62441113108491e-06, + "logits/chosen": -1.129305362701416, + "logits/rejected": -0.9915350079536438, + "logps/chosen": -133.39874267578125, + "logps/rejected": -86.93287658691406, + "loss": 0.2784, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.601115703582764, + "rewards/margins": 0.5041413307189941, + "rewards/rejected": 4.0969743728637695, + "step": 1854 + }, + { + "epoch": 0.3, + "learning_rate": 9.623911221425176e-06, + "logits/chosen": -0.9793438911437988, + "logits/rejected": -1.0458699464797974, + "logps/chosen": -84.3607406616211, + "logps/rejected": -125.05601501464844, + "loss": 1.7707, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3233451843261719, + "rewards/margins": -2.7781424522399902, + "rewards/rejected": 4.101487636566162, + "step": 1855 + }, + { + "epoch": 0.3, + "learning_rate": 9.62341099229693e-06, + "logits/chosen": -0.827928900718689, + "logits/rejected": -0.6283918619155884, + "logps/chosen": -146.36280822753906, + "logps/rejected": -65.77592468261719, + "loss": 0.673, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.974723815917969, + "rewards/margins": 0.7694306373596191, + "rewards/rejected": 4.20529317855835, + "step": 1856 + }, + { + "epoch": 0.3, + "learning_rate": 9.62291044373473e-06, + "logits/chosen": -1.1996270418167114, + "logits/rejected": -1.1148488521575928, + "logps/chosen": -81.69352722167969, + "logps/rejected": -152.748779296875, + "loss": 2.1238, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.181640625, + "rewards/margins": -4.223565578460693, + "rewards/rejected": 5.405206203460693, + "step": 1857 + }, + { + "epoch": 0.3, + "learning_rate": 9.622409575773162e-06, + "logits/chosen": -1.184037446975708, + "logits/rejected": -1.1719110012054443, + "logps/chosen": -19.1533260345459, + "logps/rejected": -20.24823760986328, + "loss": 0.3251, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.608139991760254, + "rewards/margins": 0.9271625280380249, + "rewards/rejected": 0.680977463722229, + "step": 1858 + }, + { + "epoch": 0.3, + "learning_rate": 9.62190838844683e-06, + "logits/chosen": -0.5046049952507019, + "logits/rejected": -0.5271033644676208, + "logps/chosen": -9.591828346252441, + "logps/rejected": -32.520347595214844, + "loss": 0.8152, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.21712541580200195, + "rewards/margins": -0.44012176990509033, + "rewards/rejected": 0.6572471857070923, + "step": 1859 + }, + { + "epoch": 0.3, + "learning_rate": 9.62140688179036e-06, + "logits/chosen": -0.847389280796051, + "logits/rejected": -0.8858689069747925, + "logps/chosen": -38.41633605957031, + "logps/rejected": -98.4024658203125, + "loss": 0.1018, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.332844614982605, + "rewards/margins": 1.540889024734497, + "rewards/rejected": -0.20804443955421448, + "step": 1860 + }, + { + "epoch": 0.3, + "learning_rate": 9.620905055838402e-06, + "logits/chosen": -0.8937880396842957, + "logits/rejected": -0.8358320593833923, + "logps/chosen": -53.67329406738281, + "logps/rejected": -47.326744079589844, + "loss": 0.1635, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.477037191390991, + "rewards/margins": 1.0661637783050537, + "rewards/rejected": 1.4108734130859375, + "step": 1861 + }, + { + "epoch": 0.3, + "learning_rate": 9.620402910625632e-06, + "logits/chosen": -1.1127989292144775, + "logits/rejected": -1.1694185733795166, + "logps/chosen": -48.475242614746094, + "logps/rejected": -122.3909683227539, + "loss": 0.4194, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7999916076660156, + "rewards/margins": 0.39251554012298584, + "rewards/rejected": 1.4074760675430298, + "step": 1862 + }, + { + "epoch": 0.3, + "learning_rate": 9.619900446186737e-06, + "logits/chosen": -1.4769055843353271, + "logits/rejected": -1.2535593509674072, + "logps/chosen": -140.8135223388672, + "logps/rejected": -19.785247802734375, + "loss": 0.0274, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.768662929534912, + "rewards/margins": 6.328671932220459, + "rewards/rejected": 0.4399909973144531, + "step": 1863 + }, + { + "epoch": 0.3, + "learning_rate": 9.619397662556434e-06, + "logits/chosen": -1.1340819597244263, + "logits/rejected": -1.0907877683639526, + "logps/chosen": -81.71064758300781, + "logps/rejected": -80.85891723632812, + "loss": 0.8824, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.569364309310913, + "rewards/margins": -0.19324707984924316, + "rewards/rejected": 2.7626113891601562, + "step": 1864 + }, + { + "epoch": 0.3, + "learning_rate": 9.618894559769462e-06, + "logits/chosen": -0.8928940892219543, + "logits/rejected": -0.9309033155441284, + "logps/chosen": -59.26753234863281, + "logps/rejected": -73.70785522460938, + "loss": 1.2494, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.017527390271425247, + "rewards/margins": -1.8226902484893799, + "rewards/rejected": 1.8402175903320312, + "step": 1865 + }, + { + "epoch": 0.3, + "learning_rate": 9.618391137860583e-06, + "logits/chosen": -0.7868567705154419, + "logits/rejected": -0.8179081678390503, + "logps/chosen": -34.29091262817383, + "logps/rejected": -62.94230651855469, + "loss": 0.4437, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8948360681533813, + "rewards/margins": 0.23125571012496948, + "rewards/rejected": 0.6635803580284119, + "step": 1866 + }, + { + "epoch": 0.3, + "learning_rate": 9.617887396864574e-06, + "logits/chosen": -0.5141587257385254, + "logits/rejected": -0.5081595778465271, + "logps/chosen": -13.095849990844727, + "logps/rejected": -5.606540203094482, + "loss": 0.7283, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.07157526165246964, + "rewards/margins": -0.20879793167114258, + "rewards/rejected": 0.13722267746925354, + "step": 1867 + }, + { + "epoch": 0.3, + "learning_rate": 9.61738333681624e-06, + "logits/chosen": -0.9525498747825623, + "logits/rejected": -0.7580363750457764, + "logps/chosen": -69.5611572265625, + "logps/rejected": -5.038984775543213, + "loss": 1.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8985695838928223, + "rewards/margins": 2.2050278186798096, + "rewards/rejected": 0.6935417652130127, + "step": 1868 + }, + { + "epoch": 0.3, + "learning_rate": 9.616878957750409e-06, + "logits/chosen": -0.9980339407920837, + "logits/rejected": -0.9980339407920837, + "logps/chosen": -66.09184265136719, + "logps/rejected": -66.09184265136719, + "loss": 1.046, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5797975063323975, + "rewards/margins": 0.0, + "rewards/rejected": 2.5797975063323975, + "step": 1869 + }, + { + "epoch": 0.3, + "learning_rate": 9.616374259701927e-06, + "logits/chosen": -1.159959077835083, + "logits/rejected": -1.3315403461456299, + "logps/chosen": -72.00224304199219, + "logps/rejected": -99.00016784667969, + "loss": 3.2986, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5554405450820923, + "rewards/margins": -6.006258487701416, + "rewards/rejected": 7.561698913574219, + "step": 1870 + }, + { + "epoch": 0.3, + "learning_rate": 9.615869242705664e-06, + "logits/chosen": -1.2578924894332886, + "logits/rejected": -1.1798522472381592, + "logps/chosen": -134.34173583984375, + "logps/rejected": -87.01461029052734, + "loss": 0.307, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4372406005859375, + "rewards/margins": 2.1167945861816406, + "rewards/rejected": 1.3204460144042969, + "step": 1871 + }, + { + "epoch": 0.3, + "learning_rate": 9.61536390679651e-06, + "logits/chosen": -0.9648942947387695, + "logits/rejected": -0.9648942947387695, + "logps/chosen": -40.08668518066406, + "logps/rejected": -40.08668518066406, + "loss": 0.5534, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.287524461746216, + "rewards/margins": 0.0, + "rewards/rejected": 3.287524461746216, + "step": 1872 + }, + { + "epoch": 0.3, + "learning_rate": 9.614858252009385e-06, + "logits/chosen": -0.7060236930847168, + "logits/rejected": -0.6739920973777771, + "logps/chosen": -42.600685119628906, + "logps/rejected": -5.249091148376465, + "loss": 1.3449, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28347817063331604, + "rewards/margins": 0.022110790014266968, + "rewards/rejected": 0.2613673806190491, + "step": 1873 + }, + { + "epoch": 0.3, + "learning_rate": 9.614352278379217e-06, + "logits/chosen": -0.9399387836456299, + "logits/rejected": -0.7951427102088928, + "logps/chosen": -49.39914321899414, + "logps/rejected": -30.544775009155273, + "loss": 0.3318, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.437861204147339, + "rewards/margins": 0.10854458808898926, + "rewards/rejected": 2.3293166160583496, + "step": 1874 + }, + { + "epoch": 0.3, + "learning_rate": 9.613845985940971e-06, + "logits/chosen": -1.3494572639465332, + "logits/rejected": -1.1716221570968628, + "logps/chosen": -98.41807556152344, + "logps/rejected": -26.068559646606445, + "loss": 0.0449, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.328845500946045, + "rewards/margins": 5.267720699310303, + "rewards/rejected": 0.061124611645936966, + "step": 1875 + }, + { + "epoch": 0.3, + "learning_rate": 9.613339374729622e-06, + "logits/chosen": -0.4113155007362366, + "logits/rejected": -0.40280216932296753, + "logps/chosen": -10.334577560424805, + "logps/rejected": -19.931949615478516, + "loss": 1.3809, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34085360169410706, + "rewards/margins": 0.15762053430080414, + "rewards/rejected": 0.18323306739330292, + "step": 1876 + }, + { + "epoch": 0.3, + "learning_rate": 9.612832444780175e-06, + "logits/chosen": -1.0046182870864868, + "logits/rejected": -1.0139753818511963, + "logps/chosen": -124.81976318359375, + "logps/rejected": -55.39814758300781, + "loss": 0.4838, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.66796875, + "rewards/margins": -0.4810006618499756, + "rewards/rejected": 2.1489694118499756, + "step": 1877 + }, + { + "epoch": 0.3, + "learning_rate": 9.612325196127654e-06, + "logits/chosen": -0.7326586246490479, + "logits/rejected": -0.7233970165252686, + "logps/chosen": -10.70312213897705, + "logps/rejected": -25.242897033691406, + "loss": 0.5689, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7686080932617188, + "rewards/margins": 0.0338519811630249, + "rewards/rejected": 1.7347561120986938, + "step": 1878 + }, + { + "epoch": 0.3, + "learning_rate": 9.611817628807104e-06, + "logits/chosen": -0.8586947917938232, + "logits/rejected": -0.809317409992218, + "logps/chosen": -22.8277645111084, + "logps/rejected": -67.89388275146484, + "loss": 0.6078, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7136491537094116, + "rewards/margins": -0.7067939043045044, + "rewards/rejected": 2.420443058013916, + "step": 1879 + }, + { + "epoch": 0.31, + "learning_rate": 9.611309742853592e-06, + "logits/chosen": -0.6435106992721558, + "logits/rejected": -0.6435106992721558, + "logps/chosen": -68.78214263916016, + "logps/rejected": -68.78214263916016, + "loss": 0.3941, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1499505043029785, + "rewards/margins": 0.0, + "rewards/rejected": 2.1499505043029785, + "step": 1880 + }, + { + "epoch": 0.31, + "learning_rate": 9.610801538302208e-06, + "logits/chosen": -0.7655181884765625, + "logits/rejected": -0.7648460865020752, + "logps/chosen": -103.28839111328125, + "logps/rejected": -184.10963439941406, + "loss": 0.367, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7527649402618408, + "rewards/margins": 0.6818863153457642, + "rewards/rejected": 1.0708786249160767, + "step": 1881 + }, + { + "epoch": 0.31, + "learning_rate": 9.610293015188067e-06, + "logits/chosen": -1.4060090780258179, + "logits/rejected": -1.3461365699768066, + "logps/chosen": -40.435638427734375, + "logps/rejected": -72.20767211914062, + "loss": 0.5646, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2294952869415283, + "rewards/margins": -0.5277016162872314, + "rewards/rejected": 2.7571969032287598, + "step": 1882 + }, + { + "epoch": 0.31, + "learning_rate": 9.609784173546304e-06, + "logits/chosen": -1.5484915971755981, + "logits/rejected": -1.495413064956665, + "logps/chosen": -140.40615844726562, + "logps/rejected": -152.5809326171875, + "loss": 0.4403, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.103883266448975, + "rewards/margins": -0.2946901321411133, + "rewards/rejected": 7.398573398590088, + "step": 1883 + }, + { + "epoch": 0.31, + "learning_rate": 9.60927501341207e-06, + "logits/chosen": -0.833404004573822, + "logits/rejected": -0.833404004573822, + "logps/chosen": -86.55524444580078, + "logps/rejected": -86.55524444580078, + "loss": 0.4593, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4490073919296265, + "rewards/margins": 0.0, + "rewards/rejected": 1.4490073919296265, + "step": 1884 + }, + { + "epoch": 0.31, + "learning_rate": 9.608765534820548e-06, + "logits/chosen": -0.7454236149787903, + "logits/rejected": -0.7690023183822632, + "logps/chosen": -61.631187438964844, + "logps/rejected": -40.957237243652344, + "loss": 0.4772, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7770423889160156, + "rewards/margins": -0.3379685878753662, + "rewards/rejected": 3.115010976791382, + "step": 1885 + }, + { + "epoch": 0.31, + "learning_rate": 9.608255737806933e-06, + "logits/chosen": -0.833346962928772, + "logits/rejected": -0.8044085502624512, + "logps/chosen": -56.693138122558594, + "logps/rejected": -78.2637939453125, + "loss": 0.8329, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0151634216308594, + "rewards/margins": 0.23155516386032104, + "rewards/rejected": 0.7836082577705383, + "step": 1886 + }, + { + "epoch": 0.31, + "learning_rate": 9.607745622406453e-06, + "logits/chosen": -1.1913869380950928, + "logits/rejected": -1.1474332809448242, + "logps/chosen": -184.2238006591797, + "logps/rejected": -87.24309539794922, + "loss": 1.4955, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.7193589210510254, + "rewards/margins": -2.0372185707092285, + "rewards/rejected": 5.756577491760254, + "step": 1887 + }, + { + "epoch": 0.31, + "learning_rate": 9.607235188654348e-06, + "logits/chosen": -0.9736972451210022, + "logits/rejected": -1.0752637386322021, + "logps/chosen": -81.37257385253906, + "logps/rejected": -112.30401611328125, + "loss": 3.3446, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3163269758224487, + "rewards/margins": -2.232412815093994, + "rewards/rejected": 3.5487396717071533, + "step": 1888 + }, + { + "epoch": 0.31, + "learning_rate": 9.606724436585885e-06, + "logits/chosen": -0.7206161618232727, + "logits/rejected": -0.7263526916503906, + "logps/chosen": -81.522216796875, + "logps/rejected": -61.58396911621094, + "loss": 0.7687, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.312353491783142, + "rewards/margins": -0.4815812110900879, + "rewards/rejected": 1.79393470287323, + "step": 1889 + }, + { + "epoch": 0.31, + "learning_rate": 9.606213366236354e-06, + "logits/chosen": -0.8458192348480225, + "logits/rejected": -0.8375053405761719, + "logps/chosen": -30.98218536376953, + "logps/rejected": -78.37625122070312, + "loss": 0.5397, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.078558325767517, + "rewards/margins": -0.34023594856262207, + "rewards/rejected": 1.4187942743301392, + "step": 1890 + }, + { + "epoch": 0.31, + "learning_rate": 9.605701977641065e-06, + "logits/chosen": -0.6406740546226501, + "logits/rejected": -0.6828052997589111, + "logps/chosen": -43.77001953125, + "logps/rejected": -55.01743698120117, + "loss": 0.3029, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9883171319961548, + "rewards/margins": 0.40474361181259155, + "rewards/rejected": 0.5835735201835632, + "step": 1891 + }, + { + "epoch": 0.31, + "learning_rate": 9.605190270835348e-06, + "logits/chosen": -0.9220983982086182, + "logits/rejected": -0.9123520851135254, + "logps/chosen": -47.796539306640625, + "logps/rejected": -80.91233825683594, + "loss": 1.2016, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9401931762695312, + "rewards/margins": -2.1696290969848633, + "rewards/rejected": 4.1098222732543945, + "step": 1892 + }, + { + "epoch": 0.31, + "learning_rate": 9.604678245854557e-06, + "logits/chosen": -0.5980445742607117, + "logits/rejected": -0.5980445742607117, + "logps/chosen": -26.579822540283203, + "logps/rejected": -26.579822540283203, + "loss": 0.3693, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3228973150253296, + "rewards/margins": 0.0, + "rewards/rejected": 1.3228973150253296, + "step": 1893 + }, + { + "epoch": 0.31, + "learning_rate": 9.604165902734069e-06, + "logits/chosen": -1.3130496740341187, + "logits/rejected": -1.2316242456436157, + "logps/chosen": -108.04296875, + "logps/rejected": -95.024169921875, + "loss": 0.4702, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.062771797180176, + "rewards/margins": 0.5039262771606445, + "rewards/rejected": 3.5588455200195312, + "step": 1894 + }, + { + "epoch": 0.31, + "learning_rate": 9.603653241509282e-06, + "logits/chosen": -0.5225266814231873, + "logits/rejected": -0.5279590487480164, + "logps/chosen": -94.53877258300781, + "logps/rejected": -53.8785514831543, + "loss": 0.6747, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0620170831680298, + "rewards/margins": -0.6107692718505859, + "rewards/rejected": 1.6727863550186157, + "step": 1895 + }, + { + "epoch": 0.31, + "learning_rate": 9.603140262215617e-06, + "logits/chosen": -1.1980700492858887, + "logits/rejected": -1.1980634927749634, + "logps/chosen": -34.387176513671875, + "logps/rejected": -53.62235641479492, + "loss": 1.1668, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9984733462333679, + "rewards/margins": -2.1806225776672363, + "rewards/rejected": 3.179095983505249, + "step": 1896 + }, + { + "epoch": 0.31, + "learning_rate": 9.602626964888515e-06, + "logits/chosen": -0.9674066305160522, + "logits/rejected": -0.9492599368095398, + "logps/chosen": -101.94945526123047, + "logps/rejected": -61.92754364013672, + "loss": 0.4929, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9712837338447571, + "rewards/margins": -0.497596800327301, + "rewards/rejected": 1.468880534172058, + "step": 1897 + }, + { + "epoch": 0.31, + "learning_rate": 9.60211334956344e-06, + "logits/chosen": -1.0117312669754028, + "logits/rejected": -1.0415823459625244, + "logps/chosen": -79.34530639648438, + "logps/rejected": -86.50189208984375, + "loss": 1.1024, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4077430963516235, + "rewards/margins": -1.1173135042190552, + "rewards/rejected": 2.5250566005706787, + "step": 1898 + }, + { + "epoch": 0.31, + "learning_rate": 9.601599416275877e-06, + "logits/chosen": -1.1718101501464844, + "logits/rejected": -1.1841168403625488, + "logps/chosen": -79.84701538085938, + "logps/rejected": -80.89090728759766, + "loss": 0.3518, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.413012742996216, + "rewards/margins": 0.3870224952697754, + "rewards/rejected": 2.0259902477264404, + "step": 1899 + }, + { + "epoch": 0.31, + "learning_rate": 9.601085165061337e-06, + "logits/chosen": -1.2427397966384888, + "logits/rejected": -1.3803229331970215, + "logps/chosen": -92.6840591430664, + "logps/rejected": -37.08335494995117, + "loss": 0.2004, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3181099891662598, + "rewards/margins": 1.9771313667297363, + "rewards/rejected": 0.34097862243652344, + "step": 1900 + }, + { + "epoch": 0.31, + "learning_rate": 9.600570595955347e-06, + "logits/chosen": -0.8396321535110474, + "logits/rejected": -0.6653401851654053, + "logps/chosen": -62.580257415771484, + "logps/rejected": -27.03944206237793, + "loss": 0.162, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6721912622451782, + "rewards/margins": 2.0956201553344727, + "rewards/rejected": -0.4234289228916168, + "step": 1901 + }, + { + "epoch": 0.31, + "learning_rate": 9.600055708993462e-06, + "logits/chosen": -1.2442811727523804, + "logits/rejected": -0.978288471698761, + "logps/chosen": -133.79934692382812, + "logps/rejected": -76.69670867919922, + "loss": 0.0793, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.128686428070068, + "rewards/margins": 1.9089856147766113, + "rewards/rejected": 4.219700813293457, + "step": 1902 + }, + { + "epoch": 0.31, + "learning_rate": 9.59954050421125e-06, + "logits/chosen": -0.7747288942337036, + "logits/rejected": -0.6311934590339661, + "logps/chosen": -47.28302764892578, + "logps/rejected": -29.77897834777832, + "loss": 0.0638, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2199509143829346, + "rewards/margins": 3.2859647274017334, + "rewards/rejected": -0.06601390987634659, + "step": 1903 + }, + { + "epoch": 0.31, + "learning_rate": 9.599024981644314e-06, + "logits/chosen": -1.0639138221740723, + "logits/rejected": -0.9887779355049133, + "logps/chosen": -149.70761108398438, + "logps/rejected": -68.69697570800781, + "loss": 0.5046, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.490325927734375, + "rewards/margins": 0.01546788215637207, + "rewards/rejected": 2.474858045578003, + "step": 1904 + }, + { + "epoch": 0.31, + "learning_rate": 9.598509141328265e-06, + "logits/chosen": -0.7607774138450623, + "logits/rejected": -0.7533985376358032, + "logps/chosen": -52.02259063720703, + "logps/rejected": -96.80892944335938, + "loss": 1.0029, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5313994884490967, + "rewards/margins": -1.8282349109649658, + "rewards/rejected": 4.3596343994140625, + "step": 1905 + }, + { + "epoch": 0.31, + "learning_rate": 9.597992983298748e-06, + "logits/chosen": -0.6533074378967285, + "logits/rejected": -0.6252499222755432, + "logps/chosen": -40.22774887084961, + "logps/rejected": -46.15882873535156, + "loss": 0.178, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9496867656707764, + "rewards/margins": 1.2380915880203247, + "rewards/rejected": 1.7115951776504517, + "step": 1906 + }, + { + "epoch": 0.31, + "learning_rate": 9.597476507591421e-06, + "logits/chosen": -1.4088099002838135, + "logits/rejected": -1.1600059270858765, + "logps/chosen": -144.832763671875, + "logps/rejected": -57.12474060058594, + "loss": 0.3394, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.327545166015625, + "rewards/margins": 0.5871856212615967, + "rewards/rejected": 3.7403595447540283, + "step": 1907 + }, + { + "epoch": 0.31, + "learning_rate": 9.59695971424197e-06, + "logits/chosen": -1.2309175729751587, + "logits/rejected": -1.2496109008789062, + "logps/chosen": -74.63092803955078, + "logps/rejected": -23.081186294555664, + "loss": 0.1886, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.412506103515625, + "rewards/margins": 1.1243693828582764, + "rewards/rejected": 0.28813669085502625, + "step": 1908 + }, + { + "epoch": 0.31, + "learning_rate": 9.5964426032861e-06, + "logits/chosen": -1.2467347383499146, + "logits/rejected": -1.2510287761688232, + "logps/chosen": -92.08796691894531, + "logps/rejected": -77.88841247558594, + "loss": 3.8316, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.243194580078125, + "rewards/margins": -0.8700273036956787, + "rewards/rejected": 2.1132218837738037, + "step": 1909 + }, + { + "epoch": 0.31, + "learning_rate": 9.595925174759537e-06, + "logits/chosen": -0.7712922096252441, + "logits/rejected": -0.4794634282588959, + "logps/chosen": -105.67893981933594, + "logps/rejected": -29.523635864257812, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.047463893890381, + "rewards/margins": 4.861835479736328, + "rewards/rejected": 1.1856285333633423, + "step": 1910 + }, + { + "epoch": 0.31, + "learning_rate": 9.595407428698032e-06, + "logits/chosen": -1.0798786878585815, + "logits/rejected": -0.8035660982131958, + "logps/chosen": -160.07606506347656, + "logps/rejected": -34.749107360839844, + "loss": 0.19, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.669050693511963, + "rewards/margins": 3.4529969692230225, + "rewards/rejected": 0.216053768992424, + "step": 1911 + }, + { + "epoch": 0.31, + "learning_rate": 9.594889365137354e-06, + "logits/chosen": -0.8413652777671814, + "logits/rejected": -0.8385012745857239, + "logps/chosen": -53.11299133300781, + "logps/rejected": -98.85237121582031, + "loss": 0.3675, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6857513189315796, + "rewards/margins": 1.1167014837265015, + "rewards/rejected": 0.5690498352050781, + "step": 1912 + }, + { + "epoch": 0.31, + "learning_rate": 9.594370984113302e-06, + "logits/chosen": -0.9750614166259766, + "logits/rejected": -0.9602479338645935, + "logps/chosen": -93.7618408203125, + "logps/rejected": -62.11518478393555, + "loss": 0.3441, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4223618507385254, + "rewards/margins": 1.3397351503372192, + "rewards/rejected": 1.0826267004013062, + "step": 1913 + }, + { + "epoch": 0.31, + "learning_rate": 9.593852285661684e-06, + "logits/chosen": -1.0127099752426147, + "logits/rejected": -0.9780897498130798, + "logps/chosen": -89.14892578125, + "logps/rejected": -62.43577575683594, + "loss": 1.5706, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1563034057617188, + "rewards/margins": -0.16500556468963623, + "rewards/rejected": 1.321308970451355, + "step": 1914 + }, + { + "epoch": 0.31, + "learning_rate": 9.593333269818341e-06, + "logits/chosen": -1.044764518737793, + "logits/rejected": -1.0320507287979126, + "logps/chosen": -52.54404067993164, + "logps/rejected": -35.243858337402344, + "loss": 1.1994, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3696049451828003, + "rewards/margins": -0.6180862188339233, + "rewards/rejected": 1.9876911640167236, + "step": 1915 + }, + { + "epoch": 0.31, + "learning_rate": 9.592813936619132e-06, + "logits/chosen": -0.9289833307266235, + "logits/rejected": -0.8593135476112366, + "logps/chosen": -65.42001342773438, + "logps/rejected": -55.229522705078125, + "loss": 0.412, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4269814491271973, + "rewards/margins": 0.3587806224822998, + "rewards/rejected": 2.0682008266448975, + "step": 1916 + }, + { + "epoch": 0.31, + "learning_rate": 9.592294286099938e-06, + "logits/chosen": -0.995883047580719, + "logits/rejected": -0.9384059906005859, + "logps/chosen": -105.88569641113281, + "logps/rejected": -65.68350219726562, + "loss": 0.3648, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.222673177719116, + "rewards/margins": 1.5190002918243408, + "rewards/rejected": 0.7036728262901306, + "step": 1917 + }, + { + "epoch": 0.31, + "learning_rate": 9.591774318296661e-06, + "logits/chosen": -0.9940969944000244, + "logits/rejected": -1.0875548124313354, + "logps/chosen": -65.34370422363281, + "logps/rejected": -128.61024475097656, + "loss": 1.0156, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6771697998046875, + "rewards/margins": -1.4110076427459717, + "rewards/rejected": 3.088177442550659, + "step": 1918 + }, + { + "epoch": 0.31, + "learning_rate": 9.591254033245228e-06, + "logits/chosen": -1.030066728591919, + "logits/rejected": -1.0031925439834595, + "logps/chosen": -47.60548782348633, + "logps/rejected": -147.48370361328125, + "loss": 1.0451, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.198338985443115, + "rewards/margins": -1.8416423797607422, + "rewards/rejected": 6.039981365203857, + "step": 1919 + }, + { + "epoch": 0.31, + "learning_rate": 9.590733430981583e-06, + "logits/chosen": -1.2039252519607544, + "logits/rejected": -1.2231231927871704, + "logps/chosen": -135.67953491210938, + "logps/rejected": -131.66978454589844, + "loss": 1.6064, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1466827392578125, + "rewards/margins": 0.2332366704940796, + "rewards/rejected": 1.913446068763733, + "step": 1920 + }, + { + "epoch": 0.31, + "learning_rate": 9.590212511541694e-06, + "logits/chosen": -1.0784223079681396, + "logits/rejected": -1.1027755737304688, + "logps/chosen": -123.2080307006836, + "logps/rejected": -109.78763580322266, + "loss": 2.6696, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7959206104278564, + "rewards/margins": -0.6916060447692871, + "rewards/rejected": 3.4875266551971436, + "step": 1921 + }, + { + "epoch": 0.31, + "learning_rate": 9.589691274961556e-06, + "logits/chosen": -0.21986088156700134, + "logits/rejected": -0.2214536964893341, + "logps/chosen": -3.8068342208862305, + "logps/rejected": -1.537777304649353, + "loss": 0.818, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.44074782729148865, + "rewards/margins": -0.11502179503440857, + "rewards/rejected": 0.5557696223258972, + "step": 1922 + }, + { + "epoch": 0.31, + "learning_rate": 9.589169721277179e-06, + "logits/chosen": -1.2857787609100342, + "logits/rejected": -1.1717162132263184, + "logps/chosen": -159.8140106201172, + "logps/rejected": -59.78483581542969, + "loss": 0.177, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3286101818084717, + "rewards/margins": 0.8851417303085327, + "rewards/rejected": 1.443468451499939, + "step": 1923 + }, + { + "epoch": 0.31, + "learning_rate": 9.588647850524595e-06, + "logits/chosen": -1.1195917129516602, + "logits/rejected": -1.064273476600647, + "logps/chosen": -86.37066650390625, + "logps/rejected": -75.83028411865234, + "loss": 0.0541, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2871086597442627, + "rewards/margins": 2.364981174468994, + "rewards/rejected": -0.07787247002124786, + "step": 1924 + }, + { + "epoch": 0.31, + "learning_rate": 9.588125662739865e-06, + "logits/chosen": -0.8609939217567444, + "logits/rejected": -0.8147895336151123, + "logps/chosen": -103.19904327392578, + "logps/rejected": -80.0713882446289, + "loss": 0.4996, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.333571910858154, + "rewards/margins": 2.207676887512207, + "rewards/rejected": 2.1258950233459473, + "step": 1925 + }, + { + "epoch": 0.31, + "learning_rate": 9.587603157959064e-06, + "logits/chosen": -1.1106867790222168, + "logits/rejected": -0.9223323464393616, + "logps/chosen": -93.67008209228516, + "logps/rejected": -67.91856384277344, + "loss": 0.3882, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.108574867248535, + "rewards/margins": 1.494330644607544, + "rewards/rejected": 2.614244222640991, + "step": 1926 + }, + { + "epoch": 0.31, + "learning_rate": 9.587080336218292e-06, + "logits/chosen": -0.7758170366287231, + "logits/rejected": -0.6781767010688782, + "logps/chosen": -53.22779846191406, + "logps/rejected": -55.080223083496094, + "loss": 0.4359, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5893523693084717, + "rewards/margins": 1.7979201078414917, + "rewards/rejected": 1.79143226146698, + "step": 1927 + }, + { + "epoch": 0.31, + "learning_rate": 9.586557197553674e-06, + "logits/chosen": -0.5400997400283813, + "logits/rejected": -0.5432745814323425, + "logps/chosen": -46.54631042480469, + "logps/rejected": -40.5108642578125, + "loss": 0.5494, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.133530855178833, + "rewards/margins": 0.06085240840911865, + "rewards/rejected": 1.0726784467697144, + "step": 1928 + }, + { + "epoch": 0.31, + "learning_rate": 9.586033742001351e-06, + "logits/chosen": -1.1170743703842163, + "logits/rejected": -1.1141691207885742, + "logps/chosen": -72.99773406982422, + "logps/rejected": -55.09015655517578, + "loss": 0.649, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8908805847167969, + "rewards/margins": -0.910102128982544, + "rewards/rejected": 2.800982713699341, + "step": 1929 + }, + { + "epoch": 0.31, + "learning_rate": 9.585509969597491e-06, + "logits/chosen": -0.7608725428581238, + "logits/rejected": -0.7587602138519287, + "logps/chosen": -6.703718185424805, + "logps/rejected": -16.909629821777344, + "loss": 0.6477, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2627691328525543, + "rewards/margins": -0.09981822967529297, + "rewards/rejected": 0.3625873625278473, + "step": 1930 + }, + { + "epoch": 0.31, + "learning_rate": 9.584985880378279e-06, + "logits/chosen": -0.7986147999763489, + "logits/rejected": -0.8273096680641174, + "logps/chosen": -56.422245025634766, + "logps/rejected": -78.06822967529297, + "loss": 3.2537, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3361027240753174, + "rewards/margins": -1.2078356742858887, + "rewards/rejected": 3.543938398361206, + "step": 1931 + }, + { + "epoch": 0.31, + "learning_rate": 9.584461474379927e-06, + "logits/chosen": -0.9520124793052673, + "logits/rejected": -0.8217063546180725, + "logps/chosen": -58.519561767578125, + "logps/rejected": -29.428003311157227, + "loss": 0.0752, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.121904134750366, + "rewards/margins": 2.1719233989715576, + "rewards/rejected": -0.050019264221191406, + "step": 1932 + }, + { + "epoch": 0.31, + "learning_rate": 9.583936751638667e-06, + "logits/chosen": -0.9715184569358826, + "logits/rejected": -0.9528985023498535, + "logps/chosen": -82.67473602294922, + "logps/rejected": -143.30853271484375, + "loss": 1.8439, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5774147510528564, + "rewards/margins": -0.5669074058532715, + "rewards/rejected": 2.144322156906128, + "step": 1933 + }, + { + "epoch": 0.31, + "learning_rate": 9.583411712190749e-06, + "logits/chosen": -1.2510226964950562, + "logits/rejected": -1.3449698686599731, + "logps/chosen": -102.39098358154297, + "logps/rejected": -152.4466094970703, + "loss": 1.8503, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.577517032623291, + "rewards/margins": -3.6336541175842285, + "rewards/rejected": 6.2111711502075195, + "step": 1934 + }, + { + "epoch": 0.31, + "learning_rate": 9.582886356072452e-06, + "logits/chosen": -0.8716070055961609, + "logits/rejected": -0.8744868636131287, + "logps/chosen": -8.505415916442871, + "logps/rejected": -4.044000148773193, + "loss": 1.2743, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.10075979679822922, + "rewards/margins": -0.2795925438404083, + "rewards/rejected": 0.38035234808921814, + "step": 1935 + }, + { + "epoch": 0.31, + "learning_rate": 9.58236068332007e-06, + "logits/chosen": -1.1435837745666504, + "logits/rejected": -1.0914530754089355, + "logps/chosen": -132.38739013671875, + "logps/rejected": -102.6723403930664, + "loss": 0.8634, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4776626825332642, + "rewards/margins": -1.3677726984024048, + "rewards/rejected": 2.845435380935669, + "step": 1936 + }, + { + "epoch": 0.31, + "learning_rate": 9.581834693969925e-06, + "logits/chosen": -0.6811830997467041, + "logits/rejected": -0.6413934826850891, + "logps/chosen": -50.222957611083984, + "logps/rejected": -20.66710090637207, + "loss": 0.1677, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3431061506271362, + "rewards/margins": 1.0492304563522339, + "rewards/rejected": 0.29387569427490234, + "step": 1937 + }, + { + "epoch": 0.31, + "learning_rate": 9.581308388058354e-06, + "logits/chosen": -1.0213136672973633, + "logits/rejected": -1.0132973194122314, + "logps/chosen": -104.61041259765625, + "logps/rejected": -98.38600158691406, + "loss": 2.0677, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.108355760574341, + "rewards/margins": -1.0048034191131592, + "rewards/rejected": 3.1131591796875, + "step": 1938 + }, + { + "epoch": 0.31, + "learning_rate": 9.580781765621725e-06, + "logits/chosen": -1.073721170425415, + "logits/rejected": -0.9812162518501282, + "logps/chosen": -83.02959442138672, + "logps/rejected": -56.33954620361328, + "loss": 0.1008, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5922601222991943, + "rewards/margins": 2.1196184158325195, + "rewards/rejected": 0.4726417660713196, + "step": 1939 + }, + { + "epoch": 0.31, + "learning_rate": 9.580254826696418e-06, + "logits/chosen": -1.2295140027999878, + "logits/rejected": -1.190375804901123, + "logps/chosen": -60.544151306152344, + "logps/rejected": -110.34896087646484, + "loss": 0.4907, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1128928661346436, + "rewards/margins": 0.3874030113220215, + "rewards/rejected": 1.725489854812622, + "step": 1940 + }, + { + "epoch": 0.32, + "learning_rate": 9.579727571318842e-06, + "logits/chosen": -0.864062488079071, + "logits/rejected": -0.864062488079071, + "logps/chosen": -65.07624816894531, + "logps/rejected": -65.07624816894531, + "loss": 0.8129, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2946860790252686, + "rewards/margins": 0.0, + "rewards/rejected": 2.2946860790252686, + "step": 1941 + }, + { + "epoch": 0.32, + "learning_rate": 9.579199999525424e-06, + "logits/chosen": -1.1290833950042725, + "logits/rejected": -1.0015918016433716, + "logps/chosen": -76.79035186767578, + "logps/rejected": -34.09321975708008, + "loss": 0.0504, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9414665699005127, + "rewards/margins": 2.307858943939209, + "rewards/rejected": 0.6336075067520142, + "step": 1942 + }, + { + "epoch": 0.32, + "learning_rate": 9.578672111352615e-06, + "logits/chosen": -0.8496857285499573, + "logits/rejected": -0.8930524587631226, + "logps/chosen": -155.0688018798828, + "logps/rejected": -74.65580749511719, + "loss": 0.9295, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.207777500152588, + "rewards/margins": 1.8265762329101562, + "rewards/rejected": 2.3812012672424316, + "step": 1943 + }, + { + "epoch": 0.32, + "learning_rate": 9.578143906836887e-06, + "logits/chosen": -1.004848599433899, + "logits/rejected": -0.9620434045791626, + "logps/chosen": -67.87590026855469, + "logps/rejected": -74.91458129882812, + "loss": 0.7898, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8022133111953735, + "rewards/margins": -0.3553260564804077, + "rewards/rejected": 2.1575393676757812, + "step": 1944 + }, + { + "epoch": 0.32, + "learning_rate": 9.577615386014734e-06, + "logits/chosen": -0.7762975096702576, + "logits/rejected": -0.8103821873664856, + "logps/chosen": -55.18522644042969, + "logps/rejected": -78.68618774414062, + "loss": 0.2928, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.662055253982544, + "rewards/margins": 0.3091468811035156, + "rewards/rejected": 1.3529083728790283, + "step": 1945 + }, + { + "epoch": 0.32, + "learning_rate": 9.577086548922671e-06, + "logits/chosen": -0.9575325846672058, + "logits/rejected": -1.0243529081344604, + "logps/chosen": -102.11770629882812, + "logps/rejected": -114.65623474121094, + "loss": 3.1007, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.028179883956909, + "rewards/margins": -3.5436785221099854, + "rewards/rejected": 5.5718584060668945, + "step": 1946 + }, + { + "epoch": 0.32, + "learning_rate": 9.576557395597237e-06, + "logits/chosen": -0.811108410358429, + "logits/rejected": -0.7383823990821838, + "logps/chosen": -41.01459884643555, + "logps/rejected": -25.562942504882812, + "loss": 1.1862, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7775276303291321, + "rewards/margins": 0.3964384198188782, + "rewards/rejected": 0.3810892105102539, + "step": 1947 + }, + { + "epoch": 0.32, + "learning_rate": 9.57602792607499e-06, + "logits/chosen": -0.8834720849990845, + "logits/rejected": -1.0003917217254639, + "logps/chosen": -110.95700073242188, + "logps/rejected": -94.567626953125, + "loss": 2.0514, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4914337396621704, + "rewards/margins": -4.07057523727417, + "rewards/rejected": 5.562008857727051, + "step": 1948 + }, + { + "epoch": 0.32, + "learning_rate": 9.575498140392512e-06, + "logits/chosen": -0.7910417914390564, + "logits/rejected": -0.7912321090698242, + "logps/chosen": -4.031567573547363, + "logps/rejected": -3.237114667892456, + "loss": 0.3788, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2090894728899002, + "rewards/margins": -0.05585993826389313, + "rewards/rejected": 0.26494941115379333, + "step": 1949 + }, + { + "epoch": 0.32, + "learning_rate": 9.574968038586408e-06, + "logits/chosen": -0.8810067772865295, + "logits/rejected": -0.8434237837791443, + "logps/chosen": -34.98801803588867, + "logps/rejected": -9.534332275390625, + "loss": 0.3988, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6108849048614502, + "rewards/margins": 0.5461975336074829, + "rewards/rejected": 1.0646873712539673, + "step": 1950 + }, + { + "epoch": 0.32, + "learning_rate": 9.5744376206933e-06, + "logits/chosen": -0.9639565348625183, + "logits/rejected": -0.9677377939224243, + "logps/chosen": -53.822303771972656, + "logps/rejected": -64.85049438476562, + "loss": 1.3949, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.048011064529419, + "rewards/margins": -0.6235495805740356, + "rewards/rejected": 1.6715606451034546, + "step": 1951 + }, + { + "epoch": 0.32, + "learning_rate": 9.573906886749836e-06, + "logits/chosen": -0.9902470707893372, + "logits/rejected": -0.973033607006073, + "logps/chosen": -145.53369140625, + "logps/rejected": -70.43031311035156, + "loss": 0.0237, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.9265899658203125, + "rewards/margins": 3.957880973815918, + "rewards/rejected": 1.968708872795105, + "step": 1952 + }, + { + "epoch": 0.32, + "learning_rate": 9.573375836792684e-06, + "logits/chosen": -1.1310250759124756, + "logits/rejected": -1.0617282390594482, + "logps/chosen": -141.58180236816406, + "logps/rejected": -52.4283447265625, + "loss": 0.1324, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.122305393218994, + "rewards/margins": 1.2397087812423706, + "rewards/rejected": 0.8825966119766235, + "step": 1953 + }, + { + "epoch": 0.32, + "learning_rate": 9.572844470858537e-06, + "logits/chosen": -0.8882066011428833, + "logits/rejected": -0.948701798915863, + "logps/chosen": -36.72184753417969, + "logps/rejected": -69.14373779296875, + "loss": 0.3833, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7282062768936157, + "rewards/margins": -0.11691701412200928, + "rewards/rejected": 1.845123291015625, + "step": 1954 + }, + { + "epoch": 0.32, + "learning_rate": 9.572312788984105e-06, + "logits/chosen": -0.9749308228492737, + "logits/rejected": -0.9633674621582031, + "logps/chosen": -83.95133209228516, + "logps/rejected": -144.32797241210938, + "loss": 1.5669, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8531311750411987, + "rewards/margins": 0.2570892572402954, + "rewards/rejected": 1.5960419178009033, + "step": 1955 + }, + { + "epoch": 0.32, + "learning_rate": 9.571780791206123e-06, + "logits/chosen": -1.0833408832550049, + "logits/rejected": -0.9960689544677734, + "logps/chosen": -174.00494384765625, + "logps/rejected": -87.27799987792969, + "loss": 0.5317, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.061328411102295, + "rewards/margins": 4.291032791137695, + "rewards/rejected": 2.7702958583831787, + "step": 1956 + }, + { + "epoch": 0.32, + "learning_rate": 9.571248477561349e-06, + "logits/chosen": -0.7298144698143005, + "logits/rejected": -0.7739142775535583, + "logps/chosen": -69.79457092285156, + "logps/rejected": -59.41455841064453, + "loss": 1.0998, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.232557773590088, + "rewards/margins": 0.2919884920120239, + "rewards/rejected": 1.940569281578064, + "step": 1957 + }, + { + "epoch": 0.32, + "learning_rate": 9.570715848086555e-06, + "logits/chosen": -1.1732624769210815, + "logits/rejected": -1.2618099451065063, + "logps/chosen": -18.55609130859375, + "logps/rejected": -80.56706237792969, + "loss": 2.4349, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5110630393028259, + "rewards/margins": -3.978148937225342, + "rewards/rejected": 4.4892120361328125, + "step": 1958 + }, + { + "epoch": 0.32, + "learning_rate": 9.570182902818545e-06, + "logits/chosen": -0.8690150380134583, + "logits/rejected": -0.7347958087921143, + "logps/chosen": -89.0523681640625, + "logps/rejected": -27.027069091796875, + "loss": 0.5215, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.60784912109375, + "rewards/margins": 1.3695323467254639, + "rewards/rejected": 0.23831672966480255, + "step": 1959 + }, + { + "epoch": 0.32, + "learning_rate": 9.569649641794141e-06, + "logits/chosen": -0.7907747626304626, + "logits/rejected": -0.7551597952842712, + "logps/chosen": -84.93231201171875, + "logps/rejected": -99.92330932617188, + "loss": 0.3511, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1549232006073, + "rewards/margins": 0.2055426836013794, + "rewards/rejected": 1.9493805170059204, + "step": 1960 + }, + { + "epoch": 0.32, + "learning_rate": 9.569116065050186e-06, + "logits/chosen": -1.1754287481307983, + "logits/rejected": -1.1754287481307983, + "logps/chosen": -53.17645263671875, + "logps/rejected": -53.17645263671875, + "loss": 0.6613, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.660930633544922, + "rewards/margins": 0.0, + "rewards/rejected": 2.660930633544922, + "step": 1961 + }, + { + "epoch": 0.32, + "learning_rate": 9.568582172623544e-06, + "logits/chosen": -0.8692667484283447, + "logits/rejected": -0.9959890246391296, + "logps/chosen": -69.30048370361328, + "logps/rejected": -90.37960815429688, + "loss": 2.2193, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.807637929916382, + "rewards/margins": -4.307680130004883, + "rewards/rejected": 7.115318298339844, + "step": 1962 + }, + { + "epoch": 0.32, + "learning_rate": 9.568047964551102e-06, + "logits/chosen": -0.7773363590240479, + "logits/rejected": -0.7688270211219788, + "logps/chosen": -88.90119934082031, + "logps/rejected": -44.23033142089844, + "loss": 0.754, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0301613807678223, + "rewards/margins": -0.23027348518371582, + "rewards/rejected": 2.260434865951538, + "step": 1963 + }, + { + "epoch": 0.32, + "learning_rate": 9.567513440869768e-06, + "logits/chosen": -1.2771551609039307, + "logits/rejected": -1.3014791011810303, + "logps/chosen": -124.51346588134766, + "logps/rejected": -120.09996032714844, + "loss": 2.4511, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.300551891326904, + "rewards/margins": 0.049463748931884766, + "rewards/rejected": 4.2510881423950195, + "step": 1964 + }, + { + "epoch": 0.32, + "learning_rate": 9.566978601616474e-06, + "logits/chosen": -0.9180648922920227, + "logits/rejected": -0.7832491397857666, + "logps/chosen": -62.44327163696289, + "logps/rejected": -105.87480163574219, + "loss": 0.5962, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8940868377685547, + "rewards/margins": -0.027872085571289062, + "rewards/rejected": 1.9219589233398438, + "step": 1965 + }, + { + "epoch": 0.32, + "learning_rate": 9.566443446828172e-06, + "logits/chosen": -0.8824797868728638, + "logits/rejected": -0.9301280379295349, + "logps/chosen": -87.16766357421875, + "logps/rejected": -80.57199096679688, + "loss": 0.8361, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9316879510879517, + "rewards/margins": -1.3256462812423706, + "rewards/rejected": 3.2573342323303223, + "step": 1966 + }, + { + "epoch": 0.32, + "learning_rate": 9.565907976541835e-06, + "logits/chosen": -0.9136941432952881, + "logits/rejected": -0.7718835473060608, + "logps/chosen": -93.6006088256836, + "logps/rejected": -27.583744049072266, + "loss": 0.7556, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0832306146621704, + "rewards/margins": 1.095576524734497, + "rewards/rejected": -0.012345886789262295, + "step": 1967 + }, + { + "epoch": 0.32, + "learning_rate": 9.565372190794461e-06, + "logits/chosen": -1.0558395385742188, + "logits/rejected": -1.0068339109420776, + "logps/chosen": -52.612892150878906, + "logps/rejected": -73.96964263916016, + "loss": 0.6705, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4506843090057373, + "rewards/margins": 0.8219970464706421, + "rewards/rejected": 1.6286872625350952, + "step": 1968 + }, + { + "epoch": 0.32, + "learning_rate": 9.564836089623066e-06, + "logits/chosen": -0.9866820573806763, + "logits/rejected": -0.6587426066398621, + "logps/chosen": -72.99637603759766, + "logps/rejected": -94.86377716064453, + "loss": 0.6909, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.665759325027466, + "rewards/margins": 0.4691169261932373, + "rewards/rejected": 2.1966423988342285, + "step": 1969 + }, + { + "epoch": 0.32, + "learning_rate": 9.56429967306469e-06, + "logits/chosen": -1.2012152671813965, + "logits/rejected": -1.1588183641433716, + "logps/chosen": -63.9141731262207, + "logps/rejected": -72.91442108154297, + "loss": 1.8315, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3754665851593018, + "rewards/margins": 0.09233129024505615, + "rewards/rejected": 1.2831352949142456, + "step": 1970 + }, + { + "epoch": 0.32, + "learning_rate": 9.563762941156396e-06, + "logits/chosen": -0.4655351936817169, + "logits/rejected": -0.4675504267215729, + "logps/chosen": -1.7790672779083252, + "logps/rejected": -2.515984535217285, + "loss": 1.1946, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.25428876280784607, + "rewards/margins": -0.18172284960746765, + "rewards/rejected": 0.4360116124153137, + "step": 1971 + }, + { + "epoch": 0.32, + "learning_rate": 9.563225893935264e-06, + "logits/chosen": -0.6471815705299377, + "logits/rejected": -0.41251322627067566, + "logps/chosen": -50.05758285522461, + "logps/rejected": -12.793304443359375, + "loss": 0.9304, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0333211421966553, + "rewards/margins": 2.2863845825195312, + "rewards/rejected": 0.7469364404678345, + "step": 1972 + }, + { + "epoch": 0.32, + "learning_rate": 9.5626885314384e-06, + "logits/chosen": -0.5067535638809204, + "logits/rejected": -0.5238974094390869, + "logps/chosen": -1.1815507411956787, + "logps/rejected": -22.337663650512695, + "loss": 0.3336, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3691524267196655, + "rewards/margins": 0.052809298038482666, + "rewards/rejected": 0.31634312868118286, + "step": 1973 + }, + { + "epoch": 0.32, + "learning_rate": 9.562150853702931e-06, + "logits/chosen": -0.6949824690818787, + "logits/rejected": -0.8260294198989868, + "logps/chosen": -127.96453094482422, + "logps/rejected": -80.40895080566406, + "loss": 0.5586, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6798501014709473, + "rewards/margins": 0.8943428993225098, + "rewards/rejected": 2.7855072021484375, + "step": 1974 + }, + { + "epoch": 0.32, + "learning_rate": 9.561612860766007e-06, + "logits/chosen": -0.7329294681549072, + "logits/rejected": -0.729008138179779, + "logps/chosen": -63.475242614746094, + "logps/rejected": -56.549156188964844, + "loss": 0.4756, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.359259843826294, + "rewards/margins": -0.2566497325897217, + "rewards/rejected": 2.6159095764160156, + "step": 1975 + }, + { + "epoch": 0.32, + "learning_rate": 9.561074552664794e-06, + "logits/chosen": -1.0306246280670166, + "logits/rejected": -0.9198290705680847, + "logps/chosen": -64.70637512207031, + "logps/rejected": -80.25080871582031, + "loss": 0.8954, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.11661696434021, + "rewards/margins": -0.3844420909881592, + "rewards/rejected": 2.501059055328369, + "step": 1976 + }, + { + "epoch": 0.32, + "learning_rate": 9.56053592943649e-06, + "logits/chosen": -0.9656286239624023, + "logits/rejected": -1.1285377740859985, + "logps/chosen": -65.5910415649414, + "logps/rejected": -52.18547439575195, + "loss": 0.5837, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8538109064102173, + "rewards/margins": -0.7702869176864624, + "rewards/rejected": 2.6240978240966797, + "step": 1977 + }, + { + "epoch": 0.32, + "learning_rate": 9.559996991118304e-06, + "logits/chosen": -1.143061876296997, + "logits/rejected": -1.143061876296997, + "logps/chosen": -47.246856689453125, + "logps/rejected": -47.246856689453125, + "loss": 0.5435, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6372524499893188, + "rewards/margins": 0.0, + "rewards/rejected": 1.6372524499893188, + "step": 1978 + }, + { + "epoch": 0.32, + "learning_rate": 9.559457737747474e-06, + "logits/chosen": -1.1582202911376953, + "logits/rejected": -1.1582202911376953, + "logps/chosen": -54.28007507324219, + "logps/rejected": -54.28007507324219, + "loss": 0.6513, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6230437755584717, + "rewards/margins": 0.0, + "rewards/rejected": 2.6230437755584717, + "step": 1979 + }, + { + "epoch": 0.32, + "learning_rate": 9.558918169361253e-06, + "logits/chosen": -0.36152300238609314, + "logits/rejected": -0.36152300238609314, + "logps/chosen": -40.426841735839844, + "logps/rejected": -40.426841735839844, + "loss": 0.422, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6783649325370789, + "rewards/margins": 0.0, + "rewards/rejected": 0.6783649325370789, + "step": 1980 + }, + { + "epoch": 0.32, + "learning_rate": 9.558378285996925e-06, + "logits/chosen": -0.9269315004348755, + "logits/rejected": -0.9080124497413635, + "logps/chosen": -38.137916564941406, + "logps/rejected": -30.49374771118164, + "loss": 0.4348, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7439380884170532, + "rewards/margins": -0.29196178913116455, + "rewards/rejected": 2.0358998775482178, + "step": 1981 + }, + { + "epoch": 0.32, + "learning_rate": 9.55783808769179e-06, + "logits/chosen": -1.0339975357055664, + "logits/rejected": -1.0289994478225708, + "logps/chosen": -71.76907348632812, + "logps/rejected": -46.391212463378906, + "loss": 0.2422, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.437701463699341, + "rewards/margins": 0.6935760974884033, + "rewards/rejected": 1.7441253662109375, + "step": 1982 + }, + { + "epoch": 0.32, + "learning_rate": 9.55729757448317e-06, + "logits/chosen": -1.1523418426513672, + "logits/rejected": -1.1956292390823364, + "logps/chosen": -158.8142852783203, + "logps/rejected": -77.84463500976562, + "loss": 1.2257, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.779210090637207, + "rewards/margins": 2.9100115299224854, + "rewards/rejected": 2.8691985607147217, + "step": 1983 + }, + { + "epoch": 0.32, + "learning_rate": 9.556756746408409e-06, + "logits/chosen": -0.9103783369064331, + "logits/rejected": -0.9006320238113403, + "logps/chosen": -46.234031677246094, + "logps/rejected": -71.26390075683594, + "loss": 0.9183, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.228379011154175, + "rewards/margins": 0.6197333335876465, + "rewards/rejected": 1.6086456775665283, + "step": 1984 + }, + { + "epoch": 0.32, + "learning_rate": 9.556215603504874e-06, + "logits/chosen": -1.2254582643508911, + "logits/rejected": -1.0951528549194336, + "logps/chosen": -118.35861206054688, + "logps/rejected": -147.48736572265625, + "loss": 1.1141, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.125210762023926, + "rewards/margins": 1.2700181007385254, + "rewards/rejected": 3.8551926612854004, + "step": 1985 + }, + { + "epoch": 0.32, + "learning_rate": 9.55567414580995e-06, + "logits/chosen": -0.94329833984375, + "logits/rejected": -0.7434968948364258, + "logps/chosen": -105.18702697753906, + "logps/rejected": -16.02753448486328, + "loss": 0.368, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.129359483718872, + "rewards/margins": 1.9134355783462524, + "rewards/rejected": 0.21592389047145844, + "step": 1986 + }, + { + "epoch": 0.32, + "learning_rate": 9.555132373361051e-06, + "logits/chosen": -0.9555892944335938, + "logits/rejected": -1.0096622705459595, + "logps/chosen": -73.31571197509766, + "logps/rejected": -108.4786376953125, + "loss": 1.3707, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.393926978111267, + "rewards/margins": -0.60198974609375, + "rewards/rejected": 1.995916724205017, + "step": 1987 + }, + { + "epoch": 0.32, + "learning_rate": 9.554590286195603e-06, + "logits/chosen": -1.126166582107544, + "logits/rejected": -1.0951018333435059, + "logps/chosen": -86.14844512939453, + "logps/rejected": -125.38374328613281, + "loss": 0.7493, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.088999271392822, + "rewards/margins": 0.5457878112792969, + "rewards/rejected": 5.543211460113525, + "step": 1988 + }, + { + "epoch": 0.32, + "learning_rate": 9.554047884351065e-06, + "logits/chosen": -1.1006171703338623, + "logits/rejected": -1.113518238067627, + "logps/chosen": -63.07542037963867, + "logps/rejected": -102.62284851074219, + "loss": 1.33, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.910800576210022, + "rewards/margins": -1.6331371068954468, + "rewards/rejected": 3.5439376831054688, + "step": 1989 + }, + { + "epoch": 0.32, + "learning_rate": 9.55350516786491e-06, + "logits/chosen": -0.7667381763458252, + "logits/rejected": -0.8422524333000183, + "logps/chosen": -31.36586570739746, + "logps/rejected": -108.70768737792969, + "loss": 1.3255, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.024927854537964, + "rewards/margins": -1.5564870834350586, + "rewards/rejected": 3.5814149379730225, + "step": 1990 + }, + { + "epoch": 0.32, + "learning_rate": 9.55296213677463e-06, + "logits/chosen": -0.9772254824638367, + "logits/rejected": -0.9772254824638367, + "logps/chosen": -30.0177059173584, + "logps/rejected": -30.0177059173584, + "loss": 0.7668, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6019556522369385, + "rewards/margins": 0.0, + "rewards/rejected": 2.6019556522369385, + "step": 1991 + }, + { + "epoch": 0.32, + "learning_rate": 9.552418791117747e-06, + "logits/chosen": -0.8487996459007263, + "logits/rejected": -0.8852447867393494, + "logps/chosen": -73.36356353759766, + "logps/rejected": -79.39686584472656, + "loss": 2.2078, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9280418157577515, + "rewards/margins": -0.9622734785079956, + "rewards/rejected": 2.890315294265747, + "step": 1992 + }, + { + "epoch": 0.32, + "learning_rate": 9.551875130931804e-06, + "logits/chosen": -0.8179346919059753, + "logits/rejected": -0.8179346919059753, + "logps/chosen": -1.7041492462158203, + "logps/rejected": -1.7041492462158203, + "loss": 0.9589, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3186572194099426, + "rewards/margins": 0.0, + "rewards/rejected": 0.3186572194099426, + "step": 1993 + }, + { + "epoch": 0.32, + "learning_rate": 9.551331156254358e-06, + "logits/chosen": -1.0467277765274048, + "logits/rejected": -0.8929610848426819, + "logps/chosen": -140.87411499023438, + "logps/rejected": -54.6649284362793, + "loss": 0.1875, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8460617065429688, + "rewards/margins": 1.199330449104309, + "rewards/rejected": 1.6467312574386597, + "step": 1994 + }, + { + "epoch": 0.32, + "learning_rate": 9.550786867122993e-06, + "logits/chosen": -0.8605800867080688, + "logits/rejected": -0.7628040909767151, + "logps/chosen": -117.46337127685547, + "logps/rejected": -64.49870300292969, + "loss": 1.9152, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.305031657218933, + "rewards/margins": 0.24077224731445312, + "rewards/rejected": 1.06425940990448, + "step": 1995 + }, + { + "epoch": 0.32, + "learning_rate": 9.550242263575318e-06, + "logits/chosen": -0.4180781841278076, + "logits/rejected": -0.4182344079017639, + "logps/chosen": -4.235871315002441, + "logps/rejected": -1.6303296089172363, + "loss": 0.3375, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3474585711956024, + "rewards/margins": 0.04203951358795166, + "rewards/rejected": 0.30541905760765076, + "step": 1996 + }, + { + "epoch": 0.32, + "learning_rate": 9.549697345648956e-06, + "logits/chosen": -1.1133824586868286, + "logits/rejected": -1.1312384605407715, + "logps/chosen": -126.59028625488281, + "logps/rejected": -143.71498107910156, + "loss": 2.7261, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.453495740890503, + "rewards/margins": -5.165069580078125, + "rewards/rejected": 7.618565559387207, + "step": 1997 + }, + { + "epoch": 0.32, + "learning_rate": 9.549152113381557e-06, + "logits/chosen": -0.8333654999732971, + "logits/rejected": -0.7824683785438538, + "logps/chosen": -59.36801528930664, + "logps/rejected": -11.286907196044922, + "loss": 1.1575, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.781925678253174, + "rewards/margins": 1.6836292743682861, + "rewards/rejected": 1.0982964038848877, + "step": 1998 + }, + { + "epoch": 0.32, + "learning_rate": 9.548606566810792e-06, + "logits/chosen": -1.0947009325027466, + "logits/rejected": -1.05299711227417, + "logps/chosen": -114.24803924560547, + "logps/rejected": -63.721797943115234, + "loss": 0.2598, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.60207599401474, + "rewards/margins": 0.3912937641143799, + "rewards/rejected": 0.2107822448015213, + "step": 1999 + }, + { + "epoch": 0.32, + "learning_rate": 9.548060705974353e-06, + "logits/chosen": -0.7708441615104675, + "logits/rejected": -0.8801659941673279, + "logps/chosen": -108.78347778320312, + "logps/rejected": -105.61238098144531, + "loss": 1.2124, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4440064430236816, + "rewards/margins": -2.0533509254455566, + "rewards/rejected": 4.497357368469238, + "step": 2000 + }, + { + "epoch": 0.32, + "learning_rate": 9.547514530909951e-06, + "logits/chosen": -0.828454315662384, + "logits/rejected": -0.7942465543746948, + "logps/chosen": -28.58135986328125, + "logps/rejected": -91.2127914428711, + "loss": 1.3326, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3645942211151123, + "rewards/margins": -1.333017110824585, + "rewards/rejected": 3.6976113319396973, + "step": 2001 + }, + { + "epoch": 0.32, + "learning_rate": 9.546968041655326e-06, + "logits/chosen": -1.061896800994873, + "logits/rejected": -1.0669845342636108, + "logps/chosen": -59.25636672973633, + "logps/rejected": -31.3780460357666, + "loss": 0.6845, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2085087299346924, + "rewards/margins": -0.8100244998931885, + "rewards/rejected": 2.018533229827881, + "step": 2002 + }, + { + "epoch": 0.33, + "learning_rate": 9.546421238248233e-06, + "logits/chosen": -1.1335082054138184, + "logits/rejected": -1.1039907932281494, + "logps/chosen": -48.05265426635742, + "logps/rejected": -63.8779182434082, + "loss": 0.2876, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8470265865325928, + "rewards/margins": 0.35537052154541016, + "rewards/rejected": 2.4916560649871826, + "step": 2003 + }, + { + "epoch": 0.33, + "learning_rate": 9.54587412072645e-06, + "logits/chosen": -1.1851803064346313, + "logits/rejected": -1.1851803064346313, + "logps/chosen": -105.62940216064453, + "logps/rejected": -105.62940216064453, + "loss": 0.3508, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3498611450195312, + "rewards/margins": 0.0, + "rewards/rejected": 1.3498611450195312, + "step": 2004 + }, + { + "epoch": 0.33, + "learning_rate": 9.54532668912778e-06, + "logits/chosen": -1.0906391143798828, + "logits/rejected": -1.083022952079773, + "logps/chosen": -75.89057922363281, + "logps/rejected": -64.42626953125, + "loss": 0.2821, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5682953596115112, + "rewards/margins": 0.39008796215057373, + "rewards/rejected": 1.1782073974609375, + "step": 2005 + }, + { + "epoch": 0.33, + "learning_rate": 9.544778943490042e-06, + "logits/chosen": -1.1809982061386108, + "logits/rejected": -1.118589997291565, + "logps/chosen": -97.83998107910156, + "logps/rejected": -50.478092193603516, + "loss": 0.351, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.357118129730225, + "rewards/margins": 2.504368782043457, + "rewards/rejected": 1.852749228477478, + "step": 2006 + }, + { + "epoch": 0.33, + "learning_rate": 9.544230883851084e-06, + "logits/chosen": -0.6987537145614624, + "logits/rejected": -0.6877190470695496, + "logps/chosen": -16.68389129638672, + "logps/rejected": -2.0728743076324463, + "loss": 0.2998, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5438076257705688, + "rewards/margins": 0.1991465985774994, + "rewards/rejected": 0.34466102719306946, + "step": 2007 + }, + { + "epoch": 0.33, + "learning_rate": 9.54368251024877e-06, + "logits/chosen": -0.7930211424827576, + "logits/rejected": -0.6884154081344604, + "logps/chosen": -42.70344543457031, + "logps/rejected": -76.31192016601562, + "loss": 0.5397, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9034286737442017, + "rewards/margins": -0.6037460565567017, + "rewards/rejected": 2.5071747303009033, + "step": 2008 + }, + { + "epoch": 0.33, + "learning_rate": 9.543133822720986e-06, + "logits/chosen": -0.6464182734489441, + "logits/rejected": -0.7150682806968689, + "logps/chosen": -57.07093811035156, + "logps/rejected": -85.0073013305664, + "loss": 0.4787, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6682541370391846, + "rewards/margins": 2.014286994934082, + "rewards/rejected": 1.653967261314392, + "step": 2009 + }, + { + "epoch": 0.33, + "learning_rate": 9.542584821305643e-06, + "logits/chosen": -0.9526567459106445, + "logits/rejected": -1.0029369592666626, + "logps/chosen": -75.17374420166016, + "logps/rejected": -154.92068481445312, + "loss": 0.2767, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2637977600097656, + "rewards/margins": 0.5691810846328735, + "rewards/rejected": 1.694616675376892, + "step": 2010 + }, + { + "epoch": 0.33, + "learning_rate": 9.542035506040673e-06, + "logits/chosen": -1.128049612045288, + "logits/rejected": -0.9420285224914551, + "logps/chosen": -143.72802734375, + "logps/rejected": -16.7340145111084, + "loss": 0.0921, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.415205478668213, + "rewards/margins": 2.2435216903686523, + "rewards/rejected": 0.1716838926076889, + "step": 2011 + }, + { + "epoch": 0.33, + "learning_rate": 9.541485876964024e-06, + "logits/chosen": -0.8989336490631104, + "logits/rejected": -0.8567647933959961, + "logps/chosen": -103.44586181640625, + "logps/rejected": -60.34420394897461, + "loss": 0.2874, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.769580125808716, + "rewards/margins": 0.6481845378875732, + "rewards/rejected": 2.1213955879211426, + "step": 2012 + }, + { + "epoch": 0.33, + "learning_rate": 9.540935934113673e-06, + "logits/chosen": -1.1696828603744507, + "logits/rejected": -1.212782621383667, + "logps/chosen": -93.1681137084961, + "logps/rejected": -88.79756927490234, + "loss": 1.315, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7751388549804688, + "rewards/margins": -1.269599199295044, + "rewards/rejected": 3.0447380542755127, + "step": 2013 + }, + { + "epoch": 0.33, + "learning_rate": 9.540385677527617e-06, + "logits/chosen": -0.7513128519058228, + "logits/rejected": -0.7513128519058228, + "logps/chosen": -0.8238868713378906, + "logps/rejected": -0.8238868713378906, + "loss": 0.5888, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.13860325515270233, + "rewards/margins": 0.0, + "rewards/rejected": 0.13860325515270233, + "step": 2014 + }, + { + "epoch": 0.33, + "learning_rate": 9.539835107243872e-06, + "logits/chosen": -0.9029184579849243, + "logits/rejected": -0.8713579773902893, + "logps/chosen": -70.68299865722656, + "logps/rejected": -37.18869400024414, + "loss": 2.1421, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.596936821937561, + "rewards/margins": -0.5845605134963989, + "rewards/rejected": 2.18149733543396, + "step": 2015 + }, + { + "epoch": 0.33, + "learning_rate": 9.539284223300477e-06, + "logits/chosen": -1.1566979885101318, + "logits/rejected": -1.1510955095291138, + "logps/chosen": -118.07994842529297, + "logps/rejected": -74.57423400878906, + "loss": 1.8742, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.548207998275757, + "rewards/margins": 1.2791146039962769, + "rewards/rejected": 1.26909339427948, + "step": 2016 + }, + { + "epoch": 0.33, + "learning_rate": 9.538733025735494e-06, + "logits/chosen": -1.055317759513855, + "logits/rejected": -0.9976764917373657, + "logps/chosen": -85.42465209960938, + "logps/rejected": -55.98385238647461, + "loss": 0.3076, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9703400135040283, + "rewards/margins": 0.19127774238586426, + "rewards/rejected": 2.779062271118164, + "step": 2017 + }, + { + "epoch": 0.33, + "learning_rate": 9.538181514587004e-06, + "logits/chosen": -0.4847639203071594, + "logits/rejected": -0.485116571187973, + "logps/chosen": -1.605952501296997, + "logps/rejected": -2.443410873413086, + "loss": 0.8303, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.32177627086639404, + "rewards/margins": -0.02283155918121338, + "rewards/rejected": 0.3446078300476074, + "step": 2018 + }, + { + "epoch": 0.33, + "learning_rate": 9.53762968989311e-06, + "logits/chosen": -0.7243329882621765, + "logits/rejected": -0.5984389185905457, + "logps/chosen": -46.07612991333008, + "logps/rejected": -23.63604164123535, + "loss": 0.0871, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6713924407958984, + "rewards/margins": 1.7426958084106445, + "rewards/rejected": 0.9286966323852539, + "step": 2019 + }, + { + "epoch": 0.33, + "learning_rate": 9.537077551691943e-06, + "logits/chosen": -1.1068960428237915, + "logits/rejected": -1.042698860168457, + "logps/chosen": -73.60845947265625, + "logps/rejected": -18.034706115722656, + "loss": 0.4324, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1219253540039062, + "rewards/margins": 1.1742651462554932, + "rewards/rejected": 0.9476602673530579, + "step": 2020 + }, + { + "epoch": 0.33, + "learning_rate": 9.536525100021648e-06, + "logits/chosen": -1.1592752933502197, + "logits/rejected": -0.9502729773521423, + "logps/chosen": -94.11043548583984, + "logps/rejected": -18.44806671142578, + "loss": 2.5898, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4623239040374756, + "rewards/margins": 2.8710620403289795, + "rewards/rejected": 0.5912618637084961, + "step": 2021 + }, + { + "epoch": 0.33, + "learning_rate": 9.535972334920392e-06, + "logits/chosen": -0.18762853741645813, + "logits/rejected": -0.18762853741645813, + "logps/chosen": -46.701351165771484, + "logps/rejected": -46.701351165771484, + "loss": 1.0236, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.42421531677246094, + "rewards/margins": 0.0, + "rewards/rejected": 0.42421531677246094, + "step": 2022 + }, + { + "epoch": 0.33, + "learning_rate": 9.535419256426366e-06, + "logits/chosen": -1.108136773109436, + "logits/rejected": -1.0778090953826904, + "logps/chosen": -98.18936920166016, + "logps/rejected": -54.364620208740234, + "loss": 0.5644, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.23555909097194672, + "rewards/margins": -0.5810130834579468, + "rewards/rejected": 0.8165721893310547, + "step": 2023 + }, + { + "epoch": 0.33, + "learning_rate": 9.534865864577785e-06, + "logits/chosen": -0.8136953711509705, + "logits/rejected": -0.8081591725349426, + "logps/chosen": -54.00499725341797, + "logps/rejected": -89.273681640625, + "loss": 0.4421, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0429985523223877, + "rewards/margins": 1.0085800886154175, + "rewards/rejected": 1.0344184637069702, + "step": 2024 + }, + { + "epoch": 0.33, + "learning_rate": 9.534312159412882e-06, + "logits/chosen": -0.9405741691589355, + "logits/rejected": -1.036285161972046, + "logps/chosen": -63.0719108581543, + "logps/rejected": -140.22032165527344, + "loss": 1.8345, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8514561653137207, + "rewards/margins": -3.236950397491455, + "rewards/rejected": 6.088406562805176, + "step": 2025 + }, + { + "epoch": 0.33, + "learning_rate": 9.533758140969913e-06, + "logits/chosen": -1.2422285079956055, + "logits/rejected": -1.096492886543274, + "logps/chosen": -139.77027893066406, + "logps/rejected": -129.50564575195312, + "loss": 0.4213, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.546494960784912, + "rewards/margins": 3.2902145385742188, + "rewards/rejected": 1.256280541419983, + "step": 2026 + }, + { + "epoch": 0.33, + "learning_rate": 9.533203809287157e-06, + "logits/chosen": -0.9295495748519897, + "logits/rejected": -0.7694579362869263, + "logps/chosen": -37.451290130615234, + "logps/rejected": -19.30529022216797, + "loss": 0.1018, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.216829299926758, + "rewards/margins": 2.077226161956787, + "rewards/rejected": 0.13960324227809906, + "step": 2027 + }, + { + "epoch": 0.33, + "learning_rate": 9.53264916440291e-06, + "logits/chosen": -0.9947550296783447, + "logits/rejected": -1.0686242580413818, + "logps/chosen": -67.25575256347656, + "logps/rejected": -71.76750183105469, + "loss": 0.8362, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.053453207015991, + "rewards/margins": -1.4245643615722656, + "rewards/rejected": 3.478017568588257, + "step": 2028 + }, + { + "epoch": 0.33, + "learning_rate": 9.532094206355493e-06, + "logits/chosen": -1.0555697679519653, + "logits/rejected": -1.0882413387298584, + "logps/chosen": -43.667903900146484, + "logps/rejected": -71.64498901367188, + "loss": 0.4848, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1843162775039673, + "rewards/margins": 0.5653015375137329, + "rewards/rejected": 0.6190147399902344, + "step": 2029 + }, + { + "epoch": 0.33, + "learning_rate": 9.531538935183252e-06, + "logits/chosen": -0.8559216260910034, + "logits/rejected": -0.699413537979126, + "logps/chosen": -60.59361267089844, + "logps/rejected": -65.48677825927734, + "loss": 0.508, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.053513288497925, + "rewards/margins": 0.7454298734664917, + "rewards/rejected": 1.308083415031433, + "step": 2030 + }, + { + "epoch": 0.33, + "learning_rate": 9.530983350924545e-06, + "logits/chosen": -0.7334662675857544, + "logits/rejected": -0.6161001920700073, + "logps/chosen": -91.854736328125, + "logps/rejected": -111.94445037841797, + "loss": 0.6733, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.299635410308838, + "rewards/margins": -0.953162431716919, + "rewards/rejected": 3.252797842025757, + "step": 2031 + }, + { + "epoch": 0.33, + "learning_rate": 9.530427453617763e-06, + "logits/chosen": -0.8519218564033508, + "logits/rejected": -0.7300960421562195, + "logps/chosen": -43.59713363647461, + "logps/rejected": -40.56156921386719, + "loss": 1.0986, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.539844274520874, + "rewards/margins": -0.5893199443817139, + "rewards/rejected": 3.129164218902588, + "step": 2032 + }, + { + "epoch": 0.33, + "learning_rate": 9.529871243301312e-06, + "logits/chosen": -0.6680408120155334, + "logits/rejected": -0.6812381744384766, + "logps/chosen": -66.3652572631836, + "logps/rejected": -50.37528991699219, + "loss": 1.8597, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.018682837486267, + "rewards/margins": -1.1179665327072144, + "rewards/rejected": 2.1366493701934814, + "step": 2033 + }, + { + "epoch": 0.33, + "learning_rate": 9.529314720013618e-06, + "logits/chosen": -0.926491379737854, + "logits/rejected": -0.6418758630752563, + "logps/chosen": -105.21453094482422, + "logps/rejected": -23.657672882080078, + "loss": 0.0994, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.414618968963623, + "rewards/margins": 4.1568427085876465, + "rewards/rejected": 0.25777626037597656, + "step": 2034 + }, + { + "epoch": 0.33, + "learning_rate": 9.528757883793135e-06, + "logits/chosen": -0.9117234945297241, + "logits/rejected": -0.6501246094703674, + "logps/chosen": -43.227874755859375, + "logps/rejected": -48.841094970703125, + "loss": 1.0374, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8257427215576172, + "rewards/margins": -1.873913288116455, + "rewards/rejected": 3.6996560096740723, + "step": 2035 + }, + { + "epoch": 0.33, + "learning_rate": 9.528200734678333e-06, + "logits/chosen": -0.8394935727119446, + "logits/rejected": -0.8193403482437134, + "logps/chosen": -76.92097473144531, + "logps/rejected": -111.18092346191406, + "loss": 0.1237, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.098613739013672, + "rewards/margins": 1.340349555015564, + "rewards/rejected": 0.7582641839981079, + "step": 2036 + }, + { + "epoch": 0.33, + "learning_rate": 9.527643272707707e-06, + "logits/chosen": -1.283278226852417, + "logits/rejected": -1.3449805974960327, + "logps/chosen": -98.05734252929688, + "logps/rejected": -132.40052795410156, + "loss": 1.5193, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1117217540740967, + "rewards/margins": -2.0153472423553467, + "rewards/rejected": 4.127068996429443, + "step": 2037 + }, + { + "epoch": 0.33, + "learning_rate": 9.527085497919773e-06, + "logits/chosen": -0.6990175247192383, + "logits/rejected": -0.6947912573814392, + "logps/chosen": -55.78984832763672, + "logps/rejected": -105.08596801757812, + "loss": 0.1801, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.825060248374939, + "rewards/margins": 1.160325527191162, + "rewards/rejected": 0.6647346615791321, + "step": 2038 + }, + { + "epoch": 0.33, + "learning_rate": 9.526527410353065e-06, + "logits/chosen": -1.0865422487258911, + "logits/rejected": -0.9349820613861084, + "logps/chosen": -59.95634841918945, + "logps/rejected": -51.23952865600586, + "loss": 0.1153, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.4608988761901855, + "rewards/margins": 1.4893605709075928, + "rewards/rejected": 2.9715383052825928, + "step": 2039 + }, + { + "epoch": 0.33, + "learning_rate": 9.525969010046143e-06, + "logits/chosen": -0.855427086353302, + "logits/rejected": -0.6955363750457764, + "logps/chosen": -49.94821548461914, + "logps/rejected": -16.99905776977539, + "loss": 0.1103, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7109830379486084, + "rewards/margins": 2.083219528198242, + "rewards/rejected": 0.627763569355011, + "step": 2040 + }, + { + "epoch": 0.33, + "learning_rate": 9.52541029703759e-06, + "logits/chosen": -0.7651422619819641, + "logits/rejected": -0.7961987853050232, + "logps/chosen": -48.41761779785156, + "logps/rejected": -47.00725555419922, + "loss": 0.3381, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9064209461212158, + "rewards/margins": 0.16411590576171875, + "rewards/rejected": 1.742305040359497, + "step": 2041 + }, + { + "epoch": 0.33, + "learning_rate": 9.524851271366002e-06, + "logits/chosen": -1.1488842964172363, + "logits/rejected": -1.1775405406951904, + "logps/chosen": -116.56199645996094, + "logps/rejected": -104.25614929199219, + "loss": 1.1566, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5458176136016846, + "rewards/margins": -1.74530029296875, + "rewards/rejected": 3.2911179065704346, + "step": 2042 + }, + { + "epoch": 0.33, + "learning_rate": 9.524291933070009e-06, + "logits/chosen": -0.4313354790210724, + "logits/rejected": -0.4313354790210724, + "logps/chosen": -20.51127052307129, + "logps/rejected": -20.51127052307129, + "loss": 0.5847, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2864639461040497, + "rewards/margins": 0.0, + "rewards/rejected": 0.2864639461040497, + "step": 2043 + }, + { + "epoch": 0.33, + "learning_rate": 9.523732282188251e-06, + "logits/chosen": -0.51182621717453, + "logits/rejected": -0.40471673011779785, + "logps/chosen": -40.80632400512695, + "logps/rejected": -47.69080352783203, + "loss": 0.8561, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5507495403289795, + "rewards/margins": 0.28714704513549805, + "rewards/rejected": 2.2636024951934814, + "step": 2044 + }, + { + "epoch": 0.33, + "learning_rate": 9.523172318759397e-06, + "logits/chosen": -1.2232223749160767, + "logits/rejected": -1.2344902753829956, + "logps/chosen": -44.899749755859375, + "logps/rejected": -67.10211944580078, + "loss": 1.1502, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5576583743095398, + "rewards/margins": -0.16641312837600708, + "rewards/rejected": 0.7240715026855469, + "step": 2045 + }, + { + "epoch": 0.33, + "learning_rate": 9.522612042822132e-06, + "logits/chosen": -1.1122511625289917, + "logits/rejected": -1.0768287181854248, + "logps/chosen": -68.3643798828125, + "logps/rejected": -54.449546813964844, + "loss": 0.417, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6187500953674316, + "rewards/margins": 0.34381937980651855, + "rewards/rejected": 2.274930715560913, + "step": 2046 + }, + { + "epoch": 0.33, + "learning_rate": 9.52205145441517e-06, + "logits/chosen": -0.8219259977340698, + "logits/rejected": -0.8198229670524597, + "logps/chosen": -55.9246826171875, + "logps/rejected": -69.72978973388672, + "loss": 0.5315, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5596725940704346, + "rewards/margins": -0.5905852317810059, + "rewards/rejected": 2.1502578258514404, + "step": 2047 + }, + { + "epoch": 0.33, + "learning_rate": 9.521490553577242e-06, + "logits/chosen": -0.9012123942375183, + "logits/rejected": -0.9091593623161316, + "logps/chosen": -77.36885070800781, + "logps/rejected": -66.71353149414062, + "loss": 1.3557, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8509330749511719, + "rewards/margins": 0.5359458923339844, + "rewards/rejected": 1.3149871826171875, + "step": 2048 + }, + { + "epoch": 0.33, + "learning_rate": 9.520929340347096e-06, + "logits/chosen": -1.137458086013794, + "logits/rejected": -1.1074656248092651, + "logps/chosen": -92.99087524414062, + "logps/rejected": -108.78168487548828, + "loss": 0.8775, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.037884712219238, + "rewards/margins": -1.4923453330993652, + "rewards/rejected": 5.5302300453186035, + "step": 2049 + }, + { + "epoch": 0.33, + "learning_rate": 9.520367814763514e-06, + "logits/chosen": -0.813678503036499, + "logits/rejected": -0.813678503036499, + "logps/chosen": -85.5481185913086, + "logps/rejected": -85.5481185913086, + "loss": 1.8334, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8552131652832031, + "rewards/margins": 0.0, + "rewards/rejected": 1.8552131652832031, + "step": 2050 + }, + { + "epoch": 0.33, + "learning_rate": 9.519805976865285e-06, + "logits/chosen": -1.1993547677993774, + "logits/rejected": -1.1791759729385376, + "logps/chosen": -228.0741424560547, + "logps/rejected": -93.44252014160156, + "loss": 0.1332, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.9313859939575195, + "rewards/margins": 1.2191271781921387, + "rewards/rejected": 5.712258815765381, + "step": 2051 + }, + { + "epoch": 0.33, + "learning_rate": 9.519243826691232e-06, + "logits/chosen": -0.8134918212890625, + "logits/rejected": -0.8183515667915344, + "logps/chosen": -1.7051582336425781, + "logps/rejected": -4.052117824554443, + "loss": 0.4615, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3719097077846527, + "rewards/margins": 0.06518986821174622, + "rewards/rejected": 0.3067198395729065, + "step": 2052 + }, + { + "epoch": 0.33, + "learning_rate": 9.51868136428019e-06, + "logits/chosen": -1.0152369737625122, + "logits/rejected": -0.937114417552948, + "logps/chosen": -61.90602111816406, + "logps/rejected": -35.58158493041992, + "loss": 0.1542, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.703172445297241, + "rewards/margins": 2.2600903511047363, + "rewards/rejected": 1.4430820941925049, + "step": 2053 + }, + { + "epoch": 0.33, + "learning_rate": 9.518118589671025e-06, + "logits/chosen": -0.8442203998565674, + "logits/rejected": -0.9103386402130127, + "logps/chosen": -160.58673095703125, + "logps/rejected": -54.71810531616211, + "loss": 0.9394, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.929966688156128, + "rewards/margins": 2.1635584831237793, + "rewards/rejected": 1.7664082050323486, + "step": 2054 + }, + { + "epoch": 0.33, + "learning_rate": 9.517555502902613e-06, + "logits/chosen": -0.8140485882759094, + "logits/rejected": -0.5785010457038879, + "logps/chosen": -80.02629089355469, + "logps/rejected": -57.40987777709961, + "loss": 0.0774, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6637725830078125, + "rewards/margins": 2.5297157764434814, + "rewards/rejected": 0.13405685126781464, + "step": 2055 + }, + { + "epoch": 0.33, + "learning_rate": 9.51699210401386e-06, + "logits/chosen": -0.979690432548523, + "logits/rejected": -0.9037795662879944, + "logps/chosen": -34.9381103515625, + "logps/rejected": -52.98271179199219, + "loss": 0.4791, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.433846354484558, + "rewards/margins": -0.18979334831237793, + "rewards/rejected": 1.623639702796936, + "step": 2056 + }, + { + "epoch": 0.33, + "learning_rate": 9.516428393043697e-06, + "logits/chosen": -0.9981504678726196, + "logits/rejected": -0.867953896522522, + "logps/chosen": -68.53884887695312, + "logps/rejected": -54.04202651977539, + "loss": 0.4076, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8867905139923096, + "rewards/margins": -0.2303760051727295, + "rewards/rejected": 2.117166519165039, + "step": 2057 + }, + { + "epoch": 0.33, + "learning_rate": 9.515864370031066e-06, + "logits/chosen": -0.5507470369338989, + "logits/rejected": -0.6317939162254333, + "logps/chosen": -74.63509368896484, + "logps/rejected": -85.89320373535156, + "loss": 0.6828, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.376214623451233, + "rewards/margins": -0.8499237298965454, + "rewards/rejected": 2.2261383533477783, + "step": 2058 + }, + { + "epoch": 0.33, + "learning_rate": 9.515300035014934e-06, + "logits/chosen": -0.8963437676429749, + "logits/rejected": -0.9187793135643005, + "logps/chosen": -100.033935546875, + "logps/rejected": -68.12025451660156, + "loss": 0.2089, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3399772644042969, + "rewards/margins": 0.6595519781112671, + "rewards/rejected": 0.6804252862930298, + "step": 2059 + }, + { + "epoch": 0.33, + "learning_rate": 9.514735388034295e-06, + "logits/chosen": -0.7168371081352234, + "logits/rejected": -0.7168371081352234, + "logps/chosen": -94.89070892333984, + "logps/rejected": -94.89070892333984, + "loss": 0.5895, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.572344958782196, + "rewards/margins": 0.0, + "rewards/rejected": 0.572344958782196, + "step": 2060 + }, + { + "epoch": 0.33, + "learning_rate": 9.51417042912816e-06, + "logits/chosen": -1.0130276679992676, + "logits/rejected": -1.0273420810699463, + "logps/chosen": -65.12847900390625, + "logps/rejected": -74.6568603515625, + "loss": 0.5255, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1926522254943848, + "rewards/margins": -0.3575630187988281, + "rewards/rejected": 2.550215244293213, + "step": 2061 + }, + { + "epoch": 0.33, + "learning_rate": 9.513605158335562e-06, + "logits/chosen": -0.7630788087844849, + "logits/rejected": -0.7894383072853088, + "logps/chosen": -15.414297103881836, + "logps/rejected": -55.24172592163086, + "loss": 0.4017, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2968204617500305, + "rewards/margins": -0.20366060733795166, + "rewards/rejected": 0.5004810690879822, + "step": 2062 + }, + { + "epoch": 0.33, + "learning_rate": 9.513039575695555e-06, + "logits/chosen": -0.9496198296546936, + "logits/rejected": -0.7892709374427795, + "logps/chosen": -153.46389770507812, + "logps/rejected": -99.88072204589844, + "loss": 0.0225, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.341879367828369, + "rewards/margins": 3.180124044418335, + "rewards/rejected": 2.161755323410034, + "step": 2063 + }, + { + "epoch": 0.34, + "learning_rate": 9.512473681247217e-06, + "logits/chosen": -0.9825796484947205, + "logits/rejected": -0.9773126244544983, + "logps/chosen": -61.45779037475586, + "logps/rejected": -137.78570556640625, + "loss": 2.5402, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6960041522979736, + "rewards/margins": -5.065624237060547, + "rewards/rejected": 6.7616286277771, + "step": 2064 + }, + { + "epoch": 0.34, + "learning_rate": 9.511907475029645e-06, + "logits/chosen": -0.8378568291664124, + "logits/rejected": -0.8120297789573669, + "logps/chosen": -145.97410583496094, + "logps/rejected": -73.78863525390625, + "loss": 1.0374, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1556777954101562, + "rewards/margins": -1.8444640636444092, + "rewards/rejected": 3.0001418590545654, + "step": 2065 + }, + { + "epoch": 0.34, + "learning_rate": 9.511340957081957e-06, + "logits/chosen": -1.0092785358428955, + "logits/rejected": -0.9446258544921875, + "logps/chosen": -76.99934387207031, + "logps/rejected": -75.39901733398438, + "loss": 0.6237, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5861542224884033, + "rewards/margins": 2.1316604614257812, + "rewards/rejected": 1.454493761062622, + "step": 2066 + }, + { + "epoch": 0.34, + "learning_rate": 9.5107741274433e-06, + "logits/chosen": -0.8284846544265747, + "logits/rejected": -0.8632785081863403, + "logps/chosen": -31.58277130126953, + "logps/rejected": -71.75028991699219, + "loss": 0.8158, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5519840121269226, + "rewards/margins": -0.26321983337402344, + "rewards/rejected": 0.815203845500946, + "step": 2067 + }, + { + "epoch": 0.34, + "learning_rate": 9.510206986152827e-06, + "logits/chosen": -1.3823045492172241, + "logits/rejected": -1.395904541015625, + "logps/chosen": -67.34243774414062, + "logps/rejected": -60.54343795776367, + "loss": 1.1275, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2957870960235596, + "rewards/margins": -0.5454013347625732, + "rewards/rejected": 1.8411884307861328, + "step": 2068 + }, + { + "epoch": 0.34, + "learning_rate": 9.509639533249729e-06, + "logits/chosen": -0.8530190587043762, + "logits/rejected": -0.8357324600219727, + "logps/chosen": -71.99761199951172, + "logps/rejected": -100.55738830566406, + "loss": 0.8035, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.84550940990448, + "rewards/margins": -1.1814666986465454, + "rewards/rejected": 3.0269761085510254, + "step": 2069 + }, + { + "epoch": 0.34, + "learning_rate": 9.50907176877321e-06, + "logits/chosen": -1.0292657613754272, + "logits/rejected": -0.7878952026367188, + "logps/chosen": -193.94189453125, + "logps/rejected": -26.488378524780273, + "loss": 0.3627, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2315995693206787, + "rewards/margins": 2.089071750640869, + "rewards/rejected": 0.142527773976326, + "step": 2070 + }, + { + "epoch": 0.34, + "learning_rate": 9.508503692762496e-06, + "logits/chosen": -1.2166458368301392, + "logits/rejected": -1.2391973733901978, + "logps/chosen": -135.80145263671875, + "logps/rejected": -65.82730102539062, + "loss": 1.6496, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4403396546840668, + "rewards/margins": -3.235541582107544, + "rewards/rejected": 3.6758811473846436, + "step": 2071 + }, + { + "epoch": 0.34, + "learning_rate": 9.507935305256839e-06, + "logits/chosen": -0.9319061636924744, + "logits/rejected": -0.9736292958259583, + "logps/chosen": -125.53638458251953, + "logps/rejected": -128.7039794921875, + "loss": 1.2903, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9718208312988281, + "rewards/margins": -0.4258079528808594, + "rewards/rejected": 2.3976287841796875, + "step": 2072 + }, + { + "epoch": 0.34, + "learning_rate": 9.507366606295504e-06, + "logits/chosen": -1.53078293800354, + "logits/rejected": -1.5113853216171265, + "logps/chosen": -29.07676124572754, + "logps/rejected": -82.712158203125, + "loss": 0.7109, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.078160047531128, + "rewards/margins": -1.0120735168457031, + "rewards/rejected": 3.090233564376831, + "step": 2073 + }, + { + "epoch": 0.34, + "learning_rate": 9.506797595917787e-06, + "logits/chosen": -0.6136572360992432, + "logits/rejected": -0.778741180896759, + "logps/chosen": -69.9412841796875, + "logps/rejected": -160.10092163085938, + "loss": 1.1561, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.000903367996216, + "rewards/margins": -2.201115369796753, + "rewards/rejected": 4.202018737792969, + "step": 2074 + }, + { + "epoch": 0.34, + "learning_rate": 9.506228274163e-06, + "logits/chosen": -0.726336658000946, + "logits/rejected": -0.781007707118988, + "logps/chosen": -78.33629608154297, + "logps/rejected": -71.53854370117188, + "loss": 0.4945, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.46012806892395, + "rewards/margins": 0.804486870765686, + "rewards/rejected": 1.6556411981582642, + "step": 2075 + }, + { + "epoch": 0.34, + "learning_rate": 9.505658641070477e-06, + "logits/chosen": -0.6245277523994446, + "logits/rejected": -0.6245277523994446, + "logps/chosen": -34.262264251708984, + "logps/rejected": -34.262264251708984, + "loss": 0.6942, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3471607267856598, + "rewards/margins": 0.0, + "rewards/rejected": 0.3471607267856598, + "step": 2076 + }, + { + "epoch": 0.34, + "learning_rate": 9.505088696679577e-06, + "logits/chosen": -0.9397861361503601, + "logits/rejected": -0.9457151293754578, + "logps/chosen": -10.59402847290039, + "logps/rejected": -2.965648651123047, + "loss": 0.4933, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4533925950527191, + "rewards/margins": -0.03039085865020752, + "rewards/rejected": 0.48378345370292664, + "step": 2077 + }, + { + "epoch": 0.34, + "learning_rate": 9.504518441029674e-06, + "logits/chosen": -0.7282105684280396, + "logits/rejected": -0.6887076497077942, + "logps/chosen": -72.46047973632812, + "logps/rejected": -82.62266540527344, + "loss": 0.3127, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3942962884902954, + "rewards/margins": 0.2680175304412842, + "rewards/rejected": 1.1262787580490112, + "step": 2078 + }, + { + "epoch": 0.34, + "learning_rate": 9.503947874160168e-06, + "logits/chosen": -1.082215428352356, + "logits/rejected": -1.021400809288025, + "logps/chosen": -63.93951416015625, + "logps/rejected": -75.42339324951172, + "loss": 1.3838, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8508636951446533, + "rewards/margins": -1.5872657299041748, + "rewards/rejected": 3.438129425048828, + "step": 2079 + }, + { + "epoch": 0.34, + "learning_rate": 9.50337699611048e-06, + "logits/chosen": -1.0668448209762573, + "logits/rejected": -1.2856799364089966, + "logps/chosen": -104.92993927001953, + "logps/rejected": -34.158870697021484, + "loss": 0.2544, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7251747250556946, + "rewards/margins": 0.4464759826660156, + "rewards/rejected": 0.27869874238967896, + "step": 2080 + }, + { + "epoch": 0.34, + "learning_rate": 9.502805806920056e-06, + "logits/chosen": -0.8828607201576233, + "logits/rejected": -0.9078025817871094, + "logps/chosen": -79.8077392578125, + "logps/rejected": -140.12796020507812, + "loss": 1.7597, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.880476474761963, + "rewards/margins": -3.4331130981445312, + "rewards/rejected": 6.313589572906494, + "step": 2081 + }, + { + "epoch": 0.34, + "learning_rate": 9.502234306628354e-06, + "logits/chosen": -1.016487717628479, + "logits/rejected": -1.0067894458770752, + "logps/chosen": -26.51491928100586, + "logps/rejected": -65.19644165039062, + "loss": 1.3136, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.416804552078247, + "rewards/margins": -0.9785752296447754, + "rewards/rejected": 2.3953797817230225, + "step": 2082 + }, + { + "epoch": 0.34, + "learning_rate": 9.501662495274864e-06, + "logits/chosen": -0.7406004071235657, + "logits/rejected": -0.7406004071235657, + "logps/chosen": -61.977745056152344, + "logps/rejected": -61.977745056152344, + "loss": 0.3472, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9177483320236206, + "rewards/margins": 0.0, + "rewards/rejected": 1.9177483320236206, + "step": 2083 + }, + { + "epoch": 0.34, + "learning_rate": 9.50109037289909e-06, + "logits/chosen": -0.7253075838088989, + "logits/rejected": -0.7253075838088989, + "logps/chosen": -64.82923126220703, + "logps/rejected": -64.82923126220703, + "loss": 0.5942, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.928668975830078, + "rewards/margins": 0.0, + "rewards/rejected": 2.928668975830078, + "step": 2084 + }, + { + "epoch": 0.34, + "learning_rate": 9.50051793954056e-06, + "logits/chosen": -1.058052897453308, + "logits/rejected": -0.7907535433769226, + "logps/chosen": -152.8988800048828, + "logps/rejected": -26.889537811279297, + "loss": 2.7573, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.433017253875732, + "rewards/margins": 3.5365970134735107, + "rewards/rejected": 0.8964203000068665, + "step": 2085 + }, + { + "epoch": 0.34, + "learning_rate": 9.499945195238826e-06, + "logits/chosen": -0.8114833831787109, + "logits/rejected": -0.7314046025276184, + "logps/chosen": -59.97901916503906, + "logps/rejected": -98.8386001586914, + "loss": 0.1106, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5599411725997925, + "rewards/margins": 1.4107697010040283, + "rewards/rejected": 0.14917145669460297, + "step": 2086 + }, + { + "epoch": 0.34, + "learning_rate": 9.499372140033456e-06, + "logits/chosen": -0.9204338192939758, + "logits/rejected": -0.8064717650413513, + "logps/chosen": -57.20164489746094, + "logps/rejected": -30.844966888427734, + "loss": 0.197, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6989822387695312, + "rewards/margins": 3.5033950805664062, + "rewards/rejected": 0.195587158203125, + "step": 2087 + }, + { + "epoch": 0.34, + "learning_rate": 9.498798773964046e-06, + "logits/chosen": -0.9719592928886414, + "logits/rejected": -1.0130350589752197, + "logps/chosen": -22.31694984436035, + "logps/rejected": -59.06841278076172, + "loss": 0.8825, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3437983989715576, + "rewards/margins": -1.1086971759796143, + "rewards/rejected": 3.452495574951172, + "step": 2088 + }, + { + "epoch": 0.34, + "learning_rate": 9.498225097070208e-06, + "logits/chosen": -1.295465111732483, + "logits/rejected": -1.2977229356765747, + "logps/chosen": -108.56715393066406, + "logps/rejected": -49.95933532714844, + "loss": 0.0471, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.902308940887451, + "rewards/margins": 2.411628246307373, + "rewards/rejected": 3.490680694580078, + "step": 2089 + }, + { + "epoch": 0.34, + "learning_rate": 9.49765110939158e-06, + "logits/chosen": -1.2237586975097656, + "logits/rejected": -1.3603092432022095, + "logps/chosen": -262.5345764160156, + "logps/rejected": -150.6004638671875, + "loss": 1.0709, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.1635284423828125, + "rewards/margins": 0.07813405990600586, + "rewards/rejected": 7.085394382476807, + "step": 2090 + }, + { + "epoch": 0.34, + "learning_rate": 9.497076810967816e-06, + "logits/chosen": -0.8047556281089783, + "logits/rejected": -0.7794623970985413, + "logps/chosen": -32.943382263183594, + "logps/rejected": -17.872310638427734, + "loss": 0.3489, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.895581066608429, + "rewards/margins": 0.37076014280319214, + "rewards/rejected": 0.5248209238052368, + "step": 2091 + }, + { + "epoch": 0.34, + "learning_rate": 9.496502201838598e-06, + "logits/chosen": -0.6723676323890686, + "logits/rejected": -0.5539097189903259, + "logps/chosen": -60.74430465698242, + "logps/rejected": -46.098365783691406, + "loss": 0.8301, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3995227813720703, + "rewards/margins": 0.8641990423202515, + "rewards/rejected": 1.5353237390518188, + "step": 2092 + }, + { + "epoch": 0.34, + "learning_rate": 9.495927282043622e-06, + "logits/chosen": -1.093544602394104, + "logits/rejected": -0.9607190489768982, + "logps/chosen": -52.21548843383789, + "logps/rejected": -18.021875381469727, + "loss": 0.1803, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6984951496124268, + "rewards/margins": 0.8969748020172119, + "rewards/rejected": 0.8015203475952148, + "step": 2093 + }, + { + "epoch": 0.34, + "learning_rate": 9.495352051622612e-06, + "logits/chosen": -0.9511111974716187, + "logits/rejected": -0.938773512840271, + "logps/chosen": -46.350582122802734, + "logps/rejected": -107.01905822753906, + "loss": 0.4913, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.045727252960205, + "rewards/margins": 0.19557392597198486, + "rewards/rejected": 1.8501533269882202, + "step": 2094 + }, + { + "epoch": 0.34, + "learning_rate": 9.494776510615312e-06, + "logits/chosen": -0.7624080181121826, + "logits/rejected": -0.760381817817688, + "logps/chosen": -47.29529571533203, + "logps/rejected": -58.08338165283203, + "loss": 0.8743, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8721001148223877, + "rewards/margins": -0.9769988059997559, + "rewards/rejected": 2.8490989208221436, + "step": 2095 + }, + { + "epoch": 0.34, + "learning_rate": 9.494200659061484e-06, + "logits/chosen": -1.1400883197784424, + "logits/rejected": -1.1535043716430664, + "logps/chosen": -69.50545501708984, + "logps/rejected": -144.23529052734375, + "loss": 0.7402, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3617881536483765, + "rewards/margins": 0.53342205286026, + "rewards/rejected": 0.8283661007881165, + "step": 2096 + }, + { + "epoch": 0.34, + "learning_rate": 9.493624497000914e-06, + "logits/chosen": -1.084053874015808, + "logits/rejected": -1.0036709308624268, + "logps/chosen": -101.085205078125, + "logps/rejected": -67.96827697753906, + "loss": 0.0282, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.886474609375, + "rewards/margins": 3.37420654296875, + "rewards/rejected": 1.51226806640625, + "step": 2097 + }, + { + "epoch": 0.34, + "learning_rate": 9.493048024473413e-06, + "logits/chosen": -1.4270250797271729, + "logits/rejected": -1.4087550640106201, + "logps/chosen": -112.85172271728516, + "logps/rejected": -72.88428497314453, + "loss": 0.404, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.516002655029297, + "rewards/margins": 1.0715758800506592, + "rewards/rejected": 3.4444267749786377, + "step": 2098 + }, + { + "epoch": 0.34, + "learning_rate": 9.492471241518804e-06, + "logits/chosen": -1.0375741720199585, + "logits/rejected": -1.0178929567337036, + "logps/chosen": -55.77930450439453, + "logps/rejected": -35.53378677368164, + "loss": 0.3555, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.761824131011963, + "rewards/margins": 1.3534092903137207, + "rewards/rejected": 2.408414840698242, + "step": 2099 + }, + { + "epoch": 0.34, + "learning_rate": 9.491894148176942e-06, + "logits/chosen": -0.900732696056366, + "logits/rejected": -1.1157537698745728, + "logps/chosen": -37.20953369140625, + "logps/rejected": -143.2874755859375, + "loss": 1.9803, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1506507396698, + "rewards/margins": -3.3648569583892822, + "rewards/rejected": 5.515507698059082, + "step": 2100 + }, + { + "epoch": 0.34, + "learning_rate": 9.491316744487697e-06, + "logits/chosen": -0.5787387490272522, + "logits/rejected": -0.5787387490272522, + "logps/chosen": -37.67316818237305, + "logps/rejected": -37.67316818237305, + "loss": 1.2373, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6359401941299438, + "rewards/margins": 0.0, + "rewards/rejected": 0.6359401941299438, + "step": 2101 + }, + { + "epoch": 0.34, + "learning_rate": 9.490739030490962e-06, + "logits/chosen": -1.1846622228622437, + "logits/rejected": -1.1681808233261108, + "logps/chosen": -64.79928588867188, + "logps/rejected": -18.803810119628906, + "loss": 0.7198, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.11739044636487961, + "rewards/margins": -0.6949367523193359, + "rewards/rejected": 0.8123272061347961, + "step": 2102 + }, + { + "epoch": 0.34, + "learning_rate": 9.490161006226652e-06, + "logits/chosen": -1.0783276557922363, + "logits/rejected": -0.9611931443214417, + "logps/chosen": -123.2847671508789, + "logps/rejected": -102.92216491699219, + "loss": 0.2481, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1867302656173706, + "rewards/margins": 0.47548145055770874, + "rewards/rejected": 0.7112488150596619, + "step": 2103 + }, + { + "epoch": 0.34, + "learning_rate": 9.489582671734702e-06, + "logits/chosen": -1.668272614479065, + "logits/rejected": -1.7239121198654175, + "logps/chosen": -49.21147918701172, + "logps/rejected": -124.76579284667969, + "loss": 0.9739, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1629208326339722, + "rewards/margins": -1.780241847038269, + "rewards/rejected": 2.943162679672241, + "step": 2104 + }, + { + "epoch": 0.34, + "learning_rate": 9.48900402705507e-06, + "logits/chosen": -1.0614960193634033, + "logits/rejected": -0.9954993724822998, + "logps/chosen": -98.31184387207031, + "logps/rejected": -113.48058319091797, + "loss": 0.0341, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.341331481933594, + "rewards/margins": 2.9256234169006348, + "rewards/rejected": 1.4157081842422485, + "step": 2105 + }, + { + "epoch": 0.34, + "learning_rate": 9.488425072227738e-06, + "logits/chosen": -0.7422699332237244, + "logits/rejected": -0.767805814743042, + "logps/chosen": -3.0767476558685303, + "logps/rejected": -35.63047409057617, + "loss": 0.4787, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2988322973251343, + "rewards/margins": -0.07161104679107666, + "rewards/rejected": 0.37044334411621094, + "step": 2106 + }, + { + "epoch": 0.34, + "learning_rate": 9.487845807292701e-06, + "logits/chosen": -0.8283845782279968, + "logits/rejected": -0.8876969218254089, + "logps/chosen": -116.95228576660156, + "logps/rejected": -113.88074493408203, + "loss": 1.5084, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.624995470046997, + "rewards/margins": -2.6692726612091064, + "rewards/rejected": 4.2942681312561035, + "step": 2107 + }, + { + "epoch": 0.34, + "learning_rate": 9.487266232289984e-06, + "logits/chosen": -1.0488150119781494, + "logits/rejected": -0.9375607371330261, + "logps/chosen": -115.80729675292969, + "logps/rejected": -76.32371520996094, + "loss": 3.029, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.045861721038818, + "rewards/margins": 1.2647018432617188, + "rewards/rejected": 4.7811598777771, + "step": 2108 + }, + { + "epoch": 0.34, + "learning_rate": 9.486686347259629e-06, + "logits/chosen": -0.675416111946106, + "logits/rejected": -0.6820074915885925, + "logps/chosen": -1.049217700958252, + "logps/rejected": -3.333028554916382, + "loss": 0.4023, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.20413736999034882, + "rewards/margins": -0.19005052745342255, + "rewards/rejected": 0.39418789744377136, + "step": 2109 + }, + { + "epoch": 0.34, + "learning_rate": 9.486106152241699e-06, + "logits/chosen": -1.0183815956115723, + "logits/rejected": -1.025357723236084, + "logps/chosen": -113.0693359375, + "logps/rejected": -100.79490661621094, + "loss": 1.5035, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6268326044082642, + "rewards/margins": -2.2064528465270996, + "rewards/rejected": 3.8332855701446533, + "step": 2110 + }, + { + "epoch": 0.34, + "learning_rate": 9.485525647276284e-06, + "logits/chosen": -1.049124836921692, + "logits/rejected": -1.049124836921692, + "logps/chosen": -69.17080688476562, + "logps/rejected": -69.17080688476562, + "loss": 1.4117, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7534027099609375, + "rewards/margins": 0.0, + "rewards/rejected": 1.7534027099609375, + "step": 2111 + }, + { + "epoch": 0.34, + "learning_rate": 9.484944832403489e-06, + "logits/chosen": -0.9605666399002075, + "logits/rejected": -0.8825033903121948, + "logps/chosen": -83.98989868164062, + "logps/rejected": -73.37138366699219, + "loss": 0.6073, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3980034589767456, + "rewards/margins": -0.7890852689743042, + "rewards/rejected": 2.18708872795105, + "step": 2112 + }, + { + "epoch": 0.34, + "learning_rate": 9.484363707663443e-06, + "logits/chosen": -1.4290865659713745, + "logits/rejected": -1.340769648551941, + "logps/chosen": -52.58298873901367, + "logps/rejected": -27.5518798828125, + "loss": 1.4052, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6704723834991455, + "rewards/margins": 3.0875680446624756, + "rewards/rejected": -0.4170955717563629, + "step": 2113 + }, + { + "epoch": 0.34, + "learning_rate": 9.483782273096295e-06, + "logits/chosen": -0.8700032830238342, + "logits/rejected": -0.7440629601478577, + "logps/chosen": -37.0336799621582, + "logps/rejected": -19.088621139526367, + "loss": 0.6504, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2112674713134766, + "rewards/margins": 0.4077417254447937, + "rewards/rejected": 0.8035257458686829, + "step": 2114 + }, + { + "epoch": 0.34, + "learning_rate": 9.483200528742219e-06, + "logits/chosen": -0.44485700130462646, + "logits/rejected": -1.1771671772003174, + "logps/chosen": -18.47445297241211, + "logps/rejected": -69.29718780517578, + "loss": 1.9179, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.38194772601127625, + "rewards/margins": -1.6583728790283203, + "rewards/rejected": 2.040320634841919, + "step": 2115 + }, + { + "epoch": 0.34, + "learning_rate": 9.482618474641406e-06, + "logits/chosen": -1.341780185699463, + "logits/rejected": -1.258719563484192, + "logps/chosen": -130.9730682373047, + "logps/rejected": -38.27322006225586, + "loss": 0.1698, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4917038679122925, + "rewards/margins": 0.933417558670044, + "rewards/rejected": 0.5582863092422485, + "step": 2116 + }, + { + "epoch": 0.34, + "learning_rate": 9.482036110834072e-06, + "logits/chosen": -1.2918814420700073, + "logits/rejected": -1.280080795288086, + "logps/chosen": -46.24186325073242, + "logps/rejected": -46.597068786621094, + "loss": 0.433, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.534082055091858, + "rewards/margins": -0.29794156551361084, + "rewards/rejected": 1.8320236206054688, + "step": 2117 + }, + { + "epoch": 0.34, + "learning_rate": 9.481453437360453e-06, + "logits/chosen": -1.1767960786819458, + "logits/rejected": -1.1605147123336792, + "logps/chosen": -53.57347869873047, + "logps/rejected": -60.25398635864258, + "loss": 2.177, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.002866506576538, + "rewards/margins": -1.4065585136413574, + "rewards/rejected": 3.4094250202178955, + "step": 2118 + }, + { + "epoch": 0.34, + "learning_rate": 9.480870454260804e-06, + "logits/chosen": -1.2258336544036865, + "logits/rejected": -1.2596243619918823, + "logps/chosen": -73.82290649414062, + "logps/rejected": -103.74995422363281, + "loss": 0.2048, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.254490613937378, + "rewards/margins": 0.833127498626709, + "rewards/rejected": 1.421363115310669, + "step": 2119 + }, + { + "epoch": 0.34, + "learning_rate": 9.480287161575405e-06, + "logits/chosen": -0.8027898669242859, + "logits/rejected": -0.825185239315033, + "logps/chosen": -40.955596923828125, + "logps/rejected": -37.190643310546875, + "loss": 0.7403, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1342689990997314, + "rewards/margins": -0.180466890335083, + "rewards/rejected": 2.3147358894348145, + "step": 2120 + }, + { + "epoch": 0.34, + "learning_rate": 9.479703559344558e-06, + "logits/chosen": -0.9436140656471252, + "logits/rejected": -0.9482328295707703, + "logps/chosen": -179.5196533203125, + "logps/rejected": -161.98663330078125, + "loss": 0.0739, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.284872531890869, + "rewards/margins": 2.506765842437744, + "rewards/rejected": 1.778106689453125, + "step": 2121 + }, + { + "epoch": 0.34, + "learning_rate": 9.47911964760858e-06, + "logits/chosen": -1.2041524648666382, + "logits/rejected": -1.234008550643921, + "logps/chosen": -87.80381774902344, + "logps/rejected": -99.39599609375, + "loss": 2.3837, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.911419630050659, + "rewards/margins": -1.5635898113250732, + "rewards/rejected": 4.475009441375732, + "step": 2122 + }, + { + "epoch": 0.34, + "learning_rate": 9.478535426407817e-06, + "logits/chosen": -1.146744966506958, + "logits/rejected": -1.0866025686264038, + "logps/chosen": -84.12245178222656, + "logps/rejected": -66.54156494140625, + "loss": 0.6467, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4258660078048706, + "rewards/margins": -0.08772504329681396, + "rewards/rejected": 1.5135910511016846, + "step": 2123 + }, + { + "epoch": 0.34, + "learning_rate": 9.477950895782632e-06, + "logits/chosen": -0.9759294986724854, + "logits/rejected": -0.9759294986724854, + "logps/chosen": -90.80583190917969, + "logps/rejected": -90.80583190917969, + "loss": 1.0372, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4004828929901123, + "rewards/margins": 0.0, + "rewards/rejected": 2.4004828929901123, + "step": 2124 + }, + { + "epoch": 0.34, + "learning_rate": 9.477366055773412e-06, + "logits/chosen": -0.8787668347358704, + "logits/rejected": -0.8667968511581421, + "logps/chosen": -2.4878859519958496, + "logps/rejected": -8.288673400878906, + "loss": 2.171, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2288384884595871, + "rewards/margins": -0.010597378015518188, + "rewards/rejected": 0.23943586647510529, + "step": 2125 + }, + { + "epoch": 0.35, + "learning_rate": 9.476780906420562e-06, + "logits/chosen": -0.9433884620666504, + "logits/rejected": -0.8677079677581787, + "logps/chosen": -33.342994689941406, + "logps/rejected": -65.41344451904297, + "loss": 0.28, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6933434009552, + "rewards/margins": 1.0210678577423096, + "rewards/rejected": 1.6722755432128906, + "step": 2126 + }, + { + "epoch": 0.35, + "learning_rate": 9.476195447764512e-06, + "logits/chosen": -0.9756327867507935, + "logits/rejected": -0.917607843875885, + "logps/chosen": -87.69345092773438, + "logps/rejected": -39.25739288330078, + "loss": 0.406, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.631921410560608, + "rewards/margins": 1.2475025653839111, + "rewards/rejected": 0.38441887497901917, + "step": 2127 + }, + { + "epoch": 0.35, + "learning_rate": 9.475609679845709e-06, + "logits/chosen": -0.9289446473121643, + "logits/rejected": -0.8907354474067688, + "logps/chosen": -56.305686950683594, + "logps/rejected": -69.4874267578125, + "loss": 1.0823, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3164010047912598, + "rewards/margins": -0.49101948738098145, + "rewards/rejected": 2.807420492172241, + "step": 2128 + }, + { + "epoch": 0.35, + "learning_rate": 9.475023602704626e-06, + "logits/chosen": -0.7427064776420593, + "logits/rejected": -0.8084893226623535, + "logps/chosen": -65.98453521728516, + "logps/rejected": -95.24992370605469, + "loss": 0.6197, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6105835437774658, + "rewards/margins": 0.23581624031066895, + "rewards/rejected": 1.3747673034667969, + "step": 2129 + }, + { + "epoch": 0.35, + "learning_rate": 9.474437216381756e-06, + "logits/chosen": -1.0766695737838745, + "logits/rejected": -1.194632887840271, + "logps/chosen": -136.75396728515625, + "logps/rejected": -188.31866455078125, + "loss": 0.0551, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.3774995803833, + "rewards/margins": 2.290457248687744, + "rewards/rejected": 7.087042331695557, + "step": 2130 + }, + { + "epoch": 0.35, + "learning_rate": 9.473850520917611e-06, + "logits/chosen": -1.2716307640075684, + "logits/rejected": -1.3146671056747437, + "logps/chosen": -247.0902557373047, + "logps/rejected": -122.75244140625, + "loss": 3.3373, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.406843662261963, + "rewards/margins": -5.303650379180908, + "rewards/rejected": 8.710494041442871, + "step": 2131 + }, + { + "epoch": 0.35, + "learning_rate": 9.473263516352728e-06, + "logits/chosen": -0.9788240790367126, + "logits/rejected": -0.9962859749794006, + "logps/chosen": -61.76936721801758, + "logps/rejected": -75.89411163330078, + "loss": 1.0508, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4353435039520264, + "rewards/margins": -0.2845335006713867, + "rewards/rejected": 2.719877004623413, + "step": 2132 + }, + { + "epoch": 0.35, + "learning_rate": 9.472676202727662e-06, + "logits/chosen": -1.1051628589630127, + "logits/rejected": -0.9073810577392578, + "logps/chosen": -105.88009643554688, + "logps/rejected": -89.2505111694336, + "loss": 0.0644, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.069446086883545, + "rewards/margins": 3.647451162338257, + "rewards/rejected": 2.421994924545288, + "step": 2133 + }, + { + "epoch": 0.35, + "learning_rate": 9.472088580082991e-06, + "logits/chosen": -1.03706693649292, + "logits/rejected": -1.016876459121704, + "logps/chosen": -38.096824645996094, + "logps/rejected": -26.8441219329834, + "loss": 0.6696, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9477604031562805, + "rewards/margins": -0.9340605139732361, + "rewards/rejected": 1.8818209171295166, + "step": 2134 + }, + { + "epoch": 0.35, + "learning_rate": 9.471500648459316e-06, + "logits/chosen": -1.1189452409744263, + "logits/rejected": -1.0544633865356445, + "logps/chosen": -53.64339065551758, + "logps/rejected": -43.926876068115234, + "loss": 0.2847, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3888325691223145, + "rewards/margins": 0.33190321922302246, + "rewards/rejected": 2.056929349899292, + "step": 2135 + }, + { + "epoch": 0.35, + "learning_rate": 9.470912407897253e-06, + "logits/chosen": -0.616843581199646, + "logits/rejected": -0.585292637348175, + "logps/chosen": -36.56529998779297, + "logps/rejected": -8.257939338684082, + "loss": 2.1408, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4908897578716278, + "rewards/margins": 0.030652344226837158, + "rewards/rejected": 0.46023741364479065, + "step": 2136 + }, + { + "epoch": 0.35, + "learning_rate": 9.470323858437449e-06, + "logits/chosen": -0.9176361560821533, + "logits/rejected": -0.8745540976524353, + "logps/chosen": -55.0827522277832, + "logps/rejected": -63.694244384765625, + "loss": 0.8093, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.926586627960205, + "rewards/margins": -0.8037409782409668, + "rewards/rejected": 3.730327606201172, + "step": 2137 + }, + { + "epoch": 0.35, + "learning_rate": 9.469735000120564e-06, + "logits/chosen": -1.1192563772201538, + "logits/rejected": -1.1027265787124634, + "logps/chosen": -57.39398956298828, + "logps/rejected": -63.63116455078125, + "loss": 0.277, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.270681142807007, + "rewards/margins": 0.3683542013168335, + "rewards/rejected": 1.9023269414901733, + "step": 2138 + }, + { + "epoch": 0.35, + "learning_rate": 9.469145832987283e-06, + "logits/chosen": -0.4577006697654724, + "logits/rejected": -0.4619300961494446, + "logps/chosen": -4.809720993041992, + "logps/rejected": -2.226691484451294, + "loss": 0.533, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.17873965203762054, + "rewards/margins": -0.164637491106987, + "rewards/rejected": 0.34337714314460754, + "step": 2139 + }, + { + "epoch": 0.35, + "learning_rate": 9.468556357078314e-06, + "logits/chosen": -0.7688714861869812, + "logits/rejected": -0.712695300579071, + "logps/chosen": -38.69415283203125, + "logps/rejected": -45.902286529541016, + "loss": 0.889, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.381787896156311, + "rewards/margins": 0.19491004943847656, + "rewards/rejected": 1.1868778467178345, + "step": 2140 + }, + { + "epoch": 0.35, + "learning_rate": 9.467966572434381e-06, + "logits/chosen": -0.842298686504364, + "logits/rejected": -0.880895733833313, + "logps/chosen": -75.2043685913086, + "logps/rejected": -113.04812622070312, + "loss": 1.3458, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.4973344802856445, + "rewards/margins": -2.1912808418273926, + "rewards/rejected": 6.688615322113037, + "step": 2141 + }, + { + "epoch": 0.35, + "learning_rate": 9.467376479096236e-06, + "logits/chosen": -0.8172710537910461, + "logits/rejected": -0.789920449256897, + "logps/chosen": -79.5560302734375, + "logps/rejected": -123.42408752441406, + "loss": 1.0682, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.645672619342804, + "rewards/margins": -1.1593260765075684, + "rewards/rejected": 1.804998755455017, + "step": 2142 + }, + { + "epoch": 0.35, + "learning_rate": 9.466786077104646e-06, + "logits/chosen": -0.6787011623382568, + "logits/rejected": -0.4209696054458618, + "logps/chosen": -74.31999206542969, + "logps/rejected": -30.114517211914062, + "loss": 0.125, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4719223976135254, + "rewards/margins": 2.318671226501465, + "rewards/rejected": 0.15325108170509338, + "step": 2143 + }, + { + "epoch": 0.35, + "learning_rate": 9.466195366500403e-06, + "logits/chosen": -0.9050275087356567, + "logits/rejected": -0.869448721408844, + "logps/chosen": -60.30915069580078, + "logps/rejected": -75.08489990234375, + "loss": 4.9047, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.081618547439575, + "rewards/margins": -2.155349016189575, + "rewards/rejected": 4.23696756362915, + "step": 2144 + }, + { + "epoch": 0.35, + "learning_rate": 9.465604347324319e-06, + "logits/chosen": -0.8719778656959534, + "logits/rejected": -0.8298080563545227, + "logps/chosen": -56.93258285522461, + "logps/rejected": -46.40127944946289, + "loss": 0.3217, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.160126209259033, + "rewards/margins": 0.37195825576782227, + "rewards/rejected": 1.788167953491211, + "step": 2145 + }, + { + "epoch": 0.35, + "learning_rate": 9.46501301961723e-06, + "logits/chosen": -0.838917076587677, + "logits/rejected": -0.7344749569892883, + "logps/chosen": -72.50711822509766, + "logps/rejected": -54.04640579223633, + "loss": 0.6414, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.265946388244629, + "rewards/margins": 1.2638838291168213, + "rewards/rejected": 3.0020625591278076, + "step": 2146 + }, + { + "epoch": 0.35, + "learning_rate": 9.464421383419988e-06, + "logits/chosen": -0.8119193315505981, + "logits/rejected": -0.8904350399971008, + "logps/chosen": -67.83171844482422, + "logps/rejected": -46.568626403808594, + "loss": 3.3407, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2778260707855225, + "rewards/margins": 0.11533284187316895, + "rewards/rejected": 2.1624932289123535, + "step": 2147 + }, + { + "epoch": 0.35, + "learning_rate": 9.463829438773473e-06, + "logits/chosen": -1.1629016399383545, + "logits/rejected": -1.2073308229446411, + "logps/chosen": -53.31586456298828, + "logps/rejected": -62.2122802734375, + "loss": 0.6989, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.851248264312744, + "rewards/margins": -0.5457746982574463, + "rewards/rejected": 3.3970229625701904, + "step": 2148 + }, + { + "epoch": 0.35, + "learning_rate": 9.46323718571858e-06, + "logits/chosen": -0.48739194869995117, + "logits/rejected": -0.5401881337165833, + "logps/chosen": -19.15597152709961, + "logps/rejected": -47.02685546875, + "loss": 0.6733, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8824438452720642, + "rewards/margins": 0.5168005228042603, + "rewards/rejected": 0.36564332246780396, + "step": 2149 + }, + { + "epoch": 0.35, + "learning_rate": 9.462644624296229e-06, + "logits/chosen": -0.8200188279151917, + "logits/rejected": -0.8179506063461304, + "logps/chosen": -3.694749355316162, + "logps/rejected": -6.640538215637207, + "loss": 2.3757, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3201240599155426, + "rewards/margins": 0.19398117065429688, + "rewards/rejected": 0.12614288926124573, + "step": 2150 + }, + { + "epoch": 0.35, + "learning_rate": 9.462051754547361e-06, + "logits/chosen": -0.9775063991546631, + "logits/rejected": -1.0129379034042358, + "logps/chosen": -101.75758361816406, + "logps/rejected": -89.33596801757812, + "loss": 1.9121, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9837127923965454, + "rewards/margins": 0.17636871337890625, + "rewards/rejected": 0.8073440790176392, + "step": 2151 + }, + { + "epoch": 0.35, + "learning_rate": 9.461458576512936e-06, + "logits/chosen": -0.7246206998825073, + "logits/rejected": -0.6916850209236145, + "logps/chosen": -74.39098358154297, + "logps/rejected": -69.83758544921875, + "loss": 0.2606, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1007965803146362, + "rewards/margins": 0.5937530994415283, + "rewards/rejected": 0.5070434808731079, + "step": 2152 + }, + { + "epoch": 0.35, + "learning_rate": 9.46086509023394e-06, + "logits/chosen": -0.9934389591217041, + "logits/rejected": -1.0216331481933594, + "logps/chosen": -52.1866455078125, + "logps/rejected": -98.98634338378906, + "loss": 0.3252, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8259568214416504, + "rewards/margins": 0.09032821655273438, + "rewards/rejected": 2.735628604888916, + "step": 2153 + }, + { + "epoch": 0.35, + "learning_rate": 9.460271295751373e-06, + "logits/chosen": -1.059643030166626, + "logits/rejected": -0.9904739856719971, + "logps/chosen": -117.61555480957031, + "logps/rejected": -63.977027893066406, + "loss": 0.2169, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.597541809082031, + "rewards/margins": 2.8242485523223877, + "rewards/rejected": 2.7732932567596436, + "step": 2154 + }, + { + "epoch": 0.35, + "learning_rate": 9.459677193106265e-06, + "logits/chosen": -1.220535159111023, + "logits/rejected": -1.2759861946105957, + "logps/chosen": -187.68255615234375, + "logps/rejected": -65.84677124023438, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.883347988128662, + "rewards/margins": 3.6970648765563965, + "rewards/rejected": 2.1862831115722656, + "step": 2155 + }, + { + "epoch": 0.35, + "learning_rate": 9.459082782339659e-06, + "logits/chosen": -1.2429667711257935, + "logits/rejected": -1.3554110527038574, + "logps/chosen": -38.817813873291016, + "logps/rejected": -145.5179443359375, + "loss": 2.4866, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5211658477783203, + "rewards/margins": -4.759829521179199, + "rewards/rejected": 6.2809953689575195, + "step": 2156 + }, + { + "epoch": 0.35, + "learning_rate": 9.458488063492626e-06, + "logits/chosen": -0.8593010306358337, + "logits/rejected": -0.7408604025840759, + "logps/chosen": -107.2984619140625, + "logps/rejected": -53.008628845214844, + "loss": 0.5385, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3777129650115967, + "rewards/margins": 0.14261531829833984, + "rewards/rejected": 2.235097646713257, + "step": 2157 + }, + { + "epoch": 0.35, + "learning_rate": 9.457893036606254e-06, + "logits/chosen": -0.6470271348953247, + "logits/rejected": -0.6480227112770081, + "logps/chosen": -26.035079956054688, + "logps/rejected": -38.099735260009766, + "loss": 1.0029, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7516910433769226, + "rewards/margins": -0.21231651306152344, + "rewards/rejected": 0.964007556438446, + "step": 2158 + }, + { + "epoch": 0.35, + "learning_rate": 9.457297701721655e-06, + "logits/chosen": -1.0774259567260742, + "logits/rejected": -0.8342824578285217, + "logps/chosen": -92.91517639160156, + "logps/rejected": -96.96167755126953, + "loss": 0.3518, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.67758321762085, + "rewards/margins": 0.2109699249267578, + "rewards/rejected": 4.466613292694092, + "step": 2159 + }, + { + "epoch": 0.35, + "learning_rate": 9.456702058879958e-06, + "logits/chosen": -1.0713212490081787, + "logits/rejected": -0.980766773223877, + "logps/chosen": -49.598609924316406, + "logps/rejected": -48.177642822265625, + "loss": 1.1665, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.787329912185669, + "rewards/margins": -1.498467206954956, + "rewards/rejected": 3.285797119140625, + "step": 2160 + }, + { + "epoch": 0.35, + "learning_rate": 9.45610610812232e-06, + "logits/chosen": -0.8169768452644348, + "logits/rejected": -0.773090660572052, + "logps/chosen": -131.91729736328125, + "logps/rejected": -51.97130584716797, + "loss": 0.892, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3359237909317017, + "rewards/margins": -0.004850029945373535, + "rewards/rejected": 1.3407738208770752, + "step": 2161 + }, + { + "epoch": 0.35, + "learning_rate": 9.455509849489915e-06, + "logits/chosen": -0.7428290247917175, + "logits/rejected": -0.7702164649963379, + "logps/chosen": -58.57891082763672, + "logps/rejected": -78.68961334228516, + "loss": 0.605, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.956182837486267, + "rewards/margins": 1.5469733476638794, + "rewards/rejected": 0.4092094600200653, + "step": 2162 + }, + { + "epoch": 0.35, + "learning_rate": 9.454913283023937e-06, + "logits/chosen": -0.7944121360778809, + "logits/rejected": -0.7643650770187378, + "logps/chosen": -75.69252014160156, + "logps/rejected": -67.0488510131836, + "loss": 0.526, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5210587978363037, + "rewards/margins": -0.5675430297851562, + "rewards/rejected": 3.08860182762146, + "step": 2163 + }, + { + "epoch": 0.35, + "learning_rate": 9.454316408765604e-06, + "logits/chosen": -1.0722204446792603, + "logits/rejected": -1.0912758111953735, + "logps/chosen": -102.53440856933594, + "logps/rejected": -65.31967163085938, + "loss": 1.1775, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2943283319473267, + "rewards/margins": -1.9246429204940796, + "rewards/rejected": 3.2189712524414062, + "step": 2164 + }, + { + "epoch": 0.35, + "learning_rate": 9.453719226756152e-06, + "logits/chosen": -1.002853512763977, + "logits/rejected": -0.935118556022644, + "logps/chosen": -107.26290893554688, + "logps/rejected": -66.41014099121094, + "loss": 0.7865, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.272885322570801, + "rewards/margins": 1.8931856155395508, + "rewards/rejected": 3.37969970703125, + "step": 2165 + }, + { + "epoch": 0.35, + "learning_rate": 9.453121737036846e-06, + "logits/chosen": -0.8066626191139221, + "logits/rejected": -0.770002007484436, + "logps/chosen": -64.98457336425781, + "logps/rejected": -76.59013366699219, + "loss": 0.3966, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4378509521484375, + "rewards/margins": 1.1388137340545654, + "rewards/rejected": 1.299037218093872, + "step": 2166 + }, + { + "epoch": 0.35, + "learning_rate": 9.452523939648964e-06, + "logits/chosen": -0.868411660194397, + "logits/rejected": -0.7924934029579163, + "logps/chosen": -101.47845458984375, + "logps/rejected": -124.99710083007812, + "loss": 0.8703, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.083221435546875, + "rewards/margins": -1.5120577812194824, + "rewards/rejected": 5.595279216766357, + "step": 2167 + }, + { + "epoch": 0.35, + "learning_rate": 9.451925834633806e-06, + "logits/chosen": -0.9510544538497925, + "logits/rejected": -0.859722912311554, + "logps/chosen": -80.48371887207031, + "logps/rejected": -15.451194763183594, + "loss": 1.7176, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6177169680595398, + "rewards/margins": 0.2829326391220093, + "rewards/rejected": 0.3347843289375305, + "step": 2168 + }, + { + "epoch": 0.35, + "learning_rate": 9.451327422032698e-06, + "logits/chosen": -0.9298685789108276, + "logits/rejected": -1.0510716438293457, + "logps/chosen": -83.32791137695312, + "logps/rejected": -118.9312515258789, + "loss": 1.9037, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.029290676116943, + "rewards/margins": -2.579122543334961, + "rewards/rejected": 6.608413219451904, + "step": 2169 + }, + { + "epoch": 0.35, + "learning_rate": 9.450728701886985e-06, + "logits/chosen": -0.5698947310447693, + "logits/rejected": -0.5721872448921204, + "logps/chosen": -50.21959686279297, + "logps/rejected": -64.1532974243164, + "loss": 1.1058, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6784042119979858, + "rewards/margins": 0.9636229872703552, + "rewards/rejected": 0.7147812247276306, + "step": 2170 + }, + { + "epoch": 0.35, + "learning_rate": 9.45012967423803e-06, + "logits/chosen": -0.6358805894851685, + "logits/rejected": -0.4224168360233307, + "logps/chosen": -69.23974609375, + "logps/rejected": -29.56885528564453, + "loss": 0.0959, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4873183965682983, + "rewards/margins": 1.5680208206176758, + "rewards/rejected": -0.08070240169763565, + "step": 2171 + }, + { + "epoch": 0.35, + "learning_rate": 9.449530339127222e-06, + "logits/chosen": -1.1727417707443237, + "logits/rejected": -1.1407486200332642, + "logps/chosen": -227.92947387695312, + "logps/rejected": -98.57789611816406, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.6569061279296875, + "rewards/margins": 3.945939540863037, + "rewards/rejected": 0.7109665274620056, + "step": 2172 + }, + { + "epoch": 0.35, + "learning_rate": 9.44893069659597e-06, + "logits/chosen": -0.7566954493522644, + "logits/rejected": -0.6941391825675964, + "logps/chosen": -59.94382858276367, + "logps/rejected": -85.0398178100586, + "loss": 1.8832, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8284549713134766, + "rewards/margins": -0.5248067378997803, + "rewards/rejected": 2.353261709213257, + "step": 2173 + }, + { + "epoch": 0.35, + "learning_rate": 9.448330746685704e-06, + "logits/chosen": -0.7475470304489136, + "logits/rejected": -0.828176736831665, + "logps/chosen": -47.417911529541016, + "logps/rejected": -132.49017333984375, + "loss": 2.2755, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.198310613632202, + "rewards/margins": -4.382303237915039, + "rewards/rejected": 6.580613613128662, + "step": 2174 + }, + { + "epoch": 0.35, + "learning_rate": 9.447730489437875e-06, + "logits/chosen": -0.8277495503425598, + "logits/rejected": -0.8346493244171143, + "logps/chosen": -70.6973876953125, + "logps/rejected": -33.12029266357422, + "loss": 0.5529, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.75774085521698, + "rewards/margins": -0.48400652408599854, + "rewards/rejected": 2.2417473793029785, + "step": 2175 + }, + { + "epoch": 0.35, + "learning_rate": 9.447129924893951e-06, + "logits/chosen": -1.2694271802902222, + "logits/rejected": -0.6908575892448425, + "logps/chosen": -76.89930725097656, + "logps/rejected": -84.86727905273438, + "loss": 0.1804, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.766580104827881, + "rewards/margins": 0.8397688865661621, + "rewards/rejected": 3.9268112182617188, + "step": 2176 + }, + { + "epoch": 0.35, + "learning_rate": 9.44652905309543e-06, + "logits/chosen": -0.8845794796943665, + "logits/rejected": -0.9897748827934265, + "logps/chosen": -46.10047149658203, + "logps/rejected": -110.33056640625, + "loss": 3.5404, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.061358690261841, + "rewards/margins": -0.4237823486328125, + "rewards/rejected": 2.4851410388946533, + "step": 2177 + }, + { + "epoch": 0.35, + "learning_rate": 9.445927874083825e-06, + "logits/chosen": -1.0069314241409302, + "logits/rejected": -0.9053717255592346, + "logps/chosen": -106.1333236694336, + "logps/rejected": -85.97305297851562, + "loss": 0.1914, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.025147438049316, + "rewards/margins": 2.356919288635254, + "rewards/rejected": 3.6682281494140625, + "step": 2178 + }, + { + "epoch": 0.35, + "learning_rate": 9.445326387900671e-06, + "logits/chosen": -0.5856302976608276, + "logits/rejected": -0.648807168006897, + "logps/chosen": -74.40310668945312, + "logps/rejected": -93.93702697753906, + "loss": 0.6711, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.08243727684021, + "rewards/margins": -1.034639596939087, + "rewards/rejected": 3.117076873779297, + "step": 2179 + }, + { + "epoch": 0.35, + "learning_rate": 9.444724594587524e-06, + "logits/chosen": -0.8520156741142273, + "logits/rejected": -0.8650653958320618, + "logps/chosen": -54.34771728515625, + "logps/rejected": -95.19017028808594, + "loss": 0.3113, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5862705111503601, + "rewards/margins": 0.2777377963066101, + "rewards/rejected": 0.30853271484375, + "step": 2180 + }, + { + "epoch": 0.35, + "learning_rate": 9.444122494185967e-06, + "logits/chosen": -0.40199610590934753, + "logits/rejected": -0.45626312494277954, + "logps/chosen": -93.38735961914062, + "logps/rejected": -65.21602630615234, + "loss": 1.3509, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5152077078819275, + "rewards/margins": -1.5917656421661377, + "rewards/rejected": 2.10697340965271, + "step": 2181 + }, + { + "epoch": 0.35, + "learning_rate": 9.443520086737593e-06, + "logits/chosen": -0.8123083114624023, + "logits/rejected": -0.8123083114624023, + "logps/chosen": -69.3878402709961, + "logps/rejected": -69.3878402709961, + "loss": 0.4968, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4546585083007812, + "rewards/margins": 0.0, + "rewards/rejected": 3.4546585083007812, + "step": 2182 + }, + { + "epoch": 0.35, + "learning_rate": 9.44291737228403e-06, + "logits/chosen": -1.132949709892273, + "logits/rejected": -0.7575498223304749, + "logps/chosen": -82.88688659667969, + "logps/rejected": -53.81938171386719, + "loss": 0.505, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.14652419090271, + "rewards/margins": -0.5530645847320557, + "rewards/rejected": 2.6995887756347656, + "step": 2183 + }, + { + "epoch": 0.35, + "learning_rate": 9.442314350866913e-06, + "logits/chosen": -0.9233478903770447, + "logits/rejected": -0.7026780247688293, + "logps/chosen": -69.33499145507812, + "logps/rejected": -28.662330627441406, + "loss": 0.1323, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0412216186523438, + "rewards/margins": 1.5859336853027344, + "rewards/rejected": 0.4552879333496094, + "step": 2184 + }, + { + "epoch": 0.35, + "learning_rate": 9.441711022527908e-06, + "logits/chosen": -0.9590403437614441, + "logits/rejected": -0.8775205016136169, + "logps/chosen": -39.741065979003906, + "logps/rejected": -65.5565185546875, + "loss": 0.5458, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3168067932128906, + "rewards/margins": -0.23782587051391602, + "rewards/rejected": 2.5546326637268066, + "step": 2185 + }, + { + "epoch": 0.35, + "learning_rate": 9.441107387308701e-06, + "logits/chosen": -0.9908920526504517, + "logits/rejected": -0.8930507898330688, + "logps/chosen": -126.80735778808594, + "logps/rejected": -40.005210876464844, + "loss": 0.6249, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7334014773368835, + "rewards/margins": 0.21711421012878418, + "rewards/rejected": 0.5162872672080994, + "step": 2186 + }, + { + "epoch": 0.35, + "learning_rate": 9.440503445250996e-06, + "logits/chosen": -1.1252212524414062, + "logits/rejected": -1.118014931678772, + "logps/chosen": -117.87110900878906, + "logps/rejected": -94.92481231689453, + "loss": 0.1903, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1043381690979004, + "rewards/margins": 0.8121346235275269, + "rewards/rejected": 1.2922035455703735, + "step": 2187 + }, + { + "epoch": 0.36, + "learning_rate": 9.439899196396517e-06, + "logits/chosen": -0.6676364541053772, + "logits/rejected": -0.7961335778236389, + "logps/chosen": -206.47900390625, + "logps/rejected": -132.13314819335938, + "loss": 0.3089, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.440054416656494, + "rewards/margins": 0.6809356212615967, + "rewards/rejected": 3.7591187953948975, + "step": 2188 + }, + { + "epoch": 0.36, + "learning_rate": 9.439294640787014e-06, + "logits/chosen": -1.0069985389709473, + "logits/rejected": -0.943068265914917, + "logps/chosen": -128.4293212890625, + "logps/rejected": -96.61126708984375, + "loss": 0.0268, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.608813762664795, + "rewards/margins": 3.0247926712036133, + "rewards/rejected": 3.5840210914611816, + "step": 2189 + }, + { + "epoch": 0.36, + "learning_rate": 9.438689778464258e-06, + "logits/chosen": -1.002681851387024, + "logits/rejected": -0.9755226969718933, + "logps/chosen": -49.42373275756836, + "logps/rejected": -81.134765625, + "loss": 0.7931, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.515820026397705, + "rewards/margins": -1.3533275127410889, + "rewards/rejected": 3.869147539138794, + "step": 2190 + }, + { + "epoch": 0.36, + "learning_rate": 9.438084609470038e-06, + "logits/chosen": -1.0608960390090942, + "logits/rejected": -1.198625922203064, + "logps/chosen": -39.316986083984375, + "logps/rejected": -149.5712890625, + "loss": 4.1152, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2921829223632812, + "rewards/margins": -6.391427993774414, + "rewards/rejected": 8.683610916137695, + "step": 2191 + }, + { + "epoch": 0.36, + "learning_rate": 9.437479133846164e-06, + "logits/chosen": -0.9030137062072754, + "logits/rejected": -0.931098222732544, + "logps/chosen": -111.18769073486328, + "logps/rejected": -119.49449157714844, + "loss": 0.4237, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8961998224258423, + "rewards/margins": 0.1015923023223877, + "rewards/rejected": 1.7946075201034546, + "step": 2192 + }, + { + "epoch": 0.36, + "learning_rate": 9.43687335163447e-06, + "logits/chosen": -1.034458875656128, + "logits/rejected": -0.9793813824653625, + "logps/chosen": -196.09584045410156, + "logps/rejected": -65.18096160888672, + "loss": 0.1362, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.8203444480896, + "rewards/margins": 1.3817877769470215, + "rewards/rejected": 4.438556671142578, + "step": 2193 + }, + { + "epoch": 0.36, + "learning_rate": 9.436267262876808e-06, + "logits/chosen": -1.112034797668457, + "logits/rejected": -1.1181766986846924, + "logps/chosen": -69.55831146240234, + "logps/rejected": -171.57025146484375, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.6096014976501465, + "rewards/margins": 3.645949363708496, + "rewards/rejected": 1.9636520147323608, + "step": 2194 + }, + { + "epoch": 0.36, + "learning_rate": 9.435660867615057e-06, + "logits/chosen": -1.1414471864700317, + "logits/rejected": -1.1487714052200317, + "logps/chosen": -98.72578430175781, + "logps/rejected": -89.76077270507812, + "loss": 0.8786, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7424087524414062, + "rewards/margins": -1.5653274059295654, + "rewards/rejected": 3.3077361583709717, + "step": 2195 + }, + { + "epoch": 0.36, + "learning_rate": 9.43505416589111e-06, + "logits/chosen": -0.9497972130775452, + "logits/rejected": -0.9785033464431763, + "logps/chosen": -82.94319152832031, + "logps/rejected": -59.662574768066406, + "loss": 0.2562, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5351104736328125, + "rewards/margins": 0.47482144832611084, + "rewards/rejected": 1.0602890253067017, + "step": 2196 + }, + { + "epoch": 0.36, + "learning_rate": 9.434447157746884e-06, + "logits/chosen": -1.0272210836410522, + "logits/rejected": -1.1222048997879028, + "logps/chosen": -117.23381805419922, + "logps/rejected": -112.03193664550781, + "loss": 0.6295, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.3543524742126465, + "rewards/margins": -0.697028636932373, + "rewards/rejected": 6.0513811111450195, + "step": 2197 + }, + { + "epoch": 0.36, + "learning_rate": 9.433839843224319e-06, + "logits/chosen": -0.8327287435531616, + "logits/rejected": -0.8476554155349731, + "logps/chosen": -135.93685913085938, + "logps/rejected": -111.20619201660156, + "loss": 2.9499, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.549566745758057, + "rewards/margins": 1.269728183746338, + "rewards/rejected": 3.2798385620117188, + "step": 2198 + }, + { + "epoch": 0.36, + "learning_rate": 9.433232222365374e-06, + "logits/chosen": -1.2336442470550537, + "logits/rejected": -1.2630447149276733, + "logps/chosen": -64.94011688232422, + "logps/rejected": -81.64537811279297, + "loss": 0.2609, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.335346221923828, + "rewards/margins": 0.7298691272735596, + "rewards/rejected": 3.6054770946502686, + "step": 2199 + }, + { + "epoch": 0.36, + "learning_rate": 9.43262429521203e-06, + "logits/chosen": -0.658717930316925, + "logits/rejected": -0.6879667043685913, + "logps/chosen": -4.516733169555664, + "logps/rejected": -36.47233581542969, + "loss": 0.6969, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.13833408057689667, + "rewards/margins": -0.12542743980884552, + "rewards/rejected": 0.2637615203857422, + "step": 2200 + }, + { + "epoch": 0.36, + "learning_rate": 9.432016061806291e-06, + "logits/chosen": -0.8959563970565796, + "logits/rejected": -0.8959563970565796, + "logps/chosen": -58.270023345947266, + "logps/rejected": -58.270023345947266, + "loss": 0.3627, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6259992122650146, + "rewards/margins": 0.0, + "rewards/rejected": 3.6259992122650146, + "step": 2201 + }, + { + "epoch": 0.36, + "learning_rate": 9.431407522190176e-06, + "logits/chosen": -0.9224650263786316, + "logits/rejected": -1.0071890354156494, + "logps/chosen": -103.1098861694336, + "logps/rejected": -74.92578887939453, + "loss": 1.1163, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9377540349960327, + "rewards/margins": -0.9591003656387329, + "rewards/rejected": 2.8968544006347656, + "step": 2202 + }, + { + "epoch": 0.36, + "learning_rate": 9.430798676405733e-06, + "logits/chosen": -0.736940860748291, + "logits/rejected": -0.736940860748291, + "logps/chosen": -25.284320831298828, + "logps/rejected": -25.284320831298828, + "loss": 0.3685, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.706559419631958, + "rewards/margins": 0.0, + "rewards/rejected": 1.706559419631958, + "step": 2203 + }, + { + "epoch": 0.36, + "learning_rate": 9.430189524495023e-06, + "logits/chosen": -1.3734371662139893, + "logits/rejected": -1.4673185348510742, + "logps/chosen": -107.10514831542969, + "logps/rejected": -237.68270874023438, + "loss": 1.5006, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.319056749343872, + "rewards/margins": -0.36653590202331543, + "rewards/rejected": 2.6855926513671875, + "step": 2204 + }, + { + "epoch": 0.36, + "learning_rate": 9.429580066500139e-06, + "logits/chosen": -1.2529842853546143, + "logits/rejected": -1.2260997295379639, + "logps/chosen": -141.1661376953125, + "logps/rejected": -102.49066162109375, + "loss": 0.1824, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9199249744415283, + "rewards/margins": 1.0636764764785767, + "rewards/rejected": 0.8562484979629517, + "step": 2205 + }, + { + "epoch": 0.36, + "learning_rate": 9.428970302463185e-06, + "logits/chosen": -1.2097214460372925, + "logits/rejected": -1.1993695497512817, + "logps/chosen": -83.76466369628906, + "logps/rejected": -58.90465545654297, + "loss": 0.4041, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1165688037872314, + "rewards/margins": -0.21077346801757812, + "rewards/rejected": 2.3273422718048096, + "step": 2206 + }, + { + "epoch": 0.36, + "learning_rate": 9.42836023242629e-06, + "logits/chosen": -0.8986557722091675, + "logits/rejected": -0.9950410723686218, + "logps/chosen": -107.38931274414062, + "logps/rejected": -156.15301513671875, + "loss": 1.9153, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.207388401031494, + "rewards/margins": -3.5601987838745117, + "rewards/rejected": 6.767587184906006, + "step": 2207 + }, + { + "epoch": 0.36, + "learning_rate": 9.427749856431603e-06, + "logits/chosen": -1.291029453277588, + "logits/rejected": -1.265599250793457, + "logps/chosen": -72.96154022216797, + "logps/rejected": -85.24801635742188, + "loss": 1.1163, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.3917717933654785, + "rewards/margins": 4.083474159240723, + "rewards/rejected": 2.308297872543335, + "step": 2208 + }, + { + "epoch": 0.36, + "learning_rate": 9.427139174521298e-06, + "logits/chosen": -1.1514892578125, + "logits/rejected": -1.1115392446517944, + "logps/chosen": -66.41517639160156, + "logps/rejected": -56.35026931762695, + "loss": 0.3116, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.446061849594116, + "rewards/margins": 1.0836477279663086, + "rewards/rejected": 2.3624141216278076, + "step": 2209 + }, + { + "epoch": 0.36, + "learning_rate": 9.426528186737566e-06, + "logits/chosen": -0.9405734539031982, + "logits/rejected": -0.8911778926849365, + "logps/chosen": -141.7574920654297, + "logps/rejected": -74.53941345214844, + "loss": 0.8147, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9448394775390625, + "rewards/margins": 0.038471221923828125, + "rewards/rejected": 0.9063682556152344, + "step": 2210 + }, + { + "epoch": 0.36, + "learning_rate": 9.425916893122622e-06, + "logits/chosen": -0.9214791655540466, + "logits/rejected": -0.9939424395561218, + "logps/chosen": -120.69115447998047, + "logps/rejected": -109.55732727050781, + "loss": 4.0638, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.174170970916748, + "rewards/margins": -2.7982521057128906, + "rewards/rejected": 6.972423076629639, + "step": 2211 + }, + { + "epoch": 0.36, + "learning_rate": 9.425305293718698e-06, + "logits/chosen": -1.0660032033920288, + "logits/rejected": -1.275501012802124, + "logps/chosen": -99.44242095947266, + "logps/rejected": -36.227745056152344, + "loss": 0.4193, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7654838562011719, + "rewards/margins": 1.5231125354766846, + "rewards/rejected": 0.2423713654279709, + "step": 2212 + }, + { + "epoch": 0.36, + "learning_rate": 9.424693388568049e-06, + "logits/chosen": -0.6423946022987366, + "logits/rejected": -0.7143321633338928, + "logps/chosen": -28.11830711364746, + "logps/rejected": -71.198486328125, + "loss": 0.525, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.48378506302833557, + "rewards/margins": -0.09091737866401672, + "rewards/rejected": 0.5747024416923523, + "step": 2213 + }, + { + "epoch": 0.36, + "learning_rate": 9.424081177712955e-06, + "logits/chosen": -1.2250230312347412, + "logits/rejected": -1.2541862726211548, + "logps/chosen": -122.52716064453125, + "logps/rejected": -63.44475555419922, + "loss": 0.4249, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4303557872772217, + "rewards/margins": 0.5478087663650513, + "rewards/rejected": 1.8825470209121704, + "step": 2214 + }, + { + "epoch": 0.36, + "learning_rate": 9.423468661195714e-06, + "logits/chosen": -0.7907402515411377, + "logits/rejected": -0.7331982254981995, + "logps/chosen": -88.94219207763672, + "logps/rejected": -102.02131652832031, + "loss": 1.0516, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8647750616073608, + "rewards/margins": 0.29526281356811523, + "rewards/rejected": 1.5695122480392456, + "step": 2215 + }, + { + "epoch": 0.36, + "learning_rate": 9.422855839058641e-06, + "logits/chosen": -0.9227019548416138, + "logits/rejected": -0.9516122341156006, + "logps/chosen": -112.75665283203125, + "logps/rejected": -70.34910583496094, + "loss": 2.4703, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3520187139511108, + "rewards/margins": -4.927212715148926, + "rewards/rejected": 6.279231548309326, + "step": 2216 + }, + { + "epoch": 0.36, + "learning_rate": 9.422242711344082e-06, + "logits/chosen": -0.7480780482292175, + "logits/rejected": -0.7500141859054565, + "logps/chosen": -3.4396634101867676, + "logps/rejected": -5.761227607727051, + "loss": 0.6183, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3153392970561981, + "rewards/margins": -0.259334534406662, + "rewards/rejected": 0.5746738314628601, + "step": 2217 + }, + { + "epoch": 0.36, + "learning_rate": 9.421629278094394e-06, + "logits/chosen": -1.285465955734253, + "logits/rejected": -1.1824394464492798, + "logps/chosen": -164.0369110107422, + "logps/rejected": -89.89969635009766, + "loss": 0.0592, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.846739292144775, + "rewards/margins": 2.140620470046997, + "rewards/rejected": 3.7061188220977783, + "step": 2218 + }, + { + "epoch": 0.36, + "learning_rate": 9.42101553935196e-06, + "logits/chosen": -0.5687681436538696, + "logits/rejected": -0.5687681436538696, + "logps/chosen": -91.36800384521484, + "logps/rejected": -91.36800384521484, + "loss": 0.3526, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.226712942123413, + "rewards/margins": 0.0, + "rewards/rejected": 2.226712942123413, + "step": 2219 + }, + { + "epoch": 0.36, + "learning_rate": 9.420401495159184e-06, + "logits/chosen": -0.7274221181869507, + "logits/rejected": -0.6941364407539368, + "logps/chosen": -23.649723052978516, + "logps/rejected": -5.5697808265686035, + "loss": 0.6732, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.038114167749881744, + "rewards/margins": -0.21794557571411133, + "rewards/rejected": 0.17983141541481018, + "step": 2220 + }, + { + "epoch": 0.36, + "learning_rate": 9.419787145558492e-06, + "logits/chosen": -0.7974589467048645, + "logits/rejected": -0.7186848521232605, + "logps/chosen": -65.04193878173828, + "logps/rejected": -47.23460388183594, + "loss": 0.1184, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5496795177459717, + "rewards/margins": 1.700974941253662, + "rewards/rejected": 1.8487045764923096, + "step": 2221 + }, + { + "epoch": 0.36, + "learning_rate": 9.41917249059233e-06, + "logits/chosen": -0.8791067600250244, + "logits/rejected": -0.8453786373138428, + "logps/chosen": -92.4111099243164, + "logps/rejected": -65.7273941040039, + "loss": 1.1896, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.143331289291382, + "rewards/margins": -0.05873703956604004, + "rewards/rejected": 2.202068328857422, + "step": 2222 + }, + { + "epoch": 0.36, + "learning_rate": 9.418557530303161e-06, + "logits/chosen": -1.3708842992782593, + "logits/rejected": -1.2154271602630615, + "logps/chosen": -131.3944091796875, + "logps/rejected": -52.471885681152344, + "loss": 0.0255, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.133708477020264, + "rewards/margins": 3.0292932987213135, + "rewards/rejected": 3.10441517829895, + "step": 2223 + }, + { + "epoch": 0.36, + "learning_rate": 9.417942264733478e-06, + "logits/chosen": -0.5073054432868958, + "logits/rejected": -0.5073054432868958, + "logps/chosen": -0.4109709560871124, + "logps/rejected": -0.4109709560871124, + "loss": 0.3674, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.11138942092657089, + "rewards/margins": 0.0, + "rewards/rejected": 0.11138942092657089, + "step": 2224 + }, + { + "epoch": 0.36, + "learning_rate": 9.417326693925784e-06, + "logits/chosen": -0.5637174844741821, + "logits/rejected": -0.5583112239837646, + "logps/chosen": -19.5423526763916, + "logps/rejected": -36.47476577758789, + "loss": 0.4121, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.44477176666259766, + "rewards/margins": -0.04258403182029724, + "rewards/rejected": 0.4873557984828949, + "step": 2225 + }, + { + "epoch": 0.36, + "learning_rate": 9.416710817922615e-06, + "logits/chosen": -0.903928279876709, + "logits/rejected": -0.8980729579925537, + "logps/chosen": -17.471899032592773, + "logps/rejected": -20.88221549987793, + "loss": 1.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46639251708984375, + "rewards/margins": 0.3154538869857788, + "rewards/rejected": 0.15093861520290375, + "step": 2226 + }, + { + "epoch": 0.36, + "learning_rate": 9.41609463676652e-06, + "logits/chosen": -0.6727031469345093, + "logits/rejected": -0.662284255027771, + "logps/chosen": -97.39990234375, + "logps/rejected": -86.26074981689453, + "loss": 0.4987, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6961174011230469, + "rewards/margins": -0.3744537830352783, + "rewards/rejected": 1.0705711841583252, + "step": 2227 + }, + { + "epoch": 0.36, + "learning_rate": 9.41547815050007e-06, + "logits/chosen": -1.3564902544021606, + "logits/rejected": -1.2590916156768799, + "logps/chosen": -131.88063049316406, + "logps/rejected": -66.10615539550781, + "loss": 0.0338, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.447157382965088, + "rewards/margins": 3.057507276535034, + "rewards/rejected": 2.3896501064300537, + "step": 2228 + }, + { + "epoch": 0.36, + "learning_rate": 9.41486135916586e-06, + "logits/chosen": -0.9839614033699036, + "logits/rejected": -0.914695143699646, + "logps/chosen": -57.574806213378906, + "logps/rejected": -41.54015350341797, + "loss": 0.1152, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.738455295562744, + "rewards/margins": 1.3595036268234253, + "rewards/rejected": 1.3789516687393188, + "step": 2229 + }, + { + "epoch": 0.36, + "learning_rate": 9.414244262806503e-06, + "logits/chosen": -1.2479681968688965, + "logits/rejected": -1.1511863470077515, + "logps/chosen": -77.9973373413086, + "logps/rejected": -34.65876770019531, + "loss": 1.2848, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8485832214355469, + "rewards/margins": -1.2185180187225342, + "rewards/rejected": 2.067101240158081, + "step": 2230 + }, + { + "epoch": 0.36, + "learning_rate": 9.413626861464636e-06, + "logits/chosen": -1.0947498083114624, + "logits/rejected": -0.9536238312721252, + "logps/chosen": -101.82723236083984, + "logps/rejected": -24.38425064086914, + "loss": 0.0472, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.981143951416016, + "rewards/margins": 4.389275550842285, + "rewards/rejected": 1.591868281364441, + "step": 2231 + }, + { + "epoch": 0.36, + "learning_rate": 9.413009155182915e-06, + "logits/chosen": -1.1314847469329834, + "logits/rejected": -1.0566328763961792, + "logps/chosen": -116.27699279785156, + "logps/rejected": -82.70062255859375, + "loss": 0.0518, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.810240268707275, + "rewards/margins": 2.2230660915374756, + "rewards/rejected": 3.5871741771698, + "step": 2232 + }, + { + "epoch": 0.36, + "learning_rate": 9.412391144004019e-06, + "logits/chosen": -1.363826036453247, + "logits/rejected": -1.223152995109558, + "logps/chosen": -108.06669616699219, + "logps/rejected": -105.0293197631836, + "loss": 0.6255, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.836174011230469, + "rewards/margins": -0.7527246475219727, + "rewards/rejected": 7.588898658752441, + "step": 2233 + }, + { + "epoch": 0.36, + "learning_rate": 9.411772827970642e-06, + "logits/chosen": -0.9989888072013855, + "logits/rejected": -1.0445518493652344, + "logps/chosen": -31.278005599975586, + "logps/rejected": -115.53428649902344, + "loss": 1.1415, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5214728116989136, + "rewards/margins": -0.3095865249633789, + "rewards/rejected": 1.8310593366622925, + "step": 2234 + }, + { + "epoch": 0.36, + "learning_rate": 9.41115420712551e-06, + "logits/chosen": -0.9294377565383911, + "logits/rejected": -0.8305491209030151, + "logps/chosen": -102.60436248779297, + "logps/rejected": -81.67871856689453, + "loss": 0.2338, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7277657985687256, + "rewards/margins": 0.5183274745941162, + "rewards/rejected": 2.2094383239746094, + "step": 2235 + }, + { + "epoch": 0.36, + "learning_rate": 9.41053528151136e-06, + "logits/chosen": -0.7503229975700378, + "logits/rejected": -0.6500683426856995, + "logps/chosen": -41.52357482910156, + "logps/rejected": -32.20698547363281, + "loss": 0.7803, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.510266065597534, + "rewards/margins": 0.6871428489685059, + "rewards/rejected": 1.8231232166290283, + "step": 2236 + }, + { + "epoch": 0.36, + "learning_rate": 9.409916051170956e-06, + "logits/chosen": -0.7584630250930786, + "logits/rejected": -0.7656531929969788, + "logps/chosen": -53.73738098144531, + "logps/rejected": -68.53043365478516, + "loss": 0.6392, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9277541637420654, + "rewards/margins": -0.07196426391601562, + "rewards/rejected": 2.999718427658081, + "step": 2237 + }, + { + "epoch": 0.36, + "learning_rate": 9.409296516147079e-06, + "logits/chosen": -1.139386773109436, + "logits/rejected": -1.099539041519165, + "logps/chosen": -118.67682647705078, + "logps/rejected": -69.586181640625, + "loss": 0.5613, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.113771915435791, + "rewards/margins": -0.17464971542358398, + "rewards/rejected": 2.288421630859375, + "step": 2238 + }, + { + "epoch": 0.36, + "learning_rate": 9.408676676482533e-06, + "logits/chosen": -1.1021486520767212, + "logits/rejected": -1.0512473583221436, + "logps/chosen": -83.17520141601562, + "logps/rejected": -72.1912612915039, + "loss": 0.9255, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.384665012359619, + "rewards/margins": 0.35994505882263184, + "rewards/rejected": 2.0247199535369873, + "step": 2239 + }, + { + "epoch": 0.36, + "learning_rate": 9.408056532220144e-06, + "logits/chosen": -0.9743667840957642, + "logits/rejected": -0.9749886393547058, + "logps/chosen": -43.33507537841797, + "logps/rejected": -79.93199920654297, + "loss": 1.0176, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.354121446609497, + "rewards/margins": -1.5842719078063965, + "rewards/rejected": 3.9383933544158936, + "step": 2240 + }, + { + "epoch": 0.36, + "learning_rate": 9.40743608340276e-06, + "logits/chosen": -0.862797737121582, + "logits/rejected": -0.7565639615058899, + "logps/chosen": -81.55152893066406, + "logps/rejected": -38.58625030517578, + "loss": 0.6639, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6431976556777954, + "rewards/margins": -0.8031066656112671, + "rewards/rejected": 1.4463043212890625, + "step": 2241 + }, + { + "epoch": 0.36, + "learning_rate": 9.406815330073244e-06, + "logits/chosen": -1.524374008178711, + "logits/rejected": -1.4331640005111694, + "logps/chosen": -73.17082214355469, + "logps/rejected": -35.85435104370117, + "loss": 0.352, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.619537353515625, + "rewards/margins": 2.336699962615967, + "rewards/rejected": 0.28283730149269104, + "step": 2242 + }, + { + "epoch": 0.36, + "learning_rate": 9.406194272274489e-06, + "logits/chosen": -0.8275321125984192, + "logits/rejected": -0.8121375441551208, + "logps/chosen": -90.14409637451172, + "logps/rejected": -43.58843231201172, + "loss": 0.8784, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4957077503204346, + "rewards/margins": -0.4412921667098999, + "rewards/rejected": 1.9369999170303345, + "step": 2243 + }, + { + "epoch": 0.36, + "learning_rate": 9.405572910049399e-06, + "logits/chosen": -0.7919104695320129, + "logits/rejected": -0.8457132577896118, + "logps/chosen": -52.920406341552734, + "logps/rejected": -79.47674560546875, + "loss": 0.6303, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5037952661514282, + "rewards/margins": -0.4868755340576172, + "rewards/rejected": 0.9906708002090454, + "step": 2244 + }, + { + "epoch": 0.36, + "learning_rate": 9.404951243440908e-06, + "logits/chosen": -1.1308321952819824, + "logits/rejected": -1.1308321952819824, + "logps/chosen": -37.634376525878906, + "logps/rejected": -37.634376525878906, + "loss": 0.5458, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5912978649139404, + "rewards/margins": 0.0, + "rewards/rejected": 3.5912978649139404, + "step": 2245 + }, + { + "epoch": 0.36, + "learning_rate": 9.404329272491966e-06, + "logits/chosen": -0.7003469467163086, + "logits/rejected": -0.6585701107978821, + "logps/chosen": -84.0146255493164, + "logps/rejected": -84.14848327636719, + "loss": 1.2715, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9329124689102173, + "rewards/margins": -0.33670127391815186, + "rewards/rejected": 2.269613742828369, + "step": 2246 + }, + { + "epoch": 0.36, + "learning_rate": 9.403706997245546e-06, + "logits/chosen": -1.491292119026184, + "logits/rejected": -1.302621603012085, + "logps/chosen": -181.05963134765625, + "logps/rejected": -90.16134643554688, + "loss": 0.167, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.777426242828369, + "rewards/margins": 0.9790787696838379, + "rewards/rejected": 4.798347473144531, + "step": 2247 + }, + { + "epoch": 0.36, + "learning_rate": 9.40308441774464e-06, + "logits/chosen": -0.6211395859718323, + "logits/rejected": -0.6271795034408569, + "logps/chosen": -8.068641662597656, + "logps/rejected": -20.243566513061523, + "loss": 0.9396, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.029820729047060013, + "rewards/margins": -0.19014865159988403, + "rewards/rejected": 0.21996937692165375, + "step": 2248 + }, + { + "epoch": 0.37, + "learning_rate": 9.402461534032264e-06, + "logits/chosen": -0.8818292617797852, + "logits/rejected": -0.8482478857040405, + "logps/chosen": -116.45606994628906, + "logps/rejected": -147.70867919921875, + "loss": 0.5643, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.645555019378662, + "rewards/margins": -0.7281055450439453, + "rewards/rejected": 7.373660564422607, + "step": 2249 + }, + { + "epoch": 0.37, + "learning_rate": 9.40183834615145e-06, + "logits/chosen": -0.9604092240333557, + "logits/rejected": -0.8093259334564209, + "logps/chosen": -91.23420715332031, + "logps/rejected": -78.89503479003906, + "loss": 1.0579, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.6658525466918945, + "rewards/margins": 4.745906352996826, + "rewards/rejected": 0.9199463129043579, + "step": 2250 + }, + { + "epoch": 0.37, + "learning_rate": 9.40121485414526e-06, + "logits/chosen": -0.6080793142318726, + "logits/rejected": -0.5043410062789917, + "logps/chosen": -54.900596618652344, + "logps/rejected": -72.69772338867188, + "loss": 0.4013, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0338821411132812, + "rewards/margins": 1.7750991582870483, + "rewards/rejected": 0.2587829530239105, + "step": 2251 + }, + { + "epoch": 0.37, + "learning_rate": 9.400591058056768e-06, + "logits/chosen": -0.8802952766418457, + "logits/rejected": -0.9866369366645813, + "logps/chosen": -68.06451416015625, + "logps/rejected": -135.8108673095703, + "loss": 1.6698, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5242021083831787, + "rewards/margins": -2.3032376766204834, + "rewards/rejected": 4.827439785003662, + "step": 2252 + }, + { + "epoch": 0.37, + "learning_rate": 9.399966957929069e-06, + "logits/chosen": -1.117071509361267, + "logits/rejected": -1.0948257446289062, + "logps/chosen": -58.824554443359375, + "logps/rejected": -82.80489349365234, + "loss": 0.8509, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8179657459259033, + "rewards/margins": -1.0885231494903564, + "rewards/rejected": 2.9064888954162598, + "step": 2253 + }, + { + "epoch": 0.37, + "learning_rate": 9.399342553805289e-06, + "logits/chosen": -1.3229199647903442, + "logits/rejected": -1.2577686309814453, + "logps/chosen": -151.00973510742188, + "logps/rejected": -37.607940673828125, + "loss": 0.0897, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.936322212219238, + "rewards/margins": 1.7735686302185059, + "rewards/rejected": 5.162753582000732, + "step": 2254 + }, + { + "epoch": 0.37, + "learning_rate": 9.398717845728566e-06, + "logits/chosen": -0.7455236911773682, + "logits/rejected": -0.7138603925704956, + "logps/chosen": -40.33133316040039, + "logps/rejected": -19.060958862304688, + "loss": 0.956, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5821636319160461, + "rewards/margins": 0.3500906229019165, + "rewards/rejected": 0.23207302391529083, + "step": 2255 + }, + { + "epoch": 0.37, + "learning_rate": 9.398092833742059e-06, + "logits/chosen": -0.9950966835021973, + "logits/rejected": -0.9509793519973755, + "logps/chosen": -47.26374435424805, + "logps/rejected": -26.371929168701172, + "loss": 0.8101, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6337158679962158, + "rewards/margins": -1.0275111198425293, + "rewards/rejected": 2.661226987838745, + "step": 2256 + }, + { + "epoch": 0.37, + "learning_rate": 9.397467517888953e-06, + "logits/chosen": -1.0101284980773926, + "logits/rejected": -1.0205581188201904, + "logps/chosen": -50.657493591308594, + "logps/rejected": -111.08624267578125, + "loss": 1.4469, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8303734064102173, + "rewards/margins": 0.44341814517974854, + "rewards/rejected": 1.3869552612304688, + "step": 2257 + }, + { + "epoch": 0.37, + "learning_rate": 9.396841898212452e-06, + "logits/chosen": -1.104319453239441, + "logits/rejected": -1.0491796731948853, + "logps/chosen": -73.2484359741211, + "logps/rejected": -59.95480728149414, + "loss": 3.2878, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7837212085723877, + "rewards/margins": 2.2154643535614014, + "rewards/rejected": 0.5682567954063416, + "step": 2258 + }, + { + "epoch": 0.37, + "learning_rate": 9.396215974755777e-06, + "logits/chosen": -1.1557282209396362, + "logits/rejected": -1.2631027698516846, + "logps/chosen": -101.21113586425781, + "logps/rejected": -94.93529510498047, + "loss": 2.8667, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.802435278892517, + "rewards/margins": -2.6204400062561035, + "rewards/rejected": 4.42287540435791, + "step": 2259 + }, + { + "epoch": 0.37, + "learning_rate": 9.395589747562179e-06, + "logits/chosen": -1.3493579626083374, + "logits/rejected": -1.058339238166809, + "logps/chosen": -102.94062805175781, + "logps/rejected": -28.6629581451416, + "loss": 0.0761, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.0041704177856445, + "rewards/margins": 2.7423737049102783, + "rewards/rejected": 2.261796712875366, + "step": 2260 + }, + { + "epoch": 0.37, + "learning_rate": 9.394963216674919e-06, + "logits/chosen": -1.2284382581710815, + "logits/rejected": -1.1627744436264038, + "logps/chosen": -77.35102844238281, + "logps/rejected": -56.21932601928711, + "loss": 0.6973, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3962295055389404, + "rewards/margins": 0.8457255363464355, + "rewards/rejected": 1.5505039691925049, + "step": 2261 + }, + { + "epoch": 0.37, + "learning_rate": 9.394336382137285e-06, + "logits/chosen": -1.1718355417251587, + "logits/rejected": -1.2740246057510376, + "logps/chosen": -63.76584243774414, + "logps/rejected": -158.35491943359375, + "loss": 2.8621, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6646274328231812, + "rewards/margins": -4.46464204788208, + "rewards/rejected": 6.129269599914551, + "step": 2262 + }, + { + "epoch": 0.37, + "learning_rate": 9.393709243992588e-06, + "logits/chosen": -0.9968715310096741, + "logits/rejected": -1.0172394514083862, + "logps/chosen": -115.02879333496094, + "logps/rejected": -118.6015625, + "loss": 0.7215, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.799949645996094, + "rewards/margins": -1.1344680786132812, + "rewards/rejected": 5.934417724609375, + "step": 2263 + }, + { + "epoch": 0.37, + "learning_rate": 9.393081802284154e-06, + "logits/chosen": -0.8964487314224243, + "logits/rejected": -0.8964487314224243, + "logps/chosen": -37.30198287963867, + "logps/rejected": -37.30198287963867, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8558750152587891, + "rewards/margins": 0.0, + "rewards/rejected": 0.8558750152587891, + "step": 2264 + }, + { + "epoch": 0.37, + "learning_rate": 9.392454057055337e-06, + "logits/chosen": -1.1067564487457275, + "logits/rejected": -1.2300684452056885, + "logps/chosen": -67.77729797363281, + "logps/rejected": -130.64889526367188, + "loss": 2.4745, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.234178304672241, + "rewards/margins": -3.8526933193206787, + "rewards/rejected": 6.08687162399292, + "step": 2265 + }, + { + "epoch": 0.37, + "learning_rate": 9.391826008349507e-06, + "logits/chosen": -1.262753963470459, + "logits/rejected": -1.2133598327636719, + "logps/chosen": -105.5152587890625, + "logps/rejected": -135.60862731933594, + "loss": 0.192, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.75441312789917, + "rewards/margins": 0.903569221496582, + "rewards/rejected": 5.850843906402588, + "step": 2266 + }, + { + "epoch": 0.37, + "learning_rate": 9.391197656210054e-06, + "logits/chosen": -0.9048691987991333, + "logits/rejected": -0.9048691987991333, + "logps/chosen": -55.720611572265625, + "logps/rejected": -55.720611572265625, + "loss": 2.393, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.036302328109741, + "rewards/margins": 0.0, + "rewards/rejected": 2.036302328109741, + "step": 2267 + }, + { + "epoch": 0.37, + "learning_rate": 9.390569000680394e-06, + "logits/chosen": -1.2913203239440918, + "logits/rejected": -1.2882404327392578, + "logps/chosen": -40.62782287597656, + "logps/rejected": -37.716400146484375, + "loss": 0.2792, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.03389048576355, + "rewards/margins": 0.4423530101776123, + "rewards/rejected": 1.5915374755859375, + "step": 2268 + }, + { + "epoch": 0.37, + "learning_rate": 9.38994004180396e-06, + "logits/chosen": -0.8868154287338257, + "logits/rejected": -0.8226727247238159, + "logps/chosen": -79.66502380371094, + "logps/rejected": -102.07954406738281, + "loss": 1.5783, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4763290882110596, + "rewards/margins": 0.5582001209259033, + "rewards/rejected": 2.9181289672851562, + "step": 2269 + }, + { + "epoch": 0.37, + "learning_rate": 9.389310779624206e-06, + "logits/chosen": -0.9089471101760864, + "logits/rejected": -0.8532925248146057, + "logps/chosen": -88.01019287109375, + "logps/rejected": -57.15376281738281, + "loss": 1.0298, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.109259843826294, + "rewards/margins": 0.17139136791229248, + "rewards/rejected": 1.9378684759140015, + "step": 2270 + }, + { + "epoch": 0.37, + "learning_rate": 9.38868121418461e-06, + "logits/chosen": -0.7260327339172363, + "logits/rejected": -0.7260327339172363, + "logps/chosen": -66.33881378173828, + "logps/rejected": -66.33881378173828, + "loss": 0.7111, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6684143543243408, + "rewards/margins": 0.0, + "rewards/rejected": 1.6684143543243408, + "step": 2271 + }, + { + "epoch": 0.37, + "learning_rate": 9.388051345528668e-06, + "logits/chosen": -1.3030600547790527, + "logits/rejected": -1.006218671798706, + "logps/chosen": -182.70932006835938, + "logps/rejected": -38.09520721435547, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.990512371063232, + "rewards/margins": 4.834449291229248, + "rewards/rejected": 3.1560630798339844, + "step": 2272 + }, + { + "epoch": 0.37, + "learning_rate": 9.387421173699898e-06, + "logits/chosen": -1.163474440574646, + "logits/rejected": -1.3028051853179932, + "logps/chosen": -89.95344543457031, + "logps/rejected": -135.06101989746094, + "loss": 0.6816, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.198406934738159, + "rewards/margins": -0.8527207374572754, + "rewards/rejected": 3.0511276721954346, + "step": 2273 + }, + { + "epoch": 0.37, + "learning_rate": 9.38679069874184e-06, + "logits/chosen": -0.8939145803451538, + "logits/rejected": -0.7310494184494019, + "logps/chosen": -81.64785766601562, + "logps/rejected": -91.09446716308594, + "loss": 1.3364, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3373429775238037, + "rewards/margins": -0.3641471862792969, + "rewards/rejected": 2.7014901638031006, + "step": 2274 + }, + { + "epoch": 0.37, + "learning_rate": 9.386159920698052e-06, + "logits/chosen": -0.8406442403793335, + "logits/rejected": -0.9121870994567871, + "logps/chosen": -26.463932037353516, + "logps/rejected": -52.10335159301758, + "loss": 0.4013, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.223102331161499, + "rewards/margins": -0.1993110179901123, + "rewards/rejected": 2.4224133491516113, + "step": 2275 + }, + { + "epoch": 0.37, + "learning_rate": 9.385528839612115e-06, + "logits/chosen": -1.1389578580856323, + "logits/rejected": -1.0846596956253052, + "logps/chosen": -52.19486999511719, + "logps/rejected": -12.95544719696045, + "loss": 0.1593, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6970771551132202, + "rewards/margins": 1.214532494544983, + "rewards/rejected": 0.4825446307659149, + "step": 2276 + }, + { + "epoch": 0.37, + "learning_rate": 9.384897455527633e-06, + "logits/chosen": -0.906837522983551, + "logits/rejected": -0.8935155272483826, + "logps/chosen": -77.41957092285156, + "logps/rejected": -114.08332824707031, + "loss": 0.7913, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0118820667266846, + "rewards/margins": -0.07228398323059082, + "rewards/rejected": 2.0841660499572754, + "step": 2277 + }, + { + "epoch": 0.37, + "learning_rate": 9.384265768488226e-06, + "logits/chosen": -1.137958288192749, + "logits/rejected": -1.1619479656219482, + "logps/chosen": -78.0108642578125, + "logps/rejected": -89.6146011352539, + "loss": 0.6342, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.56585693359375, + "rewards/margins": 0.9163093566894531, + "rewards/rejected": 1.6495475769042969, + "step": 2278 + }, + { + "epoch": 0.37, + "learning_rate": 9.38363377853754e-06, + "logits/chosen": -1.127249836921692, + "logits/rejected": -0.8771397471427917, + "logps/chosen": -149.695068359375, + "logps/rejected": -74.31600952148438, + "loss": 0.0415, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.680215358734131, + "rewards/margins": 2.5839216709136963, + "rewards/rejected": 3.0962936878204346, + "step": 2279 + }, + { + "epoch": 0.37, + "learning_rate": 9.383001485719237e-06, + "logits/chosen": -0.5302821397781372, + "logits/rejected": -0.5302821397781372, + "logps/chosen": -18.937118530273438, + "logps/rejected": -18.937118530273438, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3431219160556793, + "rewards/margins": 0.0, + "rewards/rejected": 0.3431219160556793, + "step": 2280 + }, + { + "epoch": 0.37, + "learning_rate": 9.382368890077004e-06, + "logits/chosen": -1.109968662261963, + "logits/rejected": -1.0439826250076294, + "logps/chosen": -79.97994995117188, + "logps/rejected": -37.823265075683594, + "loss": 1.2824, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.006727695465088, + "rewards/margins": 1.5252671241760254, + "rewards/rejected": 0.4814605712890625, + "step": 2281 + }, + { + "epoch": 0.37, + "learning_rate": 9.381735991654547e-06, + "logits/chosen": -0.8323072195053101, + "logits/rejected": -0.8323072195053101, + "logps/chosen": -16.5610294342041, + "logps/rejected": -16.5610294342041, + "loss": 0.5639, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2617105543613434, + "rewards/margins": 0.0, + "rewards/rejected": 0.2617105543613434, + "step": 2282 + }, + { + "epoch": 0.37, + "learning_rate": 9.381102790495593e-06, + "logits/chosen": -1.1047172546386719, + "logits/rejected": -0.8771855235099792, + "logps/chosen": -87.58617401123047, + "logps/rejected": -36.83208465576172, + "loss": 0.0777, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.377347469329834, + "rewards/margins": 4.562836170196533, + "rewards/rejected": -0.1854885071516037, + "step": 2283 + }, + { + "epoch": 0.37, + "learning_rate": 9.380469286643892e-06, + "logits/chosen": -1.0883804559707642, + "logits/rejected": -1.1701552867889404, + "logps/chosen": -63.24445343017578, + "logps/rejected": -79.14677429199219, + "loss": 1.8261, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.116558074951172, + "rewards/margins": -3.5711207389831543, + "rewards/rejected": 5.687678813934326, + "step": 2284 + }, + { + "epoch": 0.37, + "learning_rate": 9.37983548014321e-06, + "logits/chosen": -0.9498952627182007, + "logits/rejected": -0.9858366250991821, + "logps/chosen": -64.25550842285156, + "logps/rejected": -92.88378143310547, + "loss": 0.8701, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.508869171142578, + "rewards/margins": -1.5047564506530762, + "rewards/rejected": 6.013625621795654, + "step": 2285 + }, + { + "epoch": 0.37, + "learning_rate": 9.37920137103734e-06, + "logits/chosen": -1.1848759651184082, + "logits/rejected": -1.2532297372817993, + "logps/chosen": -207.66966247558594, + "logps/rejected": -178.1842041015625, + "loss": 0.1623, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8715837001800537, + "rewards/margins": 1.0787155628204346, + "rewards/rejected": 2.792868137359619, + "step": 2286 + }, + { + "epoch": 0.37, + "learning_rate": 9.37856695937009e-06, + "logits/chosen": -1.3065412044525146, + "logits/rejected": -1.324179768562317, + "logps/chosen": -75.74547576904297, + "logps/rejected": -151.65374755859375, + "loss": 0.429, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.545180797576904, + "rewards/margins": 0.11568260192871094, + "rewards/rejected": 6.429498195648193, + "step": 2287 + }, + { + "epoch": 0.37, + "learning_rate": 9.377932245185296e-06, + "logits/chosen": -1.0136069059371948, + "logits/rejected": -0.9364776015281677, + "logps/chosen": -46.73750305175781, + "logps/rejected": -55.068695068359375, + "loss": 0.3734, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5326470136642456, + "rewards/margins": -0.07627332210540771, + "rewards/rejected": 1.6089203357696533, + "step": 2288 + }, + { + "epoch": 0.37, + "learning_rate": 9.377297228526807e-06, + "logits/chosen": -0.81978839635849, + "logits/rejected": -0.8171266317367554, + "logps/chosen": -89.30668640136719, + "logps/rejected": -106.3252182006836, + "loss": 0.1718, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.407703399658203, + "rewards/margins": 0.9182921648025513, + "rewards/rejected": 1.4894112348556519, + "step": 2289 + }, + { + "epoch": 0.37, + "learning_rate": 9.376661909438496e-06, + "logits/chosen": -0.6548418998718262, + "logits/rejected": -0.5630453824996948, + "logps/chosen": -46.643943786621094, + "logps/rejected": -19.88430404663086, + "loss": 0.1503, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6836098432540894, + "rewards/margins": 1.5624516010284424, + "rewards/rejected": 0.12115821987390518, + "step": 2290 + }, + { + "epoch": 0.37, + "learning_rate": 9.376026287964262e-06, + "logits/chosen": -0.9076887965202332, + "logits/rejected": -0.6480115056037903, + "logps/chosen": -151.40887451171875, + "logps/rejected": -79.60079956054688, + "loss": 0.0979, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.169293403625488, + "rewards/margins": 1.5366318225860596, + "rewards/rejected": 2.6326615810394287, + "step": 2291 + }, + { + "epoch": 0.37, + "learning_rate": 9.375390364148017e-06, + "logits/chosen": -0.8693550229072571, + "logits/rejected": -0.9673246145248413, + "logps/chosen": -78.30497741699219, + "logps/rejected": -104.53599548339844, + "loss": 1.269, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3952378034591675, + "rewards/margins": -1.5640884637832642, + "rewards/rejected": 2.9593262672424316, + "step": 2292 + }, + { + "epoch": 0.37, + "learning_rate": 9.374754138033697e-06, + "logits/chosen": -0.8963645696640015, + "logits/rejected": -0.892049252986908, + "logps/chosen": -57.26251220703125, + "logps/rejected": -109.46519470214844, + "loss": 0.8038, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6306908130645752, + "rewards/margins": 0.22698438167572021, + "rewards/rejected": 1.403706431388855, + "step": 2293 + }, + { + "epoch": 0.37, + "learning_rate": 9.374117609665263e-06, + "logits/chosen": -1.0740913152694702, + "logits/rejected": -1.0790557861328125, + "logps/chosen": -40.58860778808594, + "logps/rejected": -51.67510986328125, + "loss": 0.3204, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9227523803710938, + "rewards/margins": 0.8344786167144775, + "rewards/rejected": 2.088273763656616, + "step": 2294 + }, + { + "epoch": 0.37, + "learning_rate": 9.373480779086688e-06, + "logits/chosen": -0.8506032228469849, + "logits/rejected": -0.8844373226165771, + "logps/chosen": -94.12295532226562, + "logps/rejected": -80.19114685058594, + "loss": 0.6286, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9147980213165283, + "rewards/margins": -0.708681583404541, + "rewards/rejected": 2.6234796047210693, + "step": 2295 + }, + { + "epoch": 0.37, + "learning_rate": 9.372843646341974e-06, + "logits/chosen": -0.9571319222450256, + "logits/rejected": -1.005557894706726, + "logps/chosen": -59.742210388183594, + "logps/rejected": -73.392578125, + "loss": 0.9933, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8050247430801392, + "rewards/margins": -1.3885499238967896, + "rewards/rejected": 3.1935746669769287, + "step": 2296 + }, + { + "epoch": 0.37, + "learning_rate": 9.372206211475141e-06, + "logits/chosen": -1.217199683189392, + "logits/rejected": -1.0306202173233032, + "logps/chosen": -105.9482421875, + "logps/rejected": -164.665771484375, + "loss": 1.7428, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6146652698516846, + "rewards/margins": -2.9690749645233154, + "rewards/rejected": 4.583740234375, + "step": 2297 + }, + { + "epoch": 0.37, + "learning_rate": 9.371568474530228e-06, + "logits/chosen": -0.7692803740501404, + "logits/rejected": -0.7801226377487183, + "logps/chosen": -32.99416732788086, + "logps/rejected": -105.65510559082031, + "loss": 1.9855, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.36914101243019104, + "rewards/margins": -0.5987949371337891, + "rewards/rejected": 0.9679359793663025, + "step": 2298 + }, + { + "epoch": 0.37, + "learning_rate": 9.370930435551298e-06, + "logits/chosen": -0.9435452222824097, + "logits/rejected": -0.9456911087036133, + "logps/chosen": -5.3495774269104, + "logps/rejected": -16.8029842376709, + "loss": 0.6779, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32142481207847595, + "rewards/margins": 0.06197082996368408, + "rewards/rejected": 0.25945398211479187, + "step": 2299 + }, + { + "epoch": 0.37, + "learning_rate": 9.370292094582434e-06, + "logits/chosen": -1.0313161611557007, + "logits/rejected": -0.8402178287506104, + "logps/chosen": -72.34506225585938, + "logps/rejected": -9.3731689453125, + "loss": 0.2996, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1417083740234375, + "rewards/margins": 1.574008822441101, + "rewards/rejected": 0.5676995515823364, + "step": 2300 + }, + { + "epoch": 0.37, + "learning_rate": 9.369653451667738e-06, + "logits/chosen": -1.1156647205352783, + "logits/rejected": -1.2641569375991821, + "logps/chosen": -108.61195373535156, + "logps/rejected": -36.46814727783203, + "loss": 0.1745, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2284355163574219, + "rewards/margins": 1.0484611988067627, + "rewards/rejected": 0.17997436225414276, + "step": 2301 + }, + { + "epoch": 0.37, + "learning_rate": 9.369014506851334e-06, + "logits/chosen": -0.7845228910446167, + "logits/rejected": -0.7260895371437073, + "logps/chosen": -42.594696044921875, + "logps/rejected": -45.85684585571289, + "loss": 0.4431, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6883087158203125, + "rewards/margins": 0.5776386260986328, + "rewards/rejected": 1.1106700897216797, + "step": 2302 + }, + { + "epoch": 0.37, + "learning_rate": 9.368375260177367e-06, + "logits/chosen": -1.2745506763458252, + "logits/rejected": -1.3061407804489136, + "logps/chosen": -133.68716430664062, + "logps/rejected": -87.13351440429688, + "loss": 0.6878, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1474685668945312, + "rewards/margins": 0.02308797836303711, + "rewards/rejected": 2.124380588531494, + "step": 2303 + }, + { + "epoch": 0.37, + "learning_rate": 9.367735711690005e-06, + "logits/chosen": -1.017656922340393, + "logits/rejected": -1.20731782913208, + "logps/chosen": -53.976539611816406, + "logps/rejected": -139.65399169921875, + "loss": 2.1204, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0701920986175537, + "rewards/margins": -3.88901686668396, + "rewards/rejected": 5.959208965301514, + "step": 2304 + }, + { + "epoch": 0.37, + "learning_rate": 9.367095861433432e-06, + "logits/chosen": -1.0671749114990234, + "logits/rejected": -0.9378573298454285, + "logps/chosen": -51.20838928222656, + "logps/rejected": -54.6323127746582, + "loss": 0.3569, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1447646617889404, + "rewards/margins": 0.21110248565673828, + "rewards/rejected": 2.933662176132202, + "step": 2305 + }, + { + "epoch": 0.37, + "learning_rate": 9.366455709451857e-06, + "logits/chosen": -1.3201086521148682, + "logits/rejected": -1.3455740213394165, + "logps/chosen": -101.34672546386719, + "logps/rejected": -70.04022216796875, + "loss": 0.9104, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4220352172851562, + "rewards/margins": -1.519416093826294, + "rewards/rejected": 3.94145131111145, + "step": 2306 + }, + { + "epoch": 0.37, + "learning_rate": 9.365815255789507e-06, + "logits/chosen": -1.1272417306900024, + "logits/rejected": -1.061032772064209, + "logps/chosen": -72.32408142089844, + "logps/rejected": -48.330806732177734, + "loss": 0.8504, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.220300316810608, + "rewards/margins": -0.36667370796203613, + "rewards/rejected": 1.586974024772644, + "step": 2307 + }, + { + "epoch": 0.37, + "learning_rate": 9.365174500490635e-06, + "logits/chosen": -1.165306568145752, + "logits/rejected": -1.247016191482544, + "logps/chosen": -137.96298217773438, + "logps/rejected": -86.93328857421875, + "loss": 1.3898, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.057861328125, + "rewards/margins": -0.197906494140625, + "rewards/rejected": 6.255767822265625, + "step": 2308 + }, + { + "epoch": 0.37, + "learning_rate": 9.364533443599508e-06, + "logits/chosen": -1.338762879371643, + "logits/rejected": -1.1035875082015991, + "logps/chosen": -165.1817626953125, + "logps/rejected": -33.650020599365234, + "loss": 0.0184, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.1937456130981445, + "rewards/margins": 6.899297714233398, + "rewards/rejected": 0.29444772005081177, + "step": 2309 + }, + { + "epoch": 0.37, + "learning_rate": 9.363892085160418e-06, + "logits/chosen": -1.012378454208374, + "logits/rejected": -1.012378454208374, + "logps/chosen": -77.51939392089844, + "logps/rejected": -77.51939392089844, + "loss": 0.3643, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6225478649139404, + "rewards/margins": 0.0, + "rewards/rejected": 2.6225478649139404, + "step": 2310 + }, + { + "epoch": 0.38, + "learning_rate": 9.363250425217675e-06, + "logits/chosen": -1.0914196968078613, + "logits/rejected": -1.1489739418029785, + "logps/chosen": -87.88398742675781, + "logps/rejected": -48.381492614746094, + "loss": 0.5292, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7354629039764404, + "rewards/margins": -0.5101151466369629, + "rewards/rejected": 3.2455780506134033, + "step": 2311 + }, + { + "epoch": 0.38, + "learning_rate": 9.362608463815614e-06, + "logits/chosen": -0.6154536008834839, + "logits/rejected": -0.6154536008834839, + "logps/chosen": -107.94966125488281, + "logps/rejected": -107.94966125488281, + "loss": 0.844, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6848466396331787, + "rewards/margins": 0.0, + "rewards/rejected": 2.6848466396331787, + "step": 2312 + }, + { + "epoch": 0.38, + "learning_rate": 9.361966200998587e-06, + "logits/chosen": -0.8108139634132385, + "logits/rejected": -0.8108139634132385, + "logps/chosen": -60.73965835571289, + "logps/rejected": -60.73965835571289, + "loss": 0.939, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2979763746261597, + "rewards/margins": 0.0, + "rewards/rejected": 1.2979763746261597, + "step": 2313 + }, + { + "epoch": 0.38, + "learning_rate": 9.36132363681097e-06, + "logits/chosen": -0.994178831577301, + "logits/rejected": -1.0530282258987427, + "logps/chosen": -58.08511734008789, + "logps/rejected": -83.65037536621094, + "loss": 0.4687, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.59837007522583, + "rewards/margins": -0.2868831157684326, + "rewards/rejected": 2.8852531909942627, + "step": 2314 + }, + { + "epoch": 0.38, + "learning_rate": 9.360680771297155e-06, + "logits/chosen": -1.109061598777771, + "logits/rejected": -0.9556238651275635, + "logps/chosen": -108.48318481445312, + "logps/rejected": -59.5947265625, + "loss": 0.0805, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.112231492996216, + "rewards/margins": 1.8176296949386597, + "rewards/rejected": 0.29460182785987854, + "step": 2315 + }, + { + "epoch": 0.38, + "learning_rate": 9.360037604501561e-06, + "logits/chosen": -0.8784880042076111, + "logits/rejected": -0.7813712954521179, + "logps/chosen": -69.44989013671875, + "logps/rejected": -76.69600677490234, + "loss": 0.1084, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1564781665802, + "rewards/margins": 1.6994446516036987, + "rewards/rejected": 1.4570335149765015, + "step": 2316 + }, + { + "epoch": 0.38, + "learning_rate": 9.359394136468625e-06, + "logits/chosen": -1.2946794033050537, + "logits/rejected": -1.1343615055084229, + "logps/chosen": -129.94393920898438, + "logps/rejected": -55.309913635253906, + "loss": 1.0187, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.905770778656006, + "rewards/margins": 5.775656223297119, + "rewards/rejected": 0.1301143616437912, + "step": 2317 + }, + { + "epoch": 0.38, + "learning_rate": 9.358750367242802e-06, + "logits/chosen": -1.1276299953460693, + "logits/rejected": -1.1102710962295532, + "logps/chosen": -22.65729522705078, + "logps/rejected": -58.20493698120117, + "loss": 0.3315, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5729999542236328, + "rewards/margins": 0.26454615592956543, + "rewards/rejected": 1.3084537982940674, + "step": 2318 + }, + { + "epoch": 0.38, + "learning_rate": 9.35810629686857e-06, + "logits/chosen": -1.1866400241851807, + "logits/rejected": -1.0998419523239136, + "logps/chosen": -68.07572174072266, + "logps/rejected": -59.113624572753906, + "loss": 0.985, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.622617244720459, + "rewards/margins": 0.5926923751831055, + "rewards/rejected": 4.0299248695373535, + "step": 2319 + }, + { + "epoch": 0.38, + "learning_rate": 9.357461925390432e-06, + "logits/chosen": -0.6733152270317078, + "logits/rejected": -0.6678617596626282, + "logps/chosen": -4.9141926765441895, + "logps/rejected": -29.716859817504883, + "loss": 0.8447, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.42526698112487793, + "rewards/margins": -0.5354502201080322, + "rewards/rejected": 0.9607172012329102, + "step": 2320 + }, + { + "epoch": 0.38, + "learning_rate": 9.356817252852904e-06, + "logits/chosen": -0.8865653276443481, + "logits/rejected": -0.8495904803276062, + "logps/chosen": -75.10043334960938, + "logps/rejected": -84.14132690429688, + "loss": 1.0136, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1498345136642456, + "rewards/margins": -1.8664168119430542, + "rewards/rejected": 3.0162513256073, + "step": 2321 + }, + { + "epoch": 0.38, + "learning_rate": 9.356172279300528e-06, + "logits/chosen": -0.961037278175354, + "logits/rejected": -0.9731350541114807, + "logps/chosen": -7.893820285797119, + "logps/rejected": -4.23169469833374, + "loss": 0.3482, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2517930567264557, + "rewards/margins": 0.031052634119987488, + "rewards/rejected": 0.2207404226064682, + "step": 2322 + }, + { + "epoch": 0.38, + "learning_rate": 9.355527004777868e-06, + "logits/chosen": -0.4006352722644806, + "logits/rejected": -0.42970019578933716, + "logps/chosen": -12.891963005065918, + "logps/rejected": -67.77108001708984, + "loss": 1.4329, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2185807228088379, + "rewards/margins": -0.03667593002319336, + "rewards/rejected": 0.25525665283203125, + "step": 2323 + }, + { + "epoch": 0.38, + "learning_rate": 9.354881429329504e-06, + "logits/chosen": -1.0839790105819702, + "logits/rejected": -1.015230655670166, + "logps/chosen": -89.26105499267578, + "logps/rejected": -46.00847244262695, + "loss": 1.6295, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.343074083328247, + "rewards/margins": 1.1396331787109375, + "rewards/rejected": 1.2034409046173096, + "step": 2324 + }, + { + "epoch": 0.38, + "learning_rate": 9.354235553000037e-06, + "logits/chosen": -0.7079460024833679, + "logits/rejected": -0.6625537276268005, + "logps/chosen": -42.63539123535156, + "logps/rejected": -84.21049499511719, + "loss": 1.8648, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7903140783309937, + "rewards/margins": -2.860222816467285, + "rewards/rejected": 4.650537014007568, + "step": 2325 + }, + { + "epoch": 0.38, + "learning_rate": 9.353589375834095e-06, + "logits/chosen": -0.8073450922966003, + "logits/rejected": -0.8073450922966003, + "logps/chosen": -31.941970825195312, + "logps/rejected": -31.941970825195312, + "loss": 0.46, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6672554016113281, + "rewards/margins": 0.0, + "rewards/rejected": 0.6672554016113281, + "step": 2326 + }, + { + "epoch": 0.38, + "learning_rate": 9.352942897876323e-06, + "logits/chosen": -1.0445899963378906, + "logits/rejected": -1.1722357273101807, + "logps/chosen": -65.4861831665039, + "logps/rejected": -147.4047088623047, + "loss": 2.3993, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2054375410079956, + "rewards/margins": -4.717875003814697, + "rewards/rejected": 5.923312664031982, + "step": 2327 + }, + { + "epoch": 0.38, + "learning_rate": 9.352296119171382e-06, + "logits/chosen": -0.9116876125335693, + "logits/rejected": -0.9021227359771729, + "logps/chosen": -31.822975158691406, + "logps/rejected": -39.41815185546875, + "loss": 1.0886, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.140032172203064, + "rewards/margins": -2.0540175437927246, + "rewards/rejected": 3.194049835205078, + "step": 2328 + }, + { + "epoch": 0.38, + "learning_rate": 9.351649039763963e-06, + "logits/chosen": -1.4550279378890991, + "logits/rejected": -1.5278117656707764, + "logps/chosen": -88.81535339355469, + "logps/rejected": -132.052978515625, + "loss": 2.001, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4726508855819702, + "rewards/margins": -2.9496545791625977, + "rewards/rejected": 4.422305583953857, + "step": 2329 + }, + { + "epoch": 0.38, + "learning_rate": 9.35100165969877e-06, + "logits/chosen": -1.1878467798233032, + "logits/rejected": -1.1937775611877441, + "logps/chosen": -65.7521743774414, + "logps/rejected": -72.81787109375, + "loss": 0.7926, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0938987731933594, + "rewards/margins": -0.8428847789764404, + "rewards/rejected": 2.9367835521698, + "step": 2330 + }, + { + "epoch": 0.38, + "learning_rate": 9.350353979020532e-06, + "logits/chosen": -0.3975752592086792, + "logits/rejected": -0.39905786514282227, + "logps/chosen": -2.8587281703948975, + "logps/rejected": -2.040428638458252, + "loss": 0.8747, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1445942223072052, + "rewards/margins": -0.07652513682842255, + "rewards/rejected": 0.22111935913562775, + "step": 2331 + }, + { + "epoch": 0.38, + "learning_rate": 9.349705997773997e-06, + "logits/chosen": -1.2882572412490845, + "logits/rejected": -1.2263720035552979, + "logps/chosen": -88.02357482910156, + "logps/rejected": -84.00092315673828, + "loss": 0.9262, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.575619697570801, + "rewards/margins": -0.3781728744506836, + "rewards/rejected": 5.953792572021484, + "step": 2332 + }, + { + "epoch": 0.38, + "learning_rate": 9.349057716003936e-06, + "logits/chosen": -1.1159706115722656, + "logits/rejected": -1.1199476718902588, + "logps/chosen": -57.19538879394531, + "logps/rejected": -101.9538345336914, + "loss": 1.0155, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8980331420898438, + "rewards/margins": -1.705909013748169, + "rewards/rejected": 2.6039421558380127, + "step": 2333 + }, + { + "epoch": 0.38, + "learning_rate": 9.348409133755137e-06, + "logits/chosen": -1.2186081409454346, + "logits/rejected": -1.00344717502594, + "logps/chosen": -93.64886474609375, + "logps/rejected": -73.52879333496094, + "loss": 0.4159, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.235116481781006, + "rewards/margins": 3.5459654331207275, + "rewards/rejected": 2.6891510486602783, + "step": 2334 + }, + { + "epoch": 0.38, + "learning_rate": 9.347760251072412e-06, + "logits/chosen": -0.7259886264801025, + "logits/rejected": -0.6110419034957886, + "logps/chosen": -65.69782257080078, + "logps/rejected": -9.194710731506348, + "loss": 0.1522, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6855103969573975, + "rewards/margins": 1.630799651145935, + "rewards/rejected": 1.0547107458114624, + "step": 2335 + }, + { + "epoch": 0.38, + "learning_rate": 9.347111068000594e-06, + "logits/chosen": -0.7046142816543579, + "logits/rejected": -0.7101750373840332, + "logps/chosen": -39.034629821777344, + "logps/rejected": -81.59901428222656, + "loss": 0.4973, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9566627740859985, + "rewards/margins": -0.37354886531829834, + "rewards/rejected": 2.330211639404297, + "step": 2336 + }, + { + "epoch": 0.38, + "learning_rate": 9.346461584584531e-06, + "logits/chosen": -1.2371770143508911, + "logits/rejected": -1.0541574954986572, + "logps/chosen": -104.21924591064453, + "logps/rejected": -87.07467651367188, + "loss": 0.328, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.638115882873535, + "rewards/margins": 4.156200408935547, + "rewards/rejected": 2.481915235519409, + "step": 2337 + }, + { + "epoch": 0.38, + "learning_rate": 9.3458118008691e-06, + "logits/chosen": -1.017877221107483, + "logits/rejected": -0.9858449101448059, + "logps/chosen": -25.278905868530273, + "logps/rejected": -45.630714416503906, + "loss": 1.0722, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3967008590698242, + "rewards/margins": -1.184326410293579, + "rewards/rejected": 2.5810272693634033, + "step": 2338 + }, + { + "epoch": 0.38, + "learning_rate": 9.345161716899196e-06, + "logits/chosen": -0.569315493106842, + "logits/rejected": -0.569315493106842, + "logps/chosen": -39.685142517089844, + "logps/rejected": -39.685142517089844, + "loss": 0.6034, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3224124908447266, + "rewards/margins": 0.0, + "rewards/rejected": 1.3224124908447266, + "step": 2339 + }, + { + "epoch": 0.38, + "learning_rate": 9.34451133271973e-06, + "logits/chosen": -1.2940928936004639, + "logits/rejected": -1.2084301710128784, + "logps/chosen": -40.45659637451172, + "logps/rejected": -10.701072692871094, + "loss": 0.3806, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5175797939300537, + "rewards/margins": 2.1903300285339355, + "rewards/rejected": 0.3272497355937958, + "step": 2340 + }, + { + "epoch": 0.38, + "learning_rate": 9.34386064837564e-06, + "logits/chosen": -0.6758198738098145, + "logits/rejected": -0.6822760105133057, + "logps/chosen": -37.80188751220703, + "logps/rejected": -26.920438766479492, + "loss": 1.385, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8132984042167664, + "rewards/margins": -0.29320472478866577, + "rewards/rejected": 1.1065031290054321, + "step": 2341 + }, + { + "epoch": 0.38, + "learning_rate": 9.343209663911882e-06, + "logits/chosen": -1.1855790615081787, + "logits/rejected": -1.2791550159454346, + "logps/chosen": -78.79313659667969, + "logps/rejected": -109.62968444824219, + "loss": 0.8179, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2099220752716064, + "rewards/margins": -1.2683112621307373, + "rewards/rejected": 2.4782333374023438, + "step": 2342 + }, + { + "epoch": 0.38, + "learning_rate": 9.34255837937343e-06, + "logits/chosen": -1.213197112083435, + "logits/rejected": -1.2035192251205444, + "logps/chosen": -89.27955627441406, + "logps/rejected": -67.28531646728516, + "loss": 0.2912, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.154198408126831, + "rewards/margins": 0.3668631315231323, + "rewards/rejected": 1.7873352766036987, + "step": 2343 + }, + { + "epoch": 0.38, + "learning_rate": 9.341906794805285e-06, + "logits/chosen": -1.275673270225525, + "logits/rejected": -1.3240519762039185, + "logps/chosen": -125.66669464111328, + "logps/rejected": -151.88458251953125, + "loss": 0.856, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.7938103675842285, + "rewards/margins": -0.5918188095092773, + "rewards/rejected": 6.385629177093506, + "step": 2344 + }, + { + "epoch": 0.38, + "learning_rate": 9.341254910252462e-06, + "logits/chosen": -0.5790380239486694, + "logits/rejected": -0.5975205898284912, + "logps/chosen": -95.37034606933594, + "logps/rejected": -40.21502685546875, + "loss": 1.2208, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.15476380288600922, + "rewards/margins": -2.20851469039917, + "rewards/rejected": 2.053750991821289, + "step": 2345 + }, + { + "epoch": 0.38, + "learning_rate": 9.340602725760003e-06, + "logits/chosen": -1.1367186307907104, + "logits/rejected": -1.114358901977539, + "logps/chosen": -81.38461303710938, + "logps/rejected": -67.7984390258789, + "loss": 0.6649, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.04160475730896, + "rewards/margins": 0.9479875564575195, + "rewards/rejected": 2.0936172008514404, + "step": 2346 + }, + { + "epoch": 0.38, + "learning_rate": 9.339950241372969e-06, + "logits/chosen": -0.6910792589187622, + "logits/rejected": -0.6147702932357788, + "logps/chosen": -41.33944320678711, + "logps/rejected": -37.59162139892578, + "loss": 0.1622, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.654764175415039, + "rewards/margins": 0.9968478679656982, + "rewards/rejected": 1.6579163074493408, + "step": 2347 + }, + { + "epoch": 0.38, + "learning_rate": 9.339297457136435e-06, + "logits/chosen": -1.2216991186141968, + "logits/rejected": -1.1543124914169312, + "logps/chosen": -122.8746109008789, + "logps/rejected": -112.98229217529297, + "loss": 1.041, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.619420528411865, + "rewards/margins": -1.8822665214538574, + "rewards/rejected": 6.501687049865723, + "step": 2348 + }, + { + "epoch": 0.38, + "learning_rate": 9.338644373095507e-06, + "logits/chosen": -0.9972350001335144, + "logits/rejected": -1.0271607637405396, + "logps/chosen": -89.92083740234375, + "logps/rejected": -95.65548706054688, + "loss": 1.2009, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0901634693145752, + "rewards/margins": -2.1538825035095215, + "rewards/rejected": 3.2440459728240967, + "step": 2349 + }, + { + "epoch": 0.38, + "learning_rate": 9.337990989295306e-06, + "logits/chosen": -0.9465568661689758, + "logits/rejected": -0.8827171921730042, + "logps/chosen": -129.9507598876953, + "logps/rejected": -122.91284942626953, + "loss": 0.1875, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.138153076171875, + "rewards/margins": 1.4044134616851807, + "rewards/rejected": 2.7337396144866943, + "step": 2350 + }, + { + "epoch": 0.38, + "learning_rate": 9.337337305780973e-06, + "logits/chosen": -1.2638344764709473, + "logits/rejected": -1.1975181102752686, + "logps/chosen": -89.62521362304688, + "logps/rejected": -30.08929443359375, + "loss": 0.1317, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5662567615509033, + "rewards/margins": 1.788257122039795, + "rewards/rejected": -0.22200031578540802, + "step": 2351 + }, + { + "epoch": 0.38, + "learning_rate": 9.336683322597673e-06, + "logits/chosen": -1.101871371269226, + "logits/rejected": -0.9421815872192383, + "logps/chosen": -114.68842315673828, + "logps/rejected": -61.795833587646484, + "loss": 0.3913, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.904299259185791, + "rewards/margins": 3.189924955368042, + "rewards/rejected": 2.714374303817749, + "step": 2352 + }, + { + "epoch": 0.38, + "learning_rate": 9.33602903979059e-06, + "logits/chosen": -0.9736893177032471, + "logits/rejected": -0.804232656955719, + "logps/chosen": -48.11082458496094, + "logps/rejected": -19.077224731445312, + "loss": 0.2526, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3349571228027344, + "rewards/margins": 0.9901439547538757, + "rewards/rejected": 0.34481316804885864, + "step": 2353 + }, + { + "epoch": 0.38, + "learning_rate": 9.335374457404928e-06, + "logits/chosen": -0.9460827708244324, + "logits/rejected": -0.9840667843818665, + "logps/chosen": -58.39031982421875, + "logps/rejected": -137.90811157226562, + "loss": 0.9792, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8404037952423096, + "rewards/margins": -1.7166626453399658, + "rewards/rejected": 3.5570664405822754, + "step": 2354 + }, + { + "epoch": 0.38, + "learning_rate": 9.334719575485913e-06, + "logits/chosen": -0.831308901309967, + "logits/rejected": -0.8015797138214111, + "logps/chosen": -44.96408462524414, + "logps/rejected": -17.49271011352539, + "loss": 0.1977, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8765541315078735, + "rewards/margins": 1.457018494606018, + "rewards/rejected": 0.41953563690185547, + "step": 2355 + }, + { + "epoch": 0.38, + "learning_rate": 9.33406439407879e-06, + "logits/chosen": -1.1310551166534424, + "logits/rejected": -1.1435338258743286, + "logps/chosen": -128.69296264648438, + "logps/rejected": -71.64904022216797, + "loss": 0.1193, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1632416248321533, + "rewards/margins": 1.6480194330215454, + "rewards/rejected": 1.515222191810608, + "step": 2356 + }, + { + "epoch": 0.38, + "learning_rate": 9.333408913228826e-06, + "logits/chosen": -1.2122981548309326, + "logits/rejected": -1.1794459819793701, + "logps/chosen": -73.482177734375, + "logps/rejected": -82.25241088867188, + "loss": 1.8369, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.48120880126953125, + "rewards/margins": -0.34407806396484375, + "rewards/rejected": 0.825286865234375, + "step": 2357 + }, + { + "epoch": 0.38, + "learning_rate": 9.332753132981311e-06, + "logits/chosen": -1.075556755065918, + "logits/rejected": -0.9147676229476929, + "logps/chosen": -82.33784484863281, + "logps/rejected": -124.56013488769531, + "loss": 1.2847, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.0152177810668945, + "rewards/margins": 0.5345704555511475, + "rewards/rejected": 3.480647325515747, + "step": 2358 + }, + { + "epoch": 0.38, + "learning_rate": 9.33209705338155e-06, + "logits/chosen": -1.5395022630691528, + "logits/rejected": -1.5505657196044922, + "logps/chosen": -55.02685546875, + "logps/rejected": -30.261978149414062, + "loss": 0.325, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.803850531578064, + "rewards/margins": 0.8694549202919006, + "rewards/rejected": 0.9343956112861633, + "step": 2359 + }, + { + "epoch": 0.38, + "learning_rate": 9.331440674474875e-06, + "logits/chosen": -1.0123447179794312, + "logits/rejected": -0.9857871532440186, + "logps/chosen": -81.07001495361328, + "logps/rejected": -116.94512939453125, + "loss": 0.0945, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.511353492736816, + "rewards/margins": 1.7070672512054443, + "rewards/rejected": 2.804286241531372, + "step": 2360 + }, + { + "epoch": 0.38, + "learning_rate": 9.330783996306631e-06, + "logits/chosen": -0.7856325507164001, + "logits/rejected": -0.8678767681121826, + "logps/chosen": -127.94274139404297, + "logps/rejected": -129.47305297851562, + "loss": 1.3524, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.204419732093811, + "rewards/margins": -2.5447745323181152, + "rewards/rejected": 3.749194383621216, + "step": 2361 + }, + { + "epoch": 0.38, + "learning_rate": 9.330127018922195e-06, + "logits/chosen": -0.7226228713989258, + "logits/rejected": -0.7529537677764893, + "logps/chosen": -34.30758285522461, + "logps/rejected": -59.4162483215332, + "loss": 0.48, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0554828643798828, + "rewards/margins": -0.16128003597259521, + "rewards/rejected": 1.216762900352478, + "step": 2362 + }, + { + "epoch": 0.38, + "learning_rate": 9.32946974236695e-06, + "logits/chosen": -1.273611307144165, + "logits/rejected": -0.8695481419563293, + "logps/chosen": -132.75845336914062, + "logps/rejected": -42.71427536010742, + "loss": 0.6078, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.977649211883545, + "rewards/margins": 6.617284774780273, + "rewards/rejected": 1.3603641986846924, + "step": 2363 + }, + { + "epoch": 0.38, + "learning_rate": 9.328812166686313e-06, + "logits/chosen": -1.0228580236434937, + "logits/rejected": -1.0344972610473633, + "logps/chosen": -237.44720458984375, + "logps/rejected": -143.5997772216797, + "loss": 0.0234, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.596246242523193, + "rewards/margins": 4.323237419128418, + "rewards/rejected": 2.2730088233947754, + "step": 2364 + }, + { + "epoch": 0.38, + "learning_rate": 9.328154291925717e-06, + "logits/chosen": -1.2440143823623657, + "logits/rejected": -1.2196274995803833, + "logps/chosen": -125.12408447265625, + "logps/rejected": -103.88681030273438, + "loss": 0.5657, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.73900306224823, + "rewards/margins": -0.12555992603302002, + "rewards/rejected": 1.86456298828125, + "step": 2365 + }, + { + "epoch": 0.38, + "learning_rate": 9.32749611813061e-06, + "logits/chosen": -1.0840028524398804, + "logits/rejected": -0.9528692364692688, + "logps/chosen": -69.45890808105469, + "logps/rejected": -16.17511558532715, + "loss": 0.2529, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0278762578964233, + "rewards/margins": 0.6934188604354858, + "rewards/rejected": 0.3344573974609375, + "step": 2366 + }, + { + "epoch": 0.38, + "learning_rate": 9.326837645346472e-06, + "logits/chosen": -0.8876081705093384, + "logits/rejected": -0.7689169049263, + "logps/chosen": -98.43899536132812, + "logps/rejected": -60.49134826660156, + "loss": 0.1206, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.712860107421875, + "rewards/margins": 1.3195061683654785, + "rewards/rejected": 4.3933539390563965, + "step": 2367 + }, + { + "epoch": 0.38, + "learning_rate": 9.32617887361879e-06, + "logits/chosen": -0.3628062605857849, + "logits/rejected": -0.38920918107032776, + "logps/chosen": -18.85384750366211, + "logps/rejected": -50.03159713745117, + "loss": 0.8165, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.478009432554245, + "rewards/margins": -0.9731566905975342, + "rewards/rejected": 1.4511661529541016, + "step": 2368 + }, + { + "epoch": 0.38, + "learning_rate": 9.325519802993083e-06, + "logits/chosen": -0.9162931442260742, + "logits/rejected": -0.9412392377853394, + "logps/chosen": -133.04774475097656, + "logps/rejected": -87.1668701171875, + "loss": 1.9386, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9739822745323181, + "rewards/margins": -1.2425353527069092, + "rewards/rejected": 2.216517686843872, + "step": 2369 + }, + { + "epoch": 0.38, + "learning_rate": 9.324860433514888e-06, + "logits/chosen": -0.6979153752326965, + "logits/rejected": -0.7294917702674866, + "logps/chosen": -55.879573822021484, + "logps/rejected": -56.516754150390625, + "loss": 0.6315, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9568424224853516, + "rewards/margins": -0.5157253742218018, + "rewards/rejected": 1.4725677967071533, + "step": 2370 + }, + { + "epoch": 0.38, + "learning_rate": 9.324200765229757e-06, + "logits/chosen": -0.9126970171928406, + "logits/rejected": -0.9338549375534058, + "logps/chosen": -89.90931701660156, + "logps/rejected": -80.17841339111328, + "loss": 3.0446, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7541648745536804, + "rewards/margins": -5.369165420532227, + "rewards/rejected": 6.123330116271973, + "step": 2371 + }, + { + "epoch": 0.39, + "learning_rate": 9.32354079818327e-06, + "logits/chosen": -0.7790963053703308, + "logits/rejected": -0.7355197668075562, + "logps/chosen": -28.75601577758789, + "logps/rejected": -19.977550506591797, + "loss": 0.2993, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5175525546073914, + "rewards/margins": 0.2778327763080597, + "rewards/rejected": 0.23971977829933167, + "step": 2372 + }, + { + "epoch": 0.39, + "learning_rate": 9.322880532421023e-06, + "logits/chosen": -0.8322799801826477, + "logits/rejected": -0.6746616363525391, + "logps/chosen": -48.24219512939453, + "logps/rejected": -43.36115646362305, + "loss": 0.4421, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.896874189376831, + "rewards/margins": -0.029107332229614258, + "rewards/rejected": 2.9259815216064453, + "step": 2373 + }, + { + "epoch": 0.39, + "learning_rate": 9.322219967988638e-06, + "logits/chosen": -1.2835240364074707, + "logits/rejected": -1.167894959449768, + "logps/chosen": -222.88714599609375, + "logps/rejected": -198.32366943359375, + "loss": 0.9523, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.054391384124756, + "rewards/margins": 2.40234375, + "rewards/rejected": 4.652047634124756, + "step": 2374 + }, + { + "epoch": 0.39, + "learning_rate": 9.321559104931746e-06, + "logits/chosen": -0.602736234664917, + "logits/rejected": -0.602736234664917, + "logps/chosen": -1.7202551364898682, + "logps/rejected": -1.7202551364898682, + "loss": 0.4806, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.21636846661567688, + "rewards/margins": 0.0, + "rewards/rejected": 0.21636846661567688, + "step": 2375 + }, + { + "epoch": 0.39, + "learning_rate": 9.320897943296012e-06, + "logits/chosen": -1.0038981437683105, + "logits/rejected": -0.9192039370536804, + "logps/chosen": -90.81063842773438, + "logps/rejected": -93.12633514404297, + "loss": 0.4189, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.804492473602295, + "rewards/margins": -0.22290706634521484, + "rewards/rejected": 6.02739953994751, + "step": 2376 + }, + { + "epoch": 0.39, + "learning_rate": 9.320236483127116e-06, + "logits/chosen": -1.0420827865600586, + "logits/rejected": -1.018500804901123, + "logps/chosen": -74.9976577758789, + "logps/rejected": -68.2555160522461, + "loss": 0.6655, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4560952186584473, + "rewards/margins": -0.7973372936248779, + "rewards/rejected": 3.253432512283325, + "step": 2377 + }, + { + "epoch": 0.39, + "learning_rate": 9.319574724470756e-06, + "logits/chosen": -1.2642821073532104, + "logits/rejected": -1.1223660707473755, + "logps/chosen": -78.26232147216797, + "logps/rejected": -65.10684204101562, + "loss": 0.7611, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3628555536270142, + "rewards/margins": -0.25557637214660645, + "rewards/rejected": 1.6184319257736206, + "step": 2378 + }, + { + "epoch": 0.39, + "learning_rate": 9.318912667372657e-06, + "logits/chosen": -0.9836109280586243, + "logits/rejected": -0.9742099046707153, + "logps/chosen": -116.54057312011719, + "logps/rejected": -87.19654846191406, + "loss": 0.6295, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.312420845031738, + "rewards/margins": -0.923853874206543, + "rewards/rejected": 7.236274719238281, + "step": 2379 + }, + { + "epoch": 0.39, + "learning_rate": 9.318250311878558e-06, + "logits/chosen": -0.6531223058700562, + "logits/rejected": -0.5085976123809814, + "logps/chosen": -61.14514923095703, + "logps/rejected": -32.13199234008789, + "loss": 0.4619, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8645706176757812, + "rewards/margins": 0.40979456901550293, + "rewards/rejected": 1.4547760486602783, + "step": 2380 + }, + { + "epoch": 0.39, + "learning_rate": 9.317587658034221e-06, + "logits/chosen": -1.0901998281478882, + "logits/rejected": -0.9618087410926819, + "logps/chosen": -118.89216613769531, + "logps/rejected": -51.614017486572266, + "loss": 0.202, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8044846057891846, + "rewards/margins": 1.1356608867645264, + "rewards/rejected": 0.6688236594200134, + "step": 2381 + }, + { + "epoch": 0.39, + "learning_rate": 9.316924705885431e-06, + "logits/chosen": -0.8699376583099365, + "logits/rejected": -0.7271264791488647, + "logps/chosen": -69.05966186523438, + "logps/rejected": -30.13125991821289, + "loss": 0.9085, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.168064832687378, + "rewards/margins": 2.097402572631836, + "rewards/rejected": 1.0706623792648315, + "step": 2382 + }, + { + "epoch": 0.39, + "learning_rate": 9.31626145547799e-06, + "logits/chosen": -1.0084956884384155, + "logits/rejected": -1.01432204246521, + "logps/chosen": -51.67700958251953, + "logps/rejected": -90.39059448242188, + "loss": 0.5904, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9367501735687256, + "rewards/margins": 0.046396732330322266, + "rewards/rejected": 3.8903534412384033, + "step": 2383 + }, + { + "epoch": 0.39, + "learning_rate": 9.315597906857723e-06, + "logits/chosen": -0.6567169427871704, + "logits/rejected": -0.6558994054794312, + "logps/chosen": -35.45504379272461, + "logps/rejected": -49.87736511230469, + "loss": 1.0989, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.094689965248108, + "rewards/margins": -1.8034981489181519, + "rewards/rejected": 2.8981881141662598, + "step": 2384 + }, + { + "epoch": 0.39, + "learning_rate": 9.314934060070478e-06, + "logits/chosen": -0.8979476094245911, + "logits/rejected": -0.892819344997406, + "logps/chosen": -39.23478698730469, + "logps/rejected": -63.1316032409668, + "loss": 1.1732, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6652530431747437, + "rewards/margins": -1.9423004388809204, + "rewards/rejected": 3.607553482055664, + "step": 2385 + }, + { + "epoch": 0.39, + "learning_rate": 9.314269915162115e-06, + "logits/chosen": -1.159717321395874, + "logits/rejected": -1.1402912139892578, + "logps/chosen": -106.72982788085938, + "logps/rejected": -87.01337432861328, + "loss": 0.829, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4785537719726562, + "rewards/margins": 0.1544867753982544, + "rewards/rejected": 1.3240669965744019, + "step": 2386 + }, + { + "epoch": 0.39, + "learning_rate": 9.313605472178524e-06, + "logits/chosen": -0.8605858087539673, + "logits/rejected": -0.8600437045097351, + "logps/chosen": -4.001447677612305, + "logps/rejected": -44.69684982299805, + "loss": 0.3132, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5201464891433716, + "rewards/margins": 0.4169085621833801, + "rewards/rejected": 0.10323791950941086, + "step": 2387 + }, + { + "epoch": 0.39, + "learning_rate": 9.31294073116561e-06, + "logits/chosen": -0.6031510233879089, + "logits/rejected": -0.5504189133644104, + "logps/chosen": -70.08529663085938, + "logps/rejected": -30.130268096923828, + "loss": 0.6263, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9990402460098267, + "rewards/margins": 0.3576793670654297, + "rewards/rejected": 1.641360878944397, + "step": 2388 + }, + { + "epoch": 0.39, + "learning_rate": 9.3122756921693e-06, + "logits/chosen": -0.7350512146949768, + "logits/rejected": -0.6998268365859985, + "logps/chosen": -69.03225708007812, + "logps/rejected": -53.689762115478516, + "loss": 0.2378, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.954302191734314, + "rewards/margins": 1.3192641735076904, + "rewards/rejected": 0.6350380182266235, + "step": 2389 + }, + { + "epoch": 0.39, + "learning_rate": 9.311610355235545e-06, + "logits/chosen": -1.1780339479446411, + "logits/rejected": -1.211209774017334, + "logps/chosen": -60.371646881103516, + "logps/rejected": -97.89833068847656, + "loss": 1.23, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5732227563858032, + "rewards/margins": -0.9275859594345093, + "rewards/rejected": 1.5008087158203125, + "step": 2390 + }, + { + "epoch": 0.39, + "learning_rate": 9.310944720410312e-06, + "logits/chosen": -1.0927315950393677, + "logits/rejected": -1.0633124113082886, + "logps/chosen": -66.30195617675781, + "logps/rejected": -79.61617279052734, + "loss": 0.7119, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.538106679916382, + "rewards/margins": -0.9588072299957275, + "rewards/rejected": 3.4969139099121094, + "step": 2391 + }, + { + "epoch": 0.39, + "learning_rate": 9.310278787739587e-06, + "logits/chosen": -1.0137970447540283, + "logits/rejected": -0.9823021292686462, + "logps/chosen": -25.464088439941406, + "logps/rejected": -23.681182861328125, + "loss": 0.7011, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1777496337890625, + "rewards/margins": 0.19396698474884033, + "rewards/rejected": 1.9837826490402222, + "step": 2392 + }, + { + "epoch": 0.39, + "learning_rate": 9.30961255726938e-06, + "logits/chosen": -1.1176775693893433, + "logits/rejected": -1.205418348312378, + "logps/chosen": -138.04122924804688, + "logps/rejected": -104.60444641113281, + "loss": 0.328, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1108977794647217, + "rewards/margins": 0.3580000400543213, + "rewards/rejected": 2.7528977394104004, + "step": 2393 + }, + { + "epoch": 0.39, + "learning_rate": 9.308946029045726e-06, + "logits/chosen": -1.0994864702224731, + "logits/rejected": -1.0859930515289307, + "logps/chosen": -92.01068878173828, + "logps/rejected": -182.13510131835938, + "loss": 0.2187, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.347890377044678, + "rewards/margins": 0.6540794372558594, + "rewards/rejected": 4.693810939788818, + "step": 2394 + }, + { + "epoch": 0.39, + "learning_rate": 9.308279203114673e-06, + "logits/chosen": -1.0164484977722168, + "logits/rejected": -0.8510238528251648, + "logps/chosen": -187.34854125976562, + "logps/rejected": -76.5888671875, + "loss": 0.3687, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.927639961242676, + "rewards/margins": 3.3900208473205566, + "rewards/rejected": 3.537619113922119, + "step": 2395 + }, + { + "epoch": 0.39, + "learning_rate": 9.307612079522294e-06, + "logits/chosen": -0.989890992641449, + "logits/rejected": -0.8070515394210815, + "logps/chosen": -103.65796661376953, + "logps/rejected": -75.92823791503906, + "loss": 1.1641, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.412853240966797, + "rewards/margins": 1.789846658706665, + "rewards/rejected": 2.623006582260132, + "step": 2396 + }, + { + "epoch": 0.39, + "learning_rate": 9.306944658314677e-06, + "logits/chosen": -0.8257468938827515, + "logits/rejected": -0.7890236973762512, + "logps/chosen": -45.09452819824219, + "logps/rejected": -50.82307434082031, + "loss": 0.638, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.13968825340271, + "rewards/margins": 0.897371768951416, + "rewards/rejected": 1.242316484451294, + "step": 2397 + }, + { + "epoch": 0.39, + "learning_rate": 9.306276939537938e-06, + "logits/chosen": -1.2994084358215332, + "logits/rejected": -1.2381370067596436, + "logps/chosen": -92.59205627441406, + "logps/rejected": -57.03117752075195, + "loss": 0.1548, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.759122610092163, + "rewards/margins": 1.0570416450500488, + "rewards/rejected": 1.7020809650421143, + "step": 2398 + }, + { + "epoch": 0.39, + "learning_rate": 9.305608923238207e-06, + "logits/chosen": -0.8423422574996948, + "logits/rejected": -0.8224499821662903, + "logps/chosen": -93.60871124267578, + "logps/rejected": -81.02289581298828, + "loss": 0.3297, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2737412452697754, + "rewards/margins": 1.8723855018615723, + "rewards/rejected": 0.4013557434082031, + "step": 2399 + }, + { + "epoch": 0.39, + "learning_rate": 9.304940609461641e-06, + "logits/chosen": -1.0909433364868164, + "logits/rejected": -1.129477620124817, + "logps/chosen": -84.17068481445312, + "logps/rejected": -124.66851043701172, + "loss": 0.6469, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5535621643066406, + "rewards/margins": -0.8925902843475342, + "rewards/rejected": 2.446152448654175, + "step": 2400 + }, + { + "epoch": 0.39, + "learning_rate": 9.304271998254413e-06, + "logits/chosen": -1.158496618270874, + "logits/rejected": -1.4845845699310303, + "logps/chosen": -156.02740478515625, + "logps/rejected": -103.7862548828125, + "loss": 0.1308, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.650631904602051, + "rewards/margins": 1.2453460693359375, + "rewards/rejected": 4.405285835266113, + "step": 2401 + }, + { + "epoch": 0.39, + "learning_rate": 9.303603089662717e-06, + "logits/chosen": -0.8823562264442444, + "logits/rejected": -0.833966076374054, + "logps/chosen": -93.19572448730469, + "logps/rejected": -52.34493637084961, + "loss": 0.41, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.556112766265869, + "rewards/margins": 2.0013515949249268, + "rewards/rejected": 1.5547611713409424, + "step": 2402 + }, + { + "epoch": 0.39, + "learning_rate": 9.302933883732768e-06, + "logits/chosen": -0.7131012082099915, + "logits/rejected": -0.7131012082099915, + "logps/chosen": -1.1369104385375977, + "logps/rejected": -1.1369104385375977, + "loss": 0.6244, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.23540449142456055, + "rewards/margins": 0.0, + "rewards/rejected": 0.23540449142456055, + "step": 2403 + }, + { + "epoch": 0.39, + "learning_rate": 9.302264380510802e-06, + "logits/chosen": -1.096592903137207, + "logits/rejected": -1.1579254865646362, + "logps/chosen": -53.968963623046875, + "logps/rejected": -94.694580078125, + "loss": 1.8876, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1879518032073975, + "rewards/margins": -2.7114593982696533, + "rewards/rejected": 4.899411201477051, + "step": 2404 + }, + { + "epoch": 0.39, + "learning_rate": 9.301594580043076e-06, + "logits/chosen": -0.7097282409667969, + "logits/rejected": -0.6582285165786743, + "logps/chosen": -77.3871841430664, + "logps/rejected": -39.82273483276367, + "loss": 0.2338, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.518408179283142, + "rewards/margins": 1.21889066696167, + "rewards/rejected": 0.2995174527168274, + "step": 2405 + }, + { + "epoch": 0.39, + "learning_rate": 9.300924482375866e-06, + "logits/chosen": -0.6572228074073792, + "logits/rejected": -0.6782733201980591, + "logps/chosen": -1.52825129032135, + "logps/rejected": -35.196224212646484, + "loss": 0.4302, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.35851866006851196, + "rewards/margins": -0.13273271918296814, + "rewards/rejected": 0.4912513792514801, + "step": 2406 + }, + { + "epoch": 0.39, + "learning_rate": 9.300254087555472e-06, + "logits/chosen": -0.9272451996803284, + "logits/rejected": -0.6867626905441284, + "logps/chosen": -200.5328369140625, + "logps/rejected": -20.3940486907959, + "loss": 0.0582, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4768311977386475, + "rewards/margins": 2.1316630840301514, + "rewards/rejected": 0.3451681137084961, + "step": 2407 + }, + { + "epoch": 0.39, + "learning_rate": 9.299583395628209e-06, + "logits/chosen": -0.3737289011478424, + "logits/rejected": -0.36304813623428345, + "logps/chosen": -1.0202746391296387, + "logps/rejected": -9.435254096984863, + "loss": 2.0887, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.168832927942276, + "rewards/margins": 0.2598588466644287, + "rewards/rejected": -0.0910259261727333, + "step": 2408 + }, + { + "epoch": 0.39, + "learning_rate": 9.298912406640414e-06, + "logits/chosen": -0.9288560152053833, + "logits/rejected": -0.8390205502510071, + "logps/chosen": -61.2582893371582, + "logps/rejected": -27.092330932617188, + "loss": 0.7238, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.025970220565796, + "rewards/margins": 1.192897915840149, + "rewards/rejected": 0.833072304725647, + "step": 2409 + }, + { + "epoch": 0.39, + "learning_rate": 9.298241120638451e-06, + "logits/chosen": -0.9184150695800781, + "logits/rejected": -0.9184150695800781, + "logps/chosen": -50.67353439331055, + "logps/rejected": -50.67353439331055, + "loss": 0.4418, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.914036989212036, + "rewards/margins": 0.0, + "rewards/rejected": 2.914036989212036, + "step": 2410 + }, + { + "epoch": 0.39, + "learning_rate": 9.297569537668697e-06, + "logits/chosen": -1.0824954509735107, + "logits/rejected": -1.041140079498291, + "logps/chosen": -54.690696716308594, + "logps/rejected": -60.179534912109375, + "loss": 0.1808, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5457634925842285, + "rewards/margins": 1.1674957275390625, + "rewards/rejected": 2.378267765045166, + "step": 2411 + }, + { + "epoch": 0.39, + "learning_rate": 9.296897657777551e-06, + "logits/chosen": -1.2525269985198975, + "logits/rejected": -1.2000290155410767, + "logps/chosen": -115.0027847290039, + "logps/rejected": -84.952392578125, + "loss": 0.3968, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.7624199390411377, + "rewards/margins": -0.12903070449829102, + "rewards/rejected": 3.8914506435394287, + "step": 2412 + }, + { + "epoch": 0.39, + "learning_rate": 9.296225481011436e-06, + "logits/chosen": -0.7642812728881836, + "logits/rejected": -0.7642812728881836, + "logps/chosen": -95.57636260986328, + "logps/rejected": -95.57636260986328, + "loss": 0.4967, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5525611639022827, + "rewards/margins": 0.0, + "rewards/rejected": 1.5525611639022827, + "step": 2413 + }, + { + "epoch": 0.39, + "learning_rate": 9.29555300741679e-06, + "logits/chosen": -0.6057077646255493, + "logits/rejected": -0.6105709671974182, + "logps/chosen": -13.251192092895508, + "logps/rejected": -5.087040901184082, + "loss": 0.4605, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.117389015853405, + "rewards/margins": -0.393283873796463, + "rewards/rejected": 0.2758948504924774, + "step": 2414 + }, + { + "epoch": 0.39, + "learning_rate": 9.294880237040076e-06, + "logits/chosen": -0.7547921538352966, + "logits/rejected": -0.8216100931167603, + "logps/chosen": -79.9376220703125, + "logps/rejected": -72.05439758300781, + "loss": 0.869, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3941162824630737, + "rewards/margins": -0.2409834861755371, + "rewards/rejected": 1.6350997686386108, + "step": 2415 + }, + { + "epoch": 0.39, + "learning_rate": 9.294207169927776e-06, + "logits/chosen": -1.0157338380813599, + "logits/rejected": -0.7660840153694153, + "logps/chosen": -61.74801254272461, + "logps/rejected": -22.84801483154297, + "loss": 0.2803, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9389218091964722, + "rewards/margins": 1.2907516956329346, + "rewards/rejected": 0.6481701135635376, + "step": 2416 + }, + { + "epoch": 0.39, + "learning_rate": 9.293533806126394e-06, + "logits/chosen": -0.5383018255233765, + "logits/rejected": -0.4689437448978424, + "logps/chosen": -46.2410888671875, + "logps/rejected": -40.686004638671875, + "loss": 0.6925, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9373962879180908, + "rewards/margins": 1.071597695350647, + "rewards/rejected": 0.8657985925674438, + "step": 2417 + }, + { + "epoch": 0.39, + "learning_rate": 9.292860145682451e-06, + "logits/chosen": -1.0797922611236572, + "logits/rejected": -1.046282410621643, + "logps/chosen": -113.26014709472656, + "logps/rejected": -47.41629409790039, + "loss": 0.8327, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9079407453536987, + "rewards/margins": -0.17754781246185303, + "rewards/rejected": 2.0854885578155518, + "step": 2418 + }, + { + "epoch": 0.39, + "learning_rate": 9.292186188642491e-06, + "logits/chosen": -1.1261974573135376, + "logits/rejected": -1.0554611682891846, + "logps/chosen": -73.54495239257812, + "logps/rejected": -91.09181213378906, + "loss": 1.7765, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5741240978240967, + "rewards/margins": -1.8200652599334717, + "rewards/rejected": 4.394189357757568, + "step": 2419 + }, + { + "epoch": 0.39, + "learning_rate": 9.29151193505308e-06, + "logits/chosen": -0.9058332443237305, + "logits/rejected": -0.7746568322181702, + "logps/chosen": -41.18742370605469, + "logps/rejected": -22.015954971313477, + "loss": 0.543, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.296626329421997, + "rewards/margins": 0.44758474826812744, + "rewards/rejected": 0.8490415811538696, + "step": 2420 + }, + { + "epoch": 0.39, + "learning_rate": 9.2908373849608e-06, + "logits/chosen": -1.1463252305984497, + "logits/rejected": -1.0949136018753052, + "logps/chosen": -51.67022705078125, + "logps/rejected": -46.73255920410156, + "loss": 0.4431, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1883513927459717, + "rewards/margins": -0.29892730712890625, + "rewards/rejected": 2.487278699874878, + "step": 2421 + }, + { + "epoch": 0.39, + "learning_rate": 9.290162538412257e-06, + "logits/chosen": -1.1600770950317383, + "logits/rejected": -1.0130151510238647, + "logps/chosen": -153.97067260742188, + "logps/rejected": -48.86402130126953, + "loss": 0.107, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.709173679351807, + "rewards/margins": 2.1554114818573, + "rewards/rejected": 2.553762197494507, + "step": 2422 + }, + { + "epoch": 0.39, + "learning_rate": 9.289487395454075e-06, + "logits/chosen": -0.8952364325523376, + "logits/rejected": -0.7497251629829407, + "logps/chosen": -93.70645904541016, + "logps/rejected": -43.997535705566406, + "loss": 0.1726, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8057594299316406, + "rewards/margins": 1.0680153369903564, + "rewards/rejected": 0.737744152545929, + "step": 2423 + }, + { + "epoch": 0.39, + "learning_rate": 9.288811956132903e-06, + "logits/chosen": -1.1034624576568604, + "logits/rejected": -1.0659881830215454, + "logps/chosen": -94.43170166015625, + "logps/rejected": -48.18860626220703, + "loss": 0.7196, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.891284167766571, + "rewards/margins": 0.8882766366004944, + "rewards/rejected": 0.0030075074173510075, + "step": 2424 + }, + { + "epoch": 0.39, + "learning_rate": 9.288136220495406e-06, + "logits/chosen": -0.8485438227653503, + "logits/rejected": -0.854817807674408, + "logps/chosen": -63.563575744628906, + "logps/rejected": -87.94329071044922, + "loss": 0.4395, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7462745904922485, + "rewards/margins": 1.0854880809783936, + "rewards/rejected": 0.6607864499092102, + "step": 2425 + }, + { + "epoch": 0.39, + "learning_rate": 9.287460188588272e-06, + "logits/chosen": -1.0953954458236694, + "logits/rejected": -1.1369209289550781, + "logps/chosen": -88.47798919677734, + "logps/rejected": -106.22999572753906, + "loss": 1.0158, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.15461802482605, + "rewards/margins": -1.7934153079986572, + "rewards/rejected": 4.948033332824707, + "step": 2426 + }, + { + "epoch": 0.39, + "learning_rate": 9.286783860458203e-06, + "logits/chosen": -1.2168372869491577, + "logits/rejected": -1.288752555847168, + "logps/chosen": -98.83723449707031, + "logps/rejected": -123.12167358398438, + "loss": 2.363, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.698280334472656, + "rewards/margins": -4.593630790710449, + "rewards/rejected": 9.291911125183105, + "step": 2427 + }, + { + "epoch": 0.39, + "learning_rate": 9.286107236151935e-06, + "logits/chosen": -1.1079307794570923, + "logits/rejected": -1.114816427230835, + "logps/chosen": -85.72254943847656, + "logps/rejected": -53.34070587158203, + "loss": 1.2721, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3057609796524048, + "rewards/margins": -1.303796410560608, + "rewards/rejected": 2.6095573902130127, + "step": 2428 + }, + { + "epoch": 0.39, + "learning_rate": 9.285430315716213e-06, + "logits/chosen": -0.8674629926681519, + "logits/rejected": -0.8417015075683594, + "logps/chosen": -61.61220932006836, + "logps/rejected": -59.43910217285156, + "loss": 0.9983, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6157124042510986, + "rewards/margins": -1.8216512203216553, + "rewards/rejected": 5.437363624572754, + "step": 2429 + }, + { + "epoch": 0.39, + "learning_rate": 9.284753099197803e-06, + "logits/chosen": -1.1644012928009033, + "logits/rejected": -1.1964819431304932, + "logps/chosen": -24.498943328857422, + "logps/rejected": -56.12340545654297, + "loss": 2.4186, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3860164880752563, + "rewards/margins": 0.7559650540351868, + "rewards/rejected": 0.6300514340400696, + "step": 2430 + }, + { + "epoch": 0.39, + "learning_rate": 9.284075586643498e-06, + "logits/chosen": -1.1471434831619263, + "logits/rejected": -1.1531213521957397, + "logps/chosen": -99.12418365478516, + "logps/rejected": -102.2791748046875, + "loss": 1.242, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6002724170684814, + "rewards/margins": -0.43501973152160645, + "rewards/rejected": 2.035292148590088, + "step": 2431 + }, + { + "epoch": 0.39, + "learning_rate": 9.283397778100105e-06, + "logits/chosen": -1.0985848903656006, + "logits/rejected": -0.8746470212936401, + "logps/chosen": -134.5578155517578, + "logps/rejected": -77.69761657714844, + "loss": 0.0155, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.570231914520264, + "rewards/margins": 4.5869598388671875, + "rewards/rejected": 1.983271837234497, + "step": 2432 + }, + { + "epoch": 0.39, + "learning_rate": 9.282719673614456e-06, + "logits/chosen": -0.9441848993301392, + "logits/rejected": -0.947834849357605, + "logps/chosen": -77.51657104492188, + "logps/rejected": -81.21482849121094, + "loss": 0.5688, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.218095541000366, + "rewards/margins": 0.5802431106567383, + "rewards/rejected": 2.637852430343628, + "step": 2433 + }, + { + "epoch": 0.4, + "learning_rate": 9.282041273233402e-06, + "logits/chosen": -1.1936919689178467, + "logits/rejected": -1.3440192937850952, + "logps/chosen": -153.4419708251953, + "logps/rejected": -37.37812805175781, + "loss": 0.3244, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6309982538223267, + "rewards/margins": 0.2692245543003082, + "rewards/rejected": 0.36177369952201843, + "step": 2434 + }, + { + "epoch": 0.4, + "learning_rate": 9.281362577003812e-06, + "logits/chosen": -0.997056782245636, + "logits/rejected": -0.9949753284454346, + "logps/chosen": -161.9324951171875, + "logps/rejected": -136.37551879882812, + "loss": 2.6115, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.961172580718994, + "rewards/margins": -2.7640671730041504, + "rewards/rejected": 7.7252397537231445, + "step": 2435 + }, + { + "epoch": 0.4, + "learning_rate": 9.280683584972579e-06, + "logits/chosen": -0.5500524044036865, + "logits/rejected": -0.5516325831413269, + "logps/chosen": -4.157991409301758, + "logps/rejected": -2.732128143310547, + "loss": 0.4565, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.31531500816345215, + "rewards/margins": -0.16265425086021423, + "rewards/rejected": 0.4779692590236664, + "step": 2436 + }, + { + "epoch": 0.4, + "learning_rate": 9.280004297186614e-06, + "logits/chosen": -1.0909525156021118, + "logits/rejected": -1.0495842695236206, + "logps/chosen": -197.68899536132812, + "logps/rejected": -72.16944122314453, + "loss": 0.0225, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.4839982986450195, + "rewards/margins": 4.145874977111816, + "rewards/rejected": 1.3381233215332031, + "step": 2437 + }, + { + "epoch": 0.4, + "learning_rate": 9.27932471369285e-06, + "logits/chosen": -0.9052298665046692, + "logits/rejected": -1.1963036060333252, + "logps/chosen": -30.851573944091797, + "logps/rejected": -67.93022155761719, + "loss": 0.8518, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6895580291748047, + "rewards/margins": -0.15533864498138428, + "rewards/rejected": 1.844896674156189, + "step": 2438 + }, + { + "epoch": 0.4, + "learning_rate": 9.278644834538239e-06, + "logits/chosen": -1.2194464206695557, + "logits/rejected": -1.2459112405776978, + "logps/chosen": -74.21330261230469, + "logps/rejected": -41.06279754638672, + "loss": 1.2213, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1336610317230225, + "rewards/margins": 0.19623422622680664, + "rewards/rejected": 1.9374268054962158, + "step": 2439 + }, + { + "epoch": 0.4, + "learning_rate": 9.277964659769756e-06, + "logits/chosen": -0.47299522161483765, + "logits/rejected": -0.5113580226898193, + "logps/chosen": -82.83576965332031, + "logps/rejected": -62.89388656616211, + "loss": 1.6822, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8274597525596619, + "rewards/margins": -2.5577754974365234, + "rewards/rejected": 3.38523530960083, + "step": 2440 + }, + { + "epoch": 0.4, + "learning_rate": 9.277284189434393e-06, + "logits/chosen": -0.7764999270439148, + "logits/rejected": -0.7977227568626404, + "logps/chosen": -108.78450775146484, + "logps/rejected": -53.20395278930664, + "loss": 0.5815, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0686821937561035, + "rewards/margins": -0.7800700664520264, + "rewards/rejected": 2.84875226020813, + "step": 2441 + }, + { + "epoch": 0.4, + "learning_rate": 9.276603423579164e-06, + "logits/chosen": -1.164337396621704, + "logits/rejected": -1.1818742752075195, + "logps/chosen": -98.86360931396484, + "logps/rejected": -145.77862548828125, + "loss": 0.6064, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2726967334747314, + "rewards/margins": -0.8437798023223877, + "rewards/rejected": 2.116476535797119, + "step": 2442 + }, + { + "epoch": 0.4, + "learning_rate": 9.275922362251106e-06, + "logits/chosen": -1.1259820461273193, + "logits/rejected": -1.1259820461273193, + "logps/chosen": -53.18629455566406, + "logps/rejected": -53.18629455566406, + "loss": 1.0529, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3805992603302, + "rewards/margins": 0.0, + "rewards/rejected": 2.3805992603302, + "step": 2443 + }, + { + "epoch": 0.4, + "learning_rate": 9.27524100549727e-06, + "logits/chosen": -0.5296345949172974, + "logits/rejected": -0.48575490713119507, + "logps/chosen": -67.21257781982422, + "logps/rejected": -66.40931701660156, + "loss": 0.5021, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8797295093536377, + "rewards/margins": 0.9065742492675781, + "rewards/rejected": 1.9731552600860596, + "step": 2444 + }, + { + "epoch": 0.4, + "learning_rate": 9.274559353364734e-06, + "logits/chosen": -1.0220961570739746, + "logits/rejected": -0.970869243144989, + "logps/chosen": -65.05329895019531, + "logps/rejected": -77.27877807617188, + "loss": 0.9799, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6908737421035767, + "rewards/margins": -0.017980217933654785, + "rewards/rejected": 1.7088539600372314, + "step": 2445 + }, + { + "epoch": 0.4, + "learning_rate": 9.273877405900594e-06, + "logits/chosen": -0.5780473947525024, + "logits/rejected": -0.5622237324714661, + "logps/chosen": -43.411373138427734, + "logps/rejected": -114.14616394042969, + "loss": 0.094, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9159812927246094, + "rewards/margins": 1.6022316217422485, + "rewards/rejected": 0.3137497007846832, + "step": 2446 + }, + { + "epoch": 0.4, + "learning_rate": 9.273195163151962e-06, + "logits/chosen": -0.8766098618507385, + "logits/rejected": -0.8766098618507385, + "logps/chosen": -50.946983337402344, + "logps/rejected": -50.946983337402344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4228546619415283, + "rewards/margins": 0.0, + "rewards/rejected": 1.4228546619415283, + "step": 2447 + }, + { + "epoch": 0.4, + "learning_rate": 9.27251262516598e-06, + "logits/chosen": -0.9002398252487183, + "logits/rejected": -0.7653412818908691, + "logps/chosen": -69.27213287353516, + "logps/rejected": -25.535213470458984, + "loss": 0.6708, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.700769066810608, + "rewards/margins": 1.902185320854187, + "rewards/rejected": -0.20141620934009552, + "step": 2448 + }, + { + "epoch": 0.4, + "learning_rate": 9.271829791989801e-06, + "logits/chosen": -1.2676801681518555, + "logits/rejected": -1.2706706523895264, + "logps/chosen": -86.81299591064453, + "logps/rejected": -83.58285522460938, + "loss": 2.3117, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1406502723693848, + "rewards/margins": -0.030171871185302734, + "rewards/rejected": 2.1708221435546875, + "step": 2449 + }, + { + "epoch": 0.4, + "learning_rate": 9.271146663670605e-06, + "logits/chosen": -0.6394873857498169, + "logits/rejected": -0.6387398838996887, + "logps/chosen": -5.545102596282959, + "logps/rejected": -1.5508925914764404, + "loss": 0.6209, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4606700539588928, + "rewards/margins": -0.06113934516906738, + "rewards/rejected": 0.5218093991279602, + "step": 2450 + }, + { + "epoch": 0.4, + "learning_rate": 9.270463240255589e-06, + "logits/chosen": -0.8126300573348999, + "logits/rejected": -0.7819572687149048, + "logps/chosen": -63.97209930419922, + "logps/rejected": -83.22309875488281, + "loss": 0.151, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7353341579437256, + "rewards/margins": 1.070906162261963, + "rewards/rejected": 1.6644279956817627, + "step": 2451 + }, + { + "epoch": 0.4, + "learning_rate": 9.269779521791968e-06, + "logits/chosen": -1.2139376401901245, + "logits/rejected": -1.1983133554458618, + "logps/chosen": -52.813907623291016, + "logps/rejected": -80.26249694824219, + "loss": 2.8431, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3490437269210815, + "rewards/margins": -2.771420478820801, + "rewards/rejected": 4.120464324951172, + "step": 2452 + }, + { + "epoch": 0.4, + "learning_rate": 9.269095508326986e-06, + "logits/chosen": -1.1850546598434448, + "logits/rejected": -1.1932421922683716, + "logps/chosen": -129.73609924316406, + "logps/rejected": -162.25804138183594, + "loss": 3.9481, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6855179071426392, + "rewards/margins": -7.880002498626709, + "rewards/rejected": 9.565520286560059, + "step": 2453 + }, + { + "epoch": 0.4, + "learning_rate": 9.268411199907898e-06, + "logits/chosen": -0.9082164764404297, + "logits/rejected": -0.845238983631134, + "logps/chosen": -74.92931365966797, + "logps/rejected": -18.253650665283203, + "loss": 0.6699, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3871109187602997, + "rewards/margins": -0.026910006999969482, + "rewards/rejected": 0.41402092576026917, + "step": 2454 + }, + { + "epoch": 0.4, + "learning_rate": 9.267726596581983e-06, + "logits/chosen": -0.700586199760437, + "logits/rejected": -0.6764764189720154, + "logps/chosen": -105.29402160644531, + "logps/rejected": -44.21595764160156, + "loss": 1.3287, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5245590209960938, + "rewards/margins": 0.02804112434387207, + "rewards/rejected": 2.4965178966522217, + "step": 2455 + }, + { + "epoch": 0.4, + "learning_rate": 9.267041698396544e-06, + "logits/chosen": -1.2933297157287598, + "logits/rejected": -1.0159540176391602, + "logps/chosen": -180.5439453125, + "logps/rejected": -61.560447692871094, + "loss": 0.1636, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.110928535461426, + "rewards/margins": 3.8833236694335938, + "rewards/rejected": 1.2276047468185425, + "step": 2456 + }, + { + "epoch": 0.4, + "learning_rate": 9.266356505398897e-06, + "logits/chosen": -1.2693135738372803, + "logits/rejected": -1.2693135738372803, + "logps/chosen": -50.199092864990234, + "logps/rejected": -50.199092864990234, + "loss": 0.5907, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4200267791748047, + "rewards/margins": 0.0, + "rewards/rejected": 2.4200267791748047, + "step": 2457 + }, + { + "epoch": 0.4, + "learning_rate": 9.265671017636384e-06, + "logits/chosen": -0.954566478729248, + "logits/rejected": -0.954566478729248, + "logps/chosen": -50.56364059448242, + "logps/rejected": -50.56364059448242, + "loss": 0.3581, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.811755895614624, + "rewards/margins": 0.0, + "rewards/rejected": 3.811755895614624, + "step": 2458 + }, + { + "epoch": 0.4, + "learning_rate": 9.264985235156368e-06, + "logits/chosen": -0.6397049427032471, + "logits/rejected": -0.6397049427032471, + "logps/chosen": -18.703914642333984, + "logps/rejected": -18.703914642333984, + "loss": 0.3484, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1991569995880127, + "rewards/margins": 0.0, + "rewards/rejected": 1.1991569995880127, + "step": 2459 + }, + { + "epoch": 0.4, + "learning_rate": 9.264299158006225e-06, + "logits/chosen": -1.0595815181732178, + "logits/rejected": -1.0365512371063232, + "logps/chosen": -41.44142150878906, + "logps/rejected": -91.89624786376953, + "loss": 1.0064, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.013681173324585, + "rewards/margins": -0.7456626892089844, + "rewards/rejected": 2.7593438625335693, + "step": 2460 + }, + { + "epoch": 0.4, + "learning_rate": 9.26361278623336e-06, + "logits/chosen": -1.1962562799453735, + "logits/rejected": -0.9987640380859375, + "logps/chosen": -115.25650787353516, + "logps/rejected": -91.18319702148438, + "loss": 0.2054, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.062386512756348, + "rewards/margins": 1.0974602699279785, + "rewards/rejected": 3.964926242828369, + "step": 2461 + }, + { + "epoch": 0.4, + "learning_rate": 9.262926119885196e-06, + "logits/chosen": -1.0903509855270386, + "logits/rejected": -1.08397376537323, + "logps/chosen": -55.63726043701172, + "logps/rejected": -76.32644653320312, + "loss": 0.5833, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.204079508781433, + "rewards/margins": -0.7912917137145996, + "rewards/rejected": 1.9953712224960327, + "step": 2462 + }, + { + "epoch": 0.4, + "learning_rate": 9.262239159009173e-06, + "logits/chosen": -1.1429567337036133, + "logits/rejected": -1.1113286018371582, + "logps/chosen": -88.28136444091797, + "logps/rejected": -87.0899658203125, + "loss": 0.6543, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5784683227539062, + "rewards/margins": 0.047301530838012695, + "rewards/rejected": 3.5311667919158936, + "step": 2463 + }, + { + "epoch": 0.4, + "learning_rate": 9.261551903652752e-06, + "logits/chosen": -0.9069075584411621, + "logits/rejected": -0.8709163665771484, + "logps/chosen": -45.92047119140625, + "logps/rejected": -20.81447982788086, + "loss": 1.4971, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.60982745885849, + "rewards/margins": 0.39673352241516113, + "rewards/rejected": 0.21309395134449005, + "step": 2464 + }, + { + "epoch": 0.4, + "learning_rate": 9.260864353863419e-06, + "logits/chosen": -0.8291428089141846, + "logits/rejected": -0.7026644945144653, + "logps/chosen": -74.58000946044922, + "logps/rejected": -27.53622055053711, + "loss": 1.7188, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.296276092529297, + "rewards/margins": 2.412670612335205, + "rewards/rejected": -0.11639442294836044, + "step": 2465 + }, + { + "epoch": 0.4, + "learning_rate": 9.260176509688673e-06, + "logits/chosen": -0.8908304572105408, + "logits/rejected": -0.8663274049758911, + "logps/chosen": -45.09109115600586, + "logps/rejected": -85.68143463134766, + "loss": 1.9945, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7396358251571655, + "rewards/margins": 0.4990382194519043, + "rewards/rejected": 1.2405976057052612, + "step": 2466 + }, + { + "epoch": 0.4, + "learning_rate": 9.259488371176043e-06, + "logits/chosen": -0.8838218450546265, + "logits/rejected": -0.7693413496017456, + "logps/chosen": -67.61846160888672, + "logps/rejected": -64.36294555664062, + "loss": 0.6303, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8547248840332031, + "rewards/margins": 0.34882891178131104, + "rewards/rejected": 1.505895972251892, + "step": 2467 + }, + { + "epoch": 0.4, + "learning_rate": 9.258799938373071e-06, + "logits/chosen": -1.1365035772323608, + "logits/rejected": -1.0631424188613892, + "logps/chosen": -46.43343734741211, + "logps/rejected": -52.02293014526367, + "loss": 0.7667, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4635555744171143, + "rewards/margins": -0.45360028743743896, + "rewards/rejected": 1.9171558618545532, + "step": 2468 + }, + { + "epoch": 0.4, + "learning_rate": 9.25811121132732e-06, + "logits/chosen": -1.035723328590393, + "logits/rejected": -0.8930950164794922, + "logps/chosen": -217.241455078125, + "logps/rejected": -68.09898376464844, + "loss": 0.3633, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.214877605438232, + "rewards/margins": 2.2090301513671875, + "rewards/rejected": 4.005847454071045, + "step": 2469 + }, + { + "epoch": 0.4, + "learning_rate": 9.257422190086374e-06, + "logits/chosen": -0.8841732144355774, + "logits/rejected": -0.8902891874313354, + "logps/chosen": -5.474064826965332, + "logps/rejected": -18.78350067138672, + "loss": 0.7181, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2620924115180969, + "rewards/margins": -0.43420809507369995, + "rewards/rejected": 0.6963005065917969, + "step": 2470 + }, + { + "epoch": 0.4, + "learning_rate": 9.25673287469784e-06, + "logits/chosen": -1.1137175559997559, + "logits/rejected": -1.046011209487915, + "logps/chosen": -128.9940948486328, + "logps/rejected": -39.539459228515625, + "loss": 0.8426, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.107347249984741, + "rewards/margins": 0.395632266998291, + "rewards/rejected": 1.7117149829864502, + "step": 2471 + }, + { + "epoch": 0.4, + "learning_rate": 9.25604326520934e-06, + "logits/chosen": -0.6976543664932251, + "logits/rejected": -0.6699342131614685, + "logps/chosen": -137.9183807373047, + "logps/rejected": -108.382080078125, + "loss": 2.1617, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.398524761199951, + "rewards/margins": -0.025665283203125, + "rewards/rejected": 5.424190044403076, + "step": 2472 + }, + { + "epoch": 0.4, + "learning_rate": 9.255353361668522e-06, + "logits/chosen": -0.9861177802085876, + "logits/rejected": -1.0173500776290894, + "logps/chosen": -13.3223295211792, + "logps/rejected": -57.91953659057617, + "loss": 0.5407, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.28888389468193054, + "rewards/margins": -0.5261380672454834, + "rewards/rejected": 0.8150219321250916, + "step": 2473 + }, + { + "epoch": 0.4, + "learning_rate": 9.254663164123052e-06, + "logits/chosen": -0.638342022895813, + "logits/rejected": -0.6253756880760193, + "logps/chosen": -12.087776184082031, + "logps/rejected": -3.900524139404297, + "loss": 0.5023, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4114822447299957, + "rewards/margins": -0.1467137634754181, + "rewards/rejected": 0.5581960082054138, + "step": 2474 + }, + { + "epoch": 0.4, + "learning_rate": 9.253972672620615e-06, + "logits/chosen": -0.6210988759994507, + "logits/rejected": -0.6297062635421753, + "logps/chosen": -2.8711671829223633, + "logps/rejected": -4.506859302520752, + "loss": 0.3297, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1702440232038498, + "rewards/margins": 0.10647005587816238, + "rewards/rejected": 0.06377396732568741, + "step": 2475 + }, + { + "epoch": 0.4, + "learning_rate": 9.253281887208917e-06, + "logits/chosen": -1.4404386281967163, + "logits/rejected": -1.4224433898925781, + "logps/chosen": -125.68394470214844, + "logps/rejected": -70.79255676269531, + "loss": 0.4458, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2751206159591675, + "rewards/margins": -0.3389861583709717, + "rewards/rejected": 1.6141067743301392, + "step": 2476 + }, + { + "epoch": 0.4, + "learning_rate": 9.252590807935686e-06, + "logits/chosen": -0.9741504788398743, + "logits/rejected": -0.9767856001853943, + "logps/chosen": -83.97171020507812, + "logps/rejected": -90.91797637939453, + "loss": 0.8291, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7411469221115112, + "rewards/margins": 0.47324299812316895, + "rewards/rejected": 1.2679039239883423, + "step": 2477 + }, + { + "epoch": 0.4, + "learning_rate": 9.25189943484867e-06, + "logits/chosen": -1.1964925527572632, + "logits/rejected": -1.3563648462295532, + "logps/chosen": -182.93841552734375, + "logps/rejected": -124.26498413085938, + "loss": 0.1321, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.438672065734863, + "rewards/margins": 1.2670807838439941, + "rewards/rejected": 6.171591281890869, + "step": 2478 + }, + { + "epoch": 0.4, + "learning_rate": 9.251207767995633e-06, + "logits/chosen": -1.4927178621292114, + "logits/rejected": -1.4512859582901, + "logps/chosen": -243.7216796875, + "logps/rejected": -142.09909057617188, + "loss": 0.3338, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.686132907867432, + "rewards/margins": 0.171142578125, + "rewards/rejected": 6.514990329742432, + "step": 2479 + }, + { + "epoch": 0.4, + "learning_rate": 9.250515807424365e-06, + "logits/chosen": -1.3846195936203003, + "logits/rejected": -1.3532003164291382, + "logps/chosen": -118.49655151367188, + "logps/rejected": -73.39103698730469, + "loss": 0.7914, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5600478649139404, + "rewards/margins": -1.2878265380859375, + "rewards/rejected": 3.847874402999878, + "step": 2480 + }, + { + "epoch": 0.4, + "learning_rate": 9.249823553182675e-06, + "logits/chosen": -0.9169479608535767, + "logits/rejected": -0.8294755816459656, + "logps/chosen": -94.23894500732422, + "logps/rejected": -66.22529602050781, + "loss": 0.4079, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2987473011016846, + "rewards/margins": 0.8685545921325684, + "rewards/rejected": 2.430192708969116, + "step": 2481 + }, + { + "epoch": 0.4, + "learning_rate": 9.249131005318388e-06, + "logits/chosen": -0.8180201649665833, + "logits/rejected": -0.8498108983039856, + "logps/chosen": -101.79385375976562, + "logps/rejected": -56.75782012939453, + "loss": 0.9363, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1390923261642456, + "rewards/margins": -0.5880385637283325, + "rewards/rejected": 1.7271308898925781, + "step": 2482 + }, + { + "epoch": 0.4, + "learning_rate": 9.248438163879354e-06, + "logits/chosen": -1.1789917945861816, + "logits/rejected": -1.1313732862472534, + "logps/chosen": -142.89559936523438, + "logps/rejected": -106.40399169921875, + "loss": 0.3938, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7829803824424744, + "rewards/margins": 0.3511291742324829, + "rewards/rejected": 0.43185120820999146, + "step": 2483 + }, + { + "epoch": 0.4, + "learning_rate": 9.247745028913444e-06, + "logits/chosen": -1.362015962600708, + "logits/rejected": -1.3035792112350464, + "logps/chosen": -96.17579650878906, + "logps/rejected": -18.672714233398438, + "loss": 0.0203, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.6809587478637695, + "rewards/margins": 4.128978252410889, + "rewards/rejected": 0.5519806146621704, + "step": 2484 + }, + { + "epoch": 0.4, + "learning_rate": 9.247051600468544e-06, + "logits/chosen": -1.051954984664917, + "logits/rejected": -1.0865137577056885, + "logps/chosen": -79.4317626953125, + "logps/rejected": -65.12503814697266, + "loss": 1.4406, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7209205627441406, + "rewards/margins": -2.120288372039795, + "rewards/rejected": 4.8412089347839355, + "step": 2485 + }, + { + "epoch": 0.4, + "learning_rate": 9.246357878592562e-06, + "logits/chosen": -0.9619613289833069, + "logits/rejected": -1.0459626913070679, + "logps/chosen": -85.52510070800781, + "logps/rejected": -112.84478759765625, + "loss": 2.0706, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.512066602706909, + "rewards/margins": -3.259408712387085, + "rewards/rejected": 5.771475315093994, + "step": 2486 + }, + { + "epoch": 0.4, + "learning_rate": 9.245663863333433e-06, + "logits/chosen": -0.5129128694534302, + "logits/rejected": -0.47131994366645813, + "logps/chosen": -92.27879333496094, + "logps/rejected": -41.012508392333984, + "loss": 0.3551, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9299439191818237, + "rewards/margins": 0.38276946544647217, + "rewards/rejected": 1.5471744537353516, + "step": 2487 + }, + { + "epoch": 0.4, + "learning_rate": 9.244969554739103e-06, + "logits/chosen": -0.6407654881477356, + "logits/rejected": -0.6407654881477356, + "logps/chosen": -41.49859619140625, + "logps/rejected": -41.49859619140625, + "loss": 0.5475, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5838607549667358, + "rewards/margins": 0.0, + "rewards/rejected": 1.5838607549667358, + "step": 2488 + }, + { + "epoch": 0.4, + "learning_rate": 9.244274952857543e-06, + "logits/chosen": -0.9075081944465637, + "logits/rejected": -0.8608379364013672, + "logps/chosen": -59.80915069580078, + "logps/rejected": -7.886382102966309, + "loss": 0.7513, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8450508117675781, + "rewards/margins": 0.5777777433395386, + "rewards/rejected": 0.26727303862571716, + "step": 2489 + }, + { + "epoch": 0.4, + "learning_rate": 9.243580057736743e-06, + "logits/chosen": -0.9953659772872925, + "logits/rejected": -0.8722814321517944, + "logps/chosen": -55.542236328125, + "logps/rejected": -19.25389862060547, + "loss": 0.6883, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.733319103717804, + "rewards/margins": -0.13976019620895386, + "rewards/rejected": 0.8730792999267578, + "step": 2490 + }, + { + "epoch": 0.4, + "learning_rate": 9.242884869424716e-06, + "logits/chosen": -1.0417791604995728, + "logits/rejected": -1.0394667387008667, + "logps/chosen": -132.78778076171875, + "logps/rejected": -96.92646789550781, + "loss": 0.8776, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.405526727437973, + "rewards/margins": -1.192901611328125, + "rewards/rejected": 1.5984283685684204, + "step": 2491 + }, + { + "epoch": 0.4, + "learning_rate": 9.242189387969488e-06, + "logits/chosen": -1.061753273010254, + "logits/rejected": -1.1770806312561035, + "logps/chosen": -142.1012725830078, + "logps/rejected": -105.07388305664062, + "loss": 1.3013, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7107269763946533, + "rewards/margins": -2.4291093349456787, + "rewards/rejected": 4.139836311340332, + "step": 2492 + }, + { + "epoch": 0.4, + "learning_rate": 9.241493613419114e-06, + "logits/chosen": -1.0821475982666016, + "logits/rejected": -1.021832823753357, + "logps/chosen": -51.86412048339844, + "logps/rejected": -68.85386657714844, + "loss": 0.6943, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4667022228240967, + "rewards/margins": -1.0327789783477783, + "rewards/rejected": 3.499481201171875, + "step": 2493 + }, + { + "epoch": 0.4, + "learning_rate": 9.240797545821666e-06, + "logits/chosen": -0.5969445705413818, + "logits/rejected": -0.5907109379768372, + "logps/chosen": -46.767555236816406, + "logps/rejected": -65.3945083618164, + "loss": 2.4158, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9347953796386719, + "rewards/margins": -1.7967491149902344, + "rewards/rejected": 2.7315444946289062, + "step": 2494 + }, + { + "epoch": 0.4, + "learning_rate": 9.240101185225234e-06, + "logits/chosen": -1.057548999786377, + "logits/rejected": -1.0741878747940063, + "logps/chosen": -168.20045471191406, + "logps/rejected": -107.49910736083984, + "loss": 0.4026, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.048802375793457, + "rewards/margins": 0.033907413482666016, + "rewards/rejected": 7.014894962310791, + "step": 2495 + }, + { + "epoch": 0.41, + "learning_rate": 9.239404531677932e-06, + "logits/chosen": -1.4727380275726318, + "logits/rejected": -1.4397515058517456, + "logps/chosen": -69.17085266113281, + "logps/rejected": -41.21086120605469, + "loss": 1.0969, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9630165100097656, + "rewards/margins": -1.5857605934143066, + "rewards/rejected": 3.5487771034240723, + "step": 2496 + }, + { + "epoch": 0.41, + "learning_rate": 9.238707585227887e-06, + "logits/chosen": -1.462620735168457, + "logits/rejected": -1.3363786935806274, + "logps/chosen": -141.77032470703125, + "logps/rejected": -84.29927062988281, + "loss": 0.3053, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.601346015930176, + "rewards/margins": 0.1810011863708496, + "rewards/rejected": 5.420344829559326, + "step": 2497 + }, + { + "epoch": 0.41, + "learning_rate": 9.238010345923257e-06, + "logits/chosen": -1.055521845817566, + "logits/rejected": -1.02146577835083, + "logps/chosen": -101.53545379638672, + "logps/rejected": -105.12956237792969, + "loss": 0.6357, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.025214672088623, + "rewards/margins": 0.27962446212768555, + "rewards/rejected": 3.7455902099609375, + "step": 2498 + }, + { + "epoch": 0.41, + "learning_rate": 9.237312813812213e-06, + "logits/chosen": -0.790764570236206, + "logits/rejected": -0.9082835912704468, + "logps/chosen": -96.90412902832031, + "logps/rejected": -133.47274780273438, + "loss": 2.6374, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.916516065597534, + "rewards/margins": -4.992388725280762, + "rewards/rejected": 8.908905029296875, + "step": 2499 + }, + { + "epoch": 0.41, + "learning_rate": 9.236614988942945e-06, + "logits/chosen": -0.8764036297798157, + "logits/rejected": -0.8225786685943604, + "logps/chosen": -111.73674011230469, + "logps/rejected": -45.247135162353516, + "loss": 0.1709, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.348031759262085, + "rewards/margins": 2.0384113788604736, + "rewards/rejected": 0.30962029099464417, + "step": 2500 + }, + { + "epoch": 0.41, + "learning_rate": 9.23591687136367e-06, + "logits/chosen": -0.7160916924476624, + "logits/rejected": -0.7146352529525757, + "logps/chosen": -101.57191467285156, + "logps/rejected": -39.4505615234375, + "loss": 0.935, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9181854128837585, + "rewards/margins": -1.5571296215057373, + "rewards/rejected": 2.4753150939941406, + "step": 2501 + }, + { + "epoch": 0.41, + "learning_rate": 9.23521846112262e-06, + "logits/chosen": -0.8144896626472473, + "logits/rejected": -0.8144896626472473, + "logps/chosen": -1.6347264051437378, + "logps/rejected": -1.6347264051437378, + "loss": 0.6693, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.25252795219421387, + "rewards/margins": 0.0, + "rewards/rejected": 0.25252795219421387, + "step": 2502 + }, + { + "epoch": 0.41, + "learning_rate": 9.23451975826805e-06, + "logits/chosen": -0.6164443492889404, + "logits/rejected": -0.6155967116355896, + "logps/chosen": -4.8158416748046875, + "logps/rejected": -6.480325698852539, + "loss": 0.8512, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.30289527773857117, + "rewards/margins": -0.010094046592712402, + "rewards/rejected": 0.31298932433128357, + "step": 2503 + }, + { + "epoch": 0.41, + "learning_rate": 9.233820762848229e-06, + "logits/chosen": -1.2445589303970337, + "logits/rejected": -0.9518656134605408, + "logps/chosen": -94.95578002929688, + "logps/rejected": -18.12845802307129, + "loss": 0.898, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.786978244781494, + "rewards/margins": 5.174797534942627, + "rewards/rejected": 0.6121805310249329, + "step": 2504 + }, + { + "epoch": 0.41, + "learning_rate": 9.233121474911455e-06, + "logits/chosen": -0.9520929455757141, + "logits/rejected": -0.8898484706878662, + "logps/chosen": -52.91749572753906, + "logps/rejected": -43.31214141845703, + "loss": 0.6914, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0221824645996094, + "rewards/margins": 0.28130531311035156, + "rewards/rejected": 0.7408771514892578, + "step": 2505 + }, + { + "epoch": 0.41, + "learning_rate": 9.232421894506043e-06, + "logits/chosen": -0.768966555595398, + "logits/rejected": -0.801581084728241, + "logps/chosen": -87.50220489501953, + "logps/rejected": -75.53691101074219, + "loss": 1.2625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7566596865653992, + "rewards/margins": -1.0821418762207031, + "rewards/rejected": 1.838801622390747, + "step": 2506 + }, + { + "epoch": 0.41, + "learning_rate": 9.231722021680324e-06, + "logits/chosen": -1.1890058517456055, + "logits/rejected": -1.2193158864974976, + "logps/chosen": -72.33964538574219, + "logps/rejected": -103.190185546875, + "loss": 1.642, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6825958490371704, + "rewards/margins": -2.567384719848633, + "rewards/rejected": 4.249980449676514, + "step": 2507 + }, + { + "epoch": 0.41, + "learning_rate": 9.231021856482654e-06, + "logits/chosen": -1.0318595170974731, + "logits/rejected": -1.0318595170974731, + "logps/chosen": -28.34955596923828, + "logps/rejected": -28.34955596923828, + "loss": 0.4764, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9902924299240112, + "rewards/margins": 0.0, + "rewards/rejected": 1.9902924299240112, + "step": 2508 + }, + { + "epoch": 0.41, + "learning_rate": 9.230321398961408e-06, + "logits/chosen": -0.4790615439414978, + "logits/rejected": -0.4700009822845459, + "logps/chosen": -1.528757929801941, + "logps/rejected": -10.970094680786133, + "loss": 0.7124, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.27268314361572266, + "rewards/margins": -0.0754031240940094, + "rewards/rejected": 0.34808626770973206, + "step": 2509 + }, + { + "epoch": 0.41, + "learning_rate": 9.229620649164982e-06, + "logits/chosen": -0.7857494950294495, + "logits/rejected": -0.8608614206314087, + "logps/chosen": -81.21038818359375, + "logps/rejected": -63.09453582763672, + "loss": 1.3305, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.622822642326355, + "rewards/margins": -1.596638560295105, + "rewards/rejected": 3.21946120262146, + "step": 2510 + }, + { + "epoch": 0.41, + "learning_rate": 9.228919607141788e-06, + "logits/chosen": -0.6104477047920227, + "logits/rejected": -0.6104477047920227, + "logps/chosen": -55.02252960205078, + "logps/rejected": -55.02252960205078, + "loss": 0.4018, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0035957098007202, + "rewards/margins": 0.0, + "rewards/rejected": 1.0035957098007202, + "step": 2511 + }, + { + "epoch": 0.41, + "learning_rate": 9.228218272940265e-06, + "logits/chosen": -1.0773775577545166, + "logits/rejected": -1.2594338655471802, + "logps/chosen": -95.88778686523438, + "logps/rejected": -111.94976043701172, + "loss": 1.5313, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.27435302734375, + "rewards/margins": -2.9431052207946777, + "rewards/rejected": 6.217458248138428, + "step": 2512 + }, + { + "epoch": 0.41, + "learning_rate": 9.22751664660887e-06, + "logits/chosen": -0.9524378180503845, + "logits/rejected": -0.9887617826461792, + "logps/chosen": -44.987815856933594, + "logps/rejected": -54.05712127685547, + "loss": 0.8093, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.233221411705017, + "rewards/margins": -0.954154372215271, + "rewards/rejected": 2.187375783920288, + "step": 2513 + }, + { + "epoch": 0.41, + "learning_rate": 9.226814728196072e-06, + "logits/chosen": -0.9489555954933167, + "logits/rejected": -1.0293550491333008, + "logps/chosen": -105.24300384521484, + "logps/rejected": -128.08494567871094, + "loss": 1.7776, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4587150812149048, + "rewards/margins": -0.5660148859024048, + "rewards/rejected": 2.0247299671173096, + "step": 2514 + }, + { + "epoch": 0.41, + "learning_rate": 9.226112517750372e-06, + "logits/chosen": -0.8930079340934753, + "logits/rejected": -0.9618164896965027, + "logps/chosen": -26.67922592163086, + "logps/rejected": -42.55357360839844, + "loss": 1.2632, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.385901689529419, + "rewards/margins": -1.624091386795044, + "rewards/rejected": 3.009993076324463, + "step": 2515 + }, + { + "epoch": 0.41, + "learning_rate": 9.225410015320285e-06, + "logits/chosen": -1.0709105730056763, + "logits/rejected": -1.1849814653396606, + "logps/chosen": -84.5205078125, + "logps/rejected": -131.82229614257812, + "loss": 1.4544, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2941573858261108, + "rewards/margins": -2.317897319793701, + "rewards/rejected": 3.6120545864105225, + "step": 2516 + }, + { + "epoch": 0.41, + "learning_rate": 9.224707220954347e-06, + "logits/chosen": -0.5581818222999573, + "logits/rejected": -0.5577186346054077, + "logps/chosen": -4.104816436767578, + "logps/rejected": -5.554691314697266, + "loss": 1.3366, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2636008858680725, + "rewards/margins": 0.07603517174720764, + "rewards/rejected": 0.18756571412086487, + "step": 2517 + }, + { + "epoch": 0.41, + "learning_rate": 9.224004134701115e-06, + "logits/chosen": -0.9489824175834656, + "logits/rejected": -0.8136804103851318, + "logps/chosen": -52.435752868652344, + "logps/rejected": -38.83869934082031, + "loss": 0.8701, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6737823486328125, + "rewards/margins": -0.6148910522460938, + "rewards/rejected": 3.2886734008789062, + "step": 2518 + }, + { + "epoch": 0.41, + "learning_rate": 9.223300756609165e-06, + "logits/chosen": -0.890887975692749, + "logits/rejected": -0.8451367020606995, + "logps/chosen": -74.38251495361328, + "logps/rejected": -57.42866897583008, + "loss": 0.4898, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.918921709060669, + "rewards/margins": 1.1036250591278076, + "rewards/rejected": 0.8152965903282166, + "step": 2519 + }, + { + "epoch": 0.41, + "learning_rate": 9.222597086727094e-06, + "logits/chosen": -0.9482429623603821, + "logits/rejected": -0.9850285053253174, + "logps/chosen": -68.7125244140625, + "logps/rejected": -56.599388122558594, + "loss": 0.5144, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.741781711578369, + "rewards/margins": 0.5668487548828125, + "rewards/rejected": 2.1749329566955566, + "step": 2520 + }, + { + "epoch": 0.41, + "learning_rate": 9.221893125103519e-06, + "logits/chosen": -0.619195818901062, + "logits/rejected": -0.6050023436546326, + "logps/chosen": -27.6680908203125, + "logps/rejected": -8.680765151977539, + "loss": 1.2022, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.15150833129882812, + "rewards/margins": -0.4429807662963867, + "rewards/rejected": 0.5944890975952148, + "step": 2521 + }, + { + "epoch": 0.41, + "learning_rate": 9.221188871787076e-06, + "logits/chosen": -1.1920228004455566, + "logits/rejected": -1.2544673681259155, + "logps/chosen": -47.23464584350586, + "logps/rejected": -70.94281005859375, + "loss": 0.4925, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8768069744110107, + "rewards/margins": 0.8520457744598389, + "rewards/rejected": 2.024761199951172, + "step": 2522 + }, + { + "epoch": 0.41, + "learning_rate": 9.220484326826423e-06, + "logits/chosen": -1.306311011314392, + "logits/rejected": -1.2009867429733276, + "logps/chosen": -206.83316040039062, + "logps/rejected": -80.61135864257812, + "loss": 0.0665, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.826895236968994, + "rewards/margins": 1.9779267311096191, + "rewards/rejected": 4.848968505859375, + "step": 2523 + }, + { + "epoch": 0.41, + "learning_rate": 9.21977949027024e-06, + "logits/chosen": -1.2789571285247803, + "logits/rejected": -1.3733031749725342, + "logps/chosen": -139.57774353027344, + "logps/rejected": -167.158447265625, + "loss": 0.8926, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.11825704574585, + "rewards/margins": -0.9316544532775879, + "rewards/rejected": 6.0499114990234375, + "step": 2524 + }, + { + "epoch": 0.41, + "learning_rate": 9.219074362167219e-06, + "logits/chosen": -0.8219619989395142, + "logits/rejected": -0.7870134711265564, + "logps/chosen": -47.168724060058594, + "logps/rejected": -51.835121154785156, + "loss": 0.3169, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9853317737579346, + "rewards/margins": 0.35231101512908936, + "rewards/rejected": 1.6330207586288452, + "step": 2525 + }, + { + "epoch": 0.41, + "learning_rate": 9.218368942566082e-06, + "logits/chosen": -0.5810496807098389, + "logits/rejected": -0.5810496807098389, + "logps/chosen": -48.984519958496094, + "logps/rejected": -48.984519958496094, + "loss": 1.24, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.405127763748169, + "rewards/margins": 0.0, + "rewards/rejected": 2.405127763748169, + "step": 2526 + }, + { + "epoch": 0.41, + "learning_rate": 9.217663231515567e-06, + "logits/chosen": -1.2300118207931519, + "logits/rejected": -1.2961726188659668, + "logps/chosen": -208.21575927734375, + "logps/rejected": -147.62197875976562, + "loss": 0.5408, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.133978366851807, + "rewards/margins": -0.46892547607421875, + "rewards/rejected": 6.602903842926025, + "step": 2527 + }, + { + "epoch": 0.41, + "learning_rate": 9.21695722906443e-06, + "logits/chosen": -0.8943797945976257, + "logits/rejected": -0.5722516775131226, + "logps/chosen": -119.78005981445312, + "logps/rejected": -59.48212432861328, + "loss": 0.029, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.443966865539551, + "rewards/margins": 2.916390895843506, + "rewards/rejected": 4.527575969696045, + "step": 2528 + }, + { + "epoch": 0.41, + "learning_rate": 9.216250935261447e-06, + "logits/chosen": -1.3118374347686768, + "logits/rejected": -1.3898122310638428, + "logps/chosen": -128.3249053955078, + "logps/rejected": -92.96700286865234, + "loss": 1.0086, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.014692783355713, + "rewards/margins": -1.8294854164123535, + "rewards/rejected": 5.844178199768066, + "step": 2529 + }, + { + "epoch": 0.41, + "learning_rate": 9.215544350155423e-06, + "logits/chosen": -1.583207130432129, + "logits/rejected": -1.5139436721801758, + "logps/chosen": -86.79332733154297, + "logps/rejected": -66.35437774658203, + "loss": 0.3223, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.534127950668335, + "rewards/margins": 0.22332096099853516, + "rewards/rejected": 2.3108069896698, + "step": 2530 + }, + { + "epoch": 0.41, + "learning_rate": 9.21483747379517e-06, + "logits/chosen": -1.021803617477417, + "logits/rejected": -0.9788996577262878, + "logps/chosen": -56.734092712402344, + "logps/rejected": -98.09257507324219, + "loss": 0.5271, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9890480041503906, + "rewards/margins": 0.8328315019607544, + "rewards/rejected": 1.1562165021896362, + "step": 2531 + }, + { + "epoch": 0.41, + "learning_rate": 9.21413030622953e-06, + "logits/chosen": -0.5945389270782471, + "logits/rejected": -0.5945389270782471, + "logps/chosen": -66.74252319335938, + "logps/rejected": -66.74252319335938, + "loss": 0.6269, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8628898859024048, + "rewards/margins": 0.0, + "rewards/rejected": 0.8628898859024048, + "step": 2532 + }, + { + "epoch": 0.41, + "learning_rate": 9.213422847507358e-06, + "logits/chosen": -0.7174690961837769, + "logits/rejected": -0.7207720279693604, + "logps/chosen": -3.095754861831665, + "logps/rejected": -4.472954750061035, + "loss": 1.1077, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1722043752670288, + "rewards/margins": -0.1433624029159546, + "rewards/rejected": 0.3155667781829834, + "step": 2533 + }, + { + "epoch": 0.41, + "learning_rate": 9.212715097677537e-06, + "logits/chosen": -0.8477425575256348, + "logits/rejected": -0.8150972127914429, + "logps/chosen": -59.26002883911133, + "logps/rejected": -77.02564239501953, + "loss": 1.282, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.900681734085083, + "rewards/margins": -1.1237483024597168, + "rewards/rejected": 3.0244300365448, + "step": 2534 + }, + { + "epoch": 0.41, + "learning_rate": 9.212007056788965e-06, + "logits/chosen": -0.978234589099884, + "logits/rejected": -0.7346203327178955, + "logps/chosen": -114.08567810058594, + "logps/rejected": -51.116065979003906, + "loss": 0.0176, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.295353889465332, + "rewards/margins": 3.65163516998291, + "rewards/rejected": 1.6437187194824219, + "step": 2535 + }, + { + "epoch": 0.41, + "learning_rate": 9.211298724890558e-06, + "logits/chosen": -1.2703819274902344, + "logits/rejected": -1.0294811725616455, + "logps/chosen": -112.65675354003906, + "logps/rejected": -21.792224884033203, + "loss": 0.1408, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6788864135742188, + "rewards/margins": 3.380629301071167, + "rewards/rejected": 0.29825708270072937, + "step": 2536 + }, + { + "epoch": 0.41, + "learning_rate": 9.210590102031257e-06, + "logits/chosen": -1.0390360355377197, + "logits/rejected": -0.9865265488624573, + "logps/chosen": -50.529685974121094, + "logps/rejected": -59.137306213378906, + "loss": 0.4067, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.230518341064453, + "rewards/margins": -0.16359949111938477, + "rewards/rejected": 2.394117832183838, + "step": 2537 + }, + { + "epoch": 0.41, + "learning_rate": 9.209881188260021e-06, + "logits/chosen": -0.9463124871253967, + "logits/rejected": -0.9480205774307251, + "logps/chosen": -5.371768951416016, + "logps/rejected": -11.723068237304688, + "loss": 0.3317, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5934417843818665, + "rewards/margins": 0.14155197143554688, + "rewards/rejected": 0.4518898129463196, + "step": 2538 + }, + { + "epoch": 0.41, + "learning_rate": 9.209171983625828e-06, + "logits/chosen": -0.8394628167152405, + "logits/rejected": -0.8382869362831116, + "logps/chosen": -14.565130233764648, + "logps/rejected": -14.520004272460938, + "loss": 0.6201, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.059685613960027695, + "rewards/margins": -0.3704586923122406, + "rewards/rejected": 0.4301443099975586, + "step": 2539 + }, + { + "epoch": 0.41, + "learning_rate": 9.208462488177679e-06, + "logits/chosen": -0.869407057762146, + "logits/rejected": -0.8622036576271057, + "logps/chosen": -17.667293548583984, + "logps/rejected": -0.5905320048332214, + "loss": 0.7708, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2618551254272461, + "rewards/margins": 0.1358317881822586, + "rewards/rejected": 0.1260233372449875, + "step": 2540 + }, + { + "epoch": 0.41, + "learning_rate": 9.207752701964595e-06, + "logits/chosen": -0.8668415546417236, + "logits/rejected": -0.8416810631752014, + "logps/chosen": -41.90889358520508, + "logps/rejected": -56.15354919433594, + "loss": 0.5337, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.972806930541992, + "rewards/margins": 0.4349558353424072, + "rewards/rejected": 3.537851095199585, + "step": 2541 + }, + { + "epoch": 0.41, + "learning_rate": 9.207042625035612e-06, + "logits/chosen": -0.6804572939872742, + "logits/rejected": -0.7731508612632751, + "logps/chosen": -42.762596130371094, + "logps/rejected": -92.29762268066406, + "loss": 0.7212, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3011764287948608, + "rewards/margins": -1.0601152181625366, + "rewards/rejected": 2.3612916469573975, + "step": 2542 + }, + { + "epoch": 0.41, + "learning_rate": 9.20633225743979e-06, + "logits/chosen": -0.9805650115013123, + "logits/rejected": -0.8704425096511841, + "logps/chosen": -161.28939819335938, + "logps/rejected": -50.25299072265625, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.893155097961426, + "rewards/margins": 3.686798334121704, + "rewards/rejected": 3.2063567638397217, + "step": 2543 + }, + { + "epoch": 0.41, + "learning_rate": 9.205621599226209e-06, + "logits/chosen": -1.1312687397003174, + "logits/rejected": -1.124525785446167, + "logps/chosen": -91.20721435546875, + "logps/rejected": -90.52278137207031, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.189271450042725, + "rewards/margins": 5.606832504272461, + "rewards/rejected": 1.5824387073516846, + "step": 2544 + }, + { + "epoch": 0.41, + "learning_rate": 9.204910650443972e-06, + "logits/chosen": -1.2982003688812256, + "logits/rejected": -1.2668589353561401, + "logps/chosen": -74.35348510742188, + "logps/rejected": -190.3194580078125, + "loss": 3.2508, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6489853858947754, + "rewards/margins": -6.33065938949585, + "rewards/rejected": 8.979644775390625, + "step": 2545 + }, + { + "epoch": 0.41, + "learning_rate": 9.204199411142196e-06, + "logits/chosen": -0.7532612681388855, + "logits/rejected": -0.890515923500061, + "logps/chosen": -215.8206787109375, + "logps/rejected": -85.75642395019531, + "loss": 1.3919, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.609363079071045, + "rewards/margins": -2.613429546356201, + "rewards/rejected": 8.222792625427246, + "step": 2546 + }, + { + "epoch": 0.41, + "learning_rate": 9.20348788137002e-06, + "logits/chosen": -0.891961932182312, + "logits/rejected": -1.0507395267486572, + "logps/chosen": -29.29096031188965, + "logps/rejected": -115.73115539550781, + "loss": 3.1275, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4036505222320557, + "rewards/margins": -5.657275199890137, + "rewards/rejected": 8.060925483703613, + "step": 2547 + }, + { + "epoch": 0.41, + "learning_rate": 9.202776061176606e-06, + "logits/chosen": -0.8651117086410522, + "logits/rejected": -0.6683793663978577, + "logps/chosen": -233.1218719482422, + "logps/rejected": -50.64884948730469, + "loss": 0.2464, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6508895754814148, + "rewards/margins": 0.46076202392578125, + "rewards/rejected": 0.19012756645679474, + "step": 2548 + }, + { + "epoch": 0.41, + "learning_rate": 9.202063950611133e-06, + "logits/chosen": -1.2058688402175903, + "logits/rejected": -1.0937708616256714, + "logps/chosen": -130.33963012695312, + "logps/rejected": -84.07838439941406, + "loss": 0.5813, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.756106853485107, + "rewards/margins": 2.132554054260254, + "rewards/rejected": 3.6235527992248535, + "step": 2549 + }, + { + "epoch": 0.41, + "learning_rate": 9.201351549722801e-06, + "logits/chosen": -1.3102744817733765, + "logits/rejected": -0.9260371923446655, + "logps/chosen": -56.473323822021484, + "logps/rejected": -78.44657135009766, + "loss": 0.6943, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2866132259368896, + "rewards/margins": 0.6239408254623413, + "rewards/rejected": 1.6626724004745483, + "step": 2550 + }, + { + "epoch": 0.41, + "learning_rate": 9.20063885856083e-06, + "logits/chosen": -1.1169377565383911, + "logits/rejected": -1.0791041851043701, + "logps/chosen": -71.41767120361328, + "logps/rejected": -13.669328689575195, + "loss": 0.6834, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4729362726211548, + "rewards/margins": 0.5918337106704712, + "rewards/rejected": 0.8811025619506836, + "step": 2551 + }, + { + "epoch": 0.41, + "learning_rate": 9.199925877174462e-06, + "logits/chosen": -0.7177534103393555, + "logits/rejected": -0.7637656331062317, + "logps/chosen": -28.873924255371094, + "logps/rejected": -89.93148803710938, + "loss": 0.5318, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8854911923408508, + "rewards/margins": 0.02643585205078125, + "rewards/rejected": 0.8590553402900696, + "step": 2552 + }, + { + "epoch": 0.41, + "learning_rate": 9.199212605612954e-06, + "logits/chosen": -0.546679675579071, + "logits/rejected": -0.5587242245674133, + "logps/chosen": -4.955740451812744, + "logps/rejected": -23.66939926147461, + "loss": 0.6074, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3413587212562561, + "rewards/margins": 0.13652397692203522, + "rewards/rejected": 0.2048347443342209, + "step": 2553 + }, + { + "epoch": 0.41, + "learning_rate": 9.198499043925591e-06, + "logits/chosen": -0.6391192674636841, + "logits/rejected": -0.5980490446090698, + "logps/chosen": -72.68071746826172, + "logps/rejected": -45.779632568359375, + "loss": 0.7234, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5910758972167969, + "rewards/margins": -0.8937981128692627, + "rewards/rejected": 2.4848740100860596, + "step": 2554 + }, + { + "epoch": 0.41, + "learning_rate": 9.19778519216167e-06, + "logits/chosen": -0.817725658416748, + "logits/rejected": -0.5906213521957397, + "logps/chosen": -46.4625358581543, + "logps/rejected": -24.697072982788086, + "loss": 0.4921, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3040790557861328, + "rewards/margins": 0.9126975536346436, + "rewards/rejected": 0.39138147234916687, + "step": 2555 + }, + { + "epoch": 0.41, + "learning_rate": 9.19707105037051e-06, + "logits/chosen": -1.0266538858413696, + "logits/rejected": -0.8654606342315674, + "logps/chosen": -99.44363403320312, + "logps/rejected": -56.285560607910156, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.203271389007568, + "rewards/margins": 3.8835806846618652, + "rewards/rejected": 2.319690704345703, + "step": 2556 + }, + { + "epoch": 0.42, + "learning_rate": 9.196356618601455e-06, + "logits/chosen": -0.9138749241828918, + "logits/rejected": -0.8782238960266113, + "logps/chosen": -84.25456237792969, + "logps/rejected": -60.261474609375, + "loss": 1.3939, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.693048119544983, + "rewards/margins": -0.5340806245803833, + "rewards/rejected": 2.227128744125366, + "step": 2557 + }, + { + "epoch": 0.42, + "learning_rate": 9.195641896903863e-06, + "logits/chosen": -0.8236832022666931, + "logits/rejected": -0.879493236541748, + "logps/chosen": -56.024200439453125, + "logps/rejected": -99.46864318847656, + "loss": 0.5788, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4574952125549316, + "rewards/margins": -0.7047407627105713, + "rewards/rejected": 3.162235975265503, + "step": 2558 + }, + { + "epoch": 0.42, + "learning_rate": 9.194926885327116e-06, + "logits/chosen": -1.2157008647918701, + "logits/rejected": -1.2230803966522217, + "logps/chosen": -39.931453704833984, + "logps/rejected": -65.5361557006836, + "loss": 0.7813, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8592373132705688, + "rewards/margins": -0.32615697383880615, + "rewards/rejected": 2.185394287109375, + "step": 2559 + }, + { + "epoch": 0.42, + "learning_rate": 9.194211583920613e-06, + "logits/chosen": -0.8544781804084778, + "logits/rejected": -0.8217674493789673, + "logps/chosen": -131.40719604492188, + "logps/rejected": -102.421875, + "loss": 1.5551, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9153960943222046, + "rewards/margins": -3.061367988586426, + "rewards/rejected": 4.97676420211792, + "step": 2560 + }, + { + "epoch": 0.42, + "learning_rate": 9.193495992733779e-06, + "logits/chosen": -1.217411994934082, + "logits/rejected": -1.075793981552124, + "logps/chosen": -104.15230560302734, + "logps/rejected": -33.31551742553711, + "loss": 0.198, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8352699279785156, + "rewards/margins": 0.920474648475647, + "rewards/rejected": -0.08520469814538956, + "step": 2561 + }, + { + "epoch": 0.42, + "learning_rate": 9.192780111816048e-06, + "logits/chosen": -1.2837530374526978, + "logits/rejected": -1.3489515781402588, + "logps/chosen": -90.32978820800781, + "logps/rejected": -87.27396392822266, + "loss": 1.8686, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.108394145965576, + "rewards/margins": 0.5534052848815918, + "rewards/rejected": 6.554988861083984, + "step": 2562 + }, + { + "epoch": 0.42, + "learning_rate": 9.192063941216884e-06, + "logits/chosen": -0.5910680890083313, + "logits/rejected": -0.5887428522109985, + "logps/chosen": -5.609270095825195, + "logps/rejected": -3.7666680812835693, + "loss": 0.457, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2899486720561981, + "rewards/margins": -0.1411985158920288, + "rewards/rejected": 0.43114718794822693, + "step": 2563 + }, + { + "epoch": 0.42, + "learning_rate": 9.191347480985768e-06, + "logits/chosen": -1.015698790550232, + "logits/rejected": -1.0659403800964355, + "logps/chosen": -81.70770263671875, + "logps/rejected": -98.88968658447266, + "loss": 1.2862, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1272027492523193, + "rewards/margins": -2.3879849910736084, + "rewards/rejected": 5.515187740325928, + "step": 2564 + }, + { + "epoch": 0.42, + "learning_rate": 9.1906307311722e-06, + "logits/chosen": -0.7351839542388916, + "logits/rejected": -0.7351839542388916, + "logps/chosen": -55.20201110839844, + "logps/rejected": -55.20201110839844, + "loss": 0.348, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5136478543281555, + "rewards/margins": 0.0, + "rewards/rejected": 0.5136478543281555, + "step": 2565 + }, + { + "epoch": 0.42, + "learning_rate": 9.1899136918257e-06, + "logits/chosen": -0.5937431454658508, + "logits/rejected": -0.5737488865852356, + "logps/chosen": -14.230225563049316, + "logps/rejected": -1.7955858707427979, + "loss": 1.3777, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.34130069613456726, + "rewards/margins": -0.061217308044433594, + "rewards/rejected": 0.40251800417900085, + "step": 2566 + }, + { + "epoch": 0.42, + "learning_rate": 9.18919636299581e-06, + "logits/chosen": -1.1265852451324463, + "logits/rejected": -1.0940099954605103, + "logps/chosen": -53.64480209350586, + "logps/rejected": -30.528430938720703, + "loss": 0.2592, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7584691047668457, + "rewards/margins": 0.5287845134735107, + "rewards/rejected": 2.229684591293335, + "step": 2567 + }, + { + "epoch": 0.42, + "learning_rate": 9.18847874473209e-06, + "logits/chosen": -1.0435564517974854, + "logits/rejected": -0.9633494019508362, + "logps/chosen": -95.30839538574219, + "logps/rejected": -126.24495697021484, + "loss": 1.407, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.9709579944610596, + "rewards/margins": -2.4794304370880127, + "rewards/rejected": 6.450388431549072, + "step": 2568 + }, + { + "epoch": 0.42, + "learning_rate": 9.18776083708412e-06, + "logits/chosen": -1.1137269735336304, + "logits/rejected": -1.0891529321670532, + "logps/chosen": -62.453346252441406, + "logps/rejected": -44.63493728637695, + "loss": 0.6304, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.244097113609314, + "rewards/margins": 0.052150726318359375, + "rewards/rejected": 1.1919463872909546, + "step": 2569 + }, + { + "epoch": 0.42, + "learning_rate": 9.1870426401015e-06, + "logits/chosen": -0.7901538014411926, + "logits/rejected": -0.7901538014411926, + "logps/chosen": -90.49515533447266, + "logps/rejected": -90.49515533447266, + "loss": 2.2619, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1811745166778564, + "rewards/margins": 0.0, + "rewards/rejected": 1.1811745166778564, + "step": 2570 + }, + { + "epoch": 0.42, + "learning_rate": 9.186324153833853e-06, + "logits/chosen": -0.9806663393974304, + "logits/rejected": -0.981819748878479, + "logps/chosen": -45.3111572265625, + "logps/rejected": -80.67865753173828, + "loss": 0.7915, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7675682306289673, + "rewards/margins": 1.240289330482483, + "rewards/rejected": 0.5272789001464844, + "step": 2571 + }, + { + "epoch": 0.42, + "learning_rate": 9.18560537833082e-06, + "logits/chosen": -1.0411962270736694, + "logits/rejected": -0.994203507900238, + "logps/chosen": -54.26728057861328, + "logps/rejected": -49.49419021606445, + "loss": 0.8821, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5890426635742188, + "rewards/margins": -0.673515796661377, + "rewards/rejected": 2.2625584602355957, + "step": 2572 + }, + { + "epoch": 0.42, + "learning_rate": 9.184886313642056e-06, + "logits/chosen": -1.1881568431854248, + "logits/rejected": -1.0337644815444946, + "logps/chosen": -170.69610595703125, + "logps/rejected": -81.47560119628906, + "loss": 0.1793, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.691384792327881, + "rewards/margins": 0.9217357635498047, + "rewards/rejected": 5.769649028778076, + "step": 2573 + }, + { + "epoch": 0.42, + "learning_rate": 9.184166959817247e-06, + "logits/chosen": -1.247051477432251, + "logits/rejected": -1.3881036043167114, + "logps/chosen": -100.50704956054688, + "logps/rejected": -35.49258804321289, + "loss": 0.4127, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8219696283340454, + "rewards/margins": 1.6459789276123047, + "rewards/rejected": 0.17599068582057953, + "step": 2574 + }, + { + "epoch": 0.42, + "learning_rate": 9.183447316906094e-06, + "logits/chosen": -0.9628379940986633, + "logits/rejected": -0.9628379940986633, + "logps/chosen": -52.93172073364258, + "logps/rejected": -52.93172073364258, + "loss": 0.4127, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.512519598007202, + "rewards/margins": 0.0, + "rewards/rejected": 3.512519598007202, + "step": 2575 + }, + { + "epoch": 0.42, + "learning_rate": 9.182727384958314e-06, + "logits/chosen": -0.504039466381073, + "logits/rejected": -0.504039466381073, + "logps/chosen": -4.801723957061768, + "logps/rejected": -4.801723957061768, + "loss": 0.5637, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.760895311832428, + "rewards/margins": 0.0, + "rewards/rejected": 0.760895311832428, + "step": 2576 + }, + { + "epoch": 0.42, + "learning_rate": 9.18200716402365e-06, + "logits/chosen": -1.252480387687683, + "logits/rejected": -1.202934741973877, + "logps/chosen": -60.292457580566406, + "logps/rejected": -53.64509582519531, + "loss": 0.7624, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3228797912597656, + "rewards/margins": -0.9392402172088623, + "rewards/rejected": 3.262120008468628, + "step": 2577 + }, + { + "epoch": 0.42, + "learning_rate": 9.18128665415186e-06, + "logits/chosen": -0.9654319882392883, + "logits/rejected": -0.9273070096969604, + "logps/chosen": -35.57804870605469, + "logps/rejected": -63.24577331542969, + "loss": 0.9404, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8941047191619873, + "rewards/margins": 1.6329779624938965, + "rewards/rejected": 1.2611267566680908, + "step": 2578 + }, + { + "epoch": 0.42, + "learning_rate": 9.180565855392726e-06, + "logits/chosen": -1.277945876121521, + "logits/rejected": -1.2780044078826904, + "logps/chosen": -101.15385437011719, + "logps/rejected": -70.56185913085938, + "loss": 0.1856, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.985382080078125, + "rewards/margins": 0.8253371715545654, + "rewards/rejected": 1.1600449085235596, + "step": 2579 + }, + { + "epoch": 0.42, + "learning_rate": 9.17984476779605e-06, + "logits/chosen": -0.9923059940338135, + "logits/rejected": -0.9603485465049744, + "logps/chosen": -32.38884353637695, + "logps/rejected": -67.8003921508789, + "loss": 0.5994, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.010866165161133, + "rewards/margins": -0.1335468292236328, + "rewards/rejected": 2.1444129943847656, + "step": 2580 + }, + { + "epoch": 0.42, + "learning_rate": 9.179123391411648e-06, + "logits/chosen": -0.790823221206665, + "logits/rejected": -0.8122740983963013, + "logps/chosen": -49.126068115234375, + "logps/rejected": -59.130271911621094, + "loss": 0.7431, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5274345278739929, + "rewards/margins": -0.6848755478858948, + "rewards/rejected": 1.2123100757598877, + "step": 2581 + }, + { + "epoch": 0.42, + "learning_rate": 9.178401726289366e-06, + "logits/chosen": -0.8439351916313171, + "logits/rejected": -0.8606879711151123, + "logps/chosen": -39.33804702758789, + "logps/rejected": -79.98210906982422, + "loss": 0.9403, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.744462251663208, + "rewards/margins": 0.9347614645957947, + "rewards/rejected": 0.8097007870674133, + "step": 2582 + }, + { + "epoch": 0.42, + "learning_rate": 9.177679772479058e-06, + "logits/chosen": -1.557212233543396, + "logits/rejected": -1.523447871208191, + "logps/chosen": -60.82073974609375, + "logps/rejected": -64.66098022460938, + "loss": 1.0147, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4754128456115723, + "rewards/margins": -1.4987356662750244, + "rewards/rejected": 3.9741485118865967, + "step": 2583 + }, + { + "epoch": 0.42, + "learning_rate": 9.176957530030609e-06, + "logits/chosen": -0.8427794575691223, + "logits/rejected": -0.8176414966583252, + "logps/chosen": -108.52182006835938, + "logps/rejected": -72.97953033447266, + "loss": 0.2708, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7232887744903564, + "rewards/margins": 0.5693634748458862, + "rewards/rejected": 1.1539252996444702, + "step": 2584 + }, + { + "epoch": 0.42, + "learning_rate": 9.176234998993917e-06, + "logits/chosen": -1.364653468132019, + "logits/rejected": -1.4722740650177002, + "logps/chosen": -190.6443328857422, + "logps/rejected": -86.00291442871094, + "loss": 0.1618, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.578405857086182, + "rewards/margins": 1.248347282409668, + "rewards/rejected": 4.330058574676514, + "step": 2585 + }, + { + "epoch": 0.42, + "learning_rate": 9.175512179418903e-06, + "logits/chosen": -0.5869868993759155, + "logits/rejected": -0.6009423136711121, + "logps/chosen": -86.68278503417969, + "logps/rejected": -113.15036010742188, + "loss": 0.4129, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8423645496368408, + "rewards/margins": 0.47509467601776123, + "rewards/rejected": 1.3672698736190796, + "step": 2586 + }, + { + "epoch": 0.42, + "learning_rate": 9.174789071355506e-06, + "logits/chosen": -0.9212647080421448, + "logits/rejected": -0.7412660121917725, + "logps/chosen": -153.61337280273438, + "logps/rejected": -44.04917907714844, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.209280490875244, + "rewards/margins": 4.8234453201293945, + "rewards/rejected": 1.3858349323272705, + "step": 2587 + }, + { + "epoch": 0.42, + "learning_rate": 9.174065674853687e-06, + "logits/chosen": -0.913756787776947, + "logits/rejected": -0.8938507437705994, + "logps/chosen": -32.84315490722656, + "logps/rejected": -11.15134334564209, + "loss": 0.4988, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9854778051376343, + "rewards/margins": 1.0331010818481445, + "rewards/rejected": 0.952376663684845, + "step": 2588 + }, + { + "epoch": 0.42, + "learning_rate": 9.173341989963424e-06, + "logits/chosen": -1.1522961854934692, + "logits/rejected": -0.9725726246833801, + "logps/chosen": -140.78167724609375, + "logps/rejected": -106.28168487548828, + "loss": 0.4162, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.051922559738159, + "rewards/margins": 0.7504080533981323, + "rewards/rejected": 1.3015145063400269, + "step": 2589 + }, + { + "epoch": 0.42, + "learning_rate": 9.172618016734718e-06, + "logits/chosen": -1.2823489904403687, + "logits/rejected": -0.9042088985443115, + "logps/chosen": -152.041748046875, + "logps/rejected": -29.76422882080078, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.393316745758057, + "rewards/margins": 4.4113688468933105, + "rewards/rejected": 0.9819477200508118, + "step": 2590 + }, + { + "epoch": 0.42, + "learning_rate": 9.17189375521759e-06, + "logits/chosen": -1.112114667892456, + "logits/rejected": -1.1143025159835815, + "logps/chosen": -35.32218933105469, + "logps/rejected": -92.33566284179688, + "loss": 1.4258, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0448760986328125, + "rewards/margins": 0.05242609977722168, + "rewards/rejected": 2.992449998855591, + "step": 2591 + }, + { + "epoch": 0.42, + "learning_rate": 9.171169205462078e-06, + "logits/chosen": -1.2554049491882324, + "logits/rejected": -1.2205654382705688, + "logps/chosen": -66.84893798828125, + "logps/rejected": -46.88465118408203, + "loss": 1.1148, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2968491315841675, + "rewards/margins": -1.7534102201461792, + "rewards/rejected": 3.0502593517303467, + "step": 2592 + }, + { + "epoch": 0.42, + "learning_rate": 9.170444367518243e-06, + "logits/chosen": -0.8332541584968567, + "logits/rejected": -0.7174879908561707, + "logps/chosen": -57.860267639160156, + "logps/rejected": -50.76660919189453, + "loss": 1.5039, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5196800231933594, + "rewards/margins": 1.216759443283081, + "rewards/rejected": 1.3029205799102783, + "step": 2593 + }, + { + "epoch": 0.42, + "learning_rate": 9.169719241436162e-06, + "logits/chosen": -0.9197719097137451, + "logits/rejected": -0.9454191327095032, + "logps/chosen": -97.5642318725586, + "logps/rejected": -62.52370834350586, + "loss": 0.0959, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.925342559814453, + "rewards/margins": 2.491687297821045, + "rewards/rejected": 1.4336551427841187, + "step": 2594 + }, + { + "epoch": 0.42, + "learning_rate": 9.168993827265935e-06, + "logits/chosen": -1.2732499837875366, + "logits/rejected": -1.2756081819534302, + "logps/chosen": -60.697471618652344, + "logps/rejected": -83.49738311767578, + "loss": 0.4164, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7994377613067627, + "rewards/margins": -0.25859689712524414, + "rewards/rejected": 3.058034658432007, + "step": 2595 + }, + { + "epoch": 0.42, + "learning_rate": 9.168268125057682e-06, + "logits/chosen": -1.0789231061935425, + "logits/rejected": -0.9368137717247009, + "logps/chosen": -101.63093566894531, + "logps/rejected": -31.24083709716797, + "loss": 0.0155, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.096823215484619, + "rewards/margins": 3.703629732131958, + "rewards/rejected": 2.393193483352661, + "step": 2596 + }, + { + "epoch": 0.42, + "learning_rate": 9.167542134861543e-06, + "logits/chosen": -0.8073171973228455, + "logits/rejected": -0.9224022030830383, + "logps/chosen": -98.9806900024414, + "logps/rejected": -99.64218139648438, + "loss": 3.3016, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7637656927108765, + "rewards/margins": -1.0044502019882202, + "rewards/rejected": 2.7682158946990967, + "step": 2597 + }, + { + "epoch": 0.42, + "learning_rate": 9.166815856727676e-06, + "logits/chosen": -1.1796804666519165, + "logits/rejected": -1.1995296478271484, + "logps/chosen": -65.7328872680664, + "logps/rejected": -113.0311279296875, + "loss": 0.5859, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5015121698379517, + "rewards/margins": -0.19595491886138916, + "rewards/rejected": 1.6974670886993408, + "step": 2598 + }, + { + "epoch": 0.42, + "learning_rate": 9.16608929070626e-06, + "logits/chosen": -0.6314095258712769, + "logits/rejected": -0.7288467288017273, + "logps/chosen": -70.21621704101562, + "logps/rejected": -104.07677459716797, + "loss": 1.5362, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.404266357421875, + "rewards/margins": -1.4116119146347046, + "rewards/rejected": 1.8158782720565796, + "step": 2599 + }, + { + "epoch": 0.42, + "learning_rate": 9.165362436847493e-06, + "logits/chosen": -1.2683863639831543, + "logits/rejected": -1.246800422668457, + "logps/chosen": -88.24705505371094, + "logps/rejected": -69.31668090820312, + "loss": 2.3106, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1871049404144287, + "rewards/margins": -1.5640478134155273, + "rewards/rejected": 3.751152753829956, + "step": 2600 + }, + { + "epoch": 0.42, + "learning_rate": 9.164635295201597e-06, + "logits/chosen": -1.1029132604599, + "logits/rejected": -1.1785550117492676, + "logps/chosen": -115.74291229248047, + "logps/rejected": -169.0677490234375, + "loss": 2.5621, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7218223810195923, + "rewards/margins": -5.091353893280029, + "rewards/rejected": 5.813176155090332, + "step": 2601 + }, + { + "epoch": 0.42, + "learning_rate": 9.163907865818806e-06, + "logits/chosen": -1.1414531469345093, + "logits/rejected": -1.2181663513183594, + "logps/chosen": -109.29383087158203, + "logps/rejected": -126.7524642944336, + "loss": 0.67, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.857570648193359, + "rewards/margins": -0.2578887939453125, + "rewards/rejected": 6.115459442138672, + "step": 2602 + }, + { + "epoch": 0.42, + "learning_rate": 9.163180148749381e-06, + "logits/chosen": -0.8249343037605286, + "logits/rejected": -1.142161250114441, + "logps/chosen": -106.77701568603516, + "logps/rejected": -64.75113677978516, + "loss": 0.7677, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2036399841308594, + "rewards/margins": -1.2523994445800781, + "rewards/rejected": 3.4560394287109375, + "step": 2603 + }, + { + "epoch": 0.42, + "learning_rate": 9.1624521440436e-06, + "logits/chosen": -0.9532864093780518, + "logits/rejected": -0.9123585820198059, + "logps/chosen": -156.47735595703125, + "logps/rejected": -115.98739624023438, + "loss": 0.0591, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.583021640777588, + "rewards/margins": 3.960679531097412, + "rewards/rejected": 1.6223419904708862, + "step": 2604 + }, + { + "epoch": 0.42, + "learning_rate": 9.161723851751763e-06, + "logits/chosen": -1.0699495077133179, + "logits/rejected": -1.0184985399246216, + "logps/chosen": -84.07038879394531, + "logps/rejected": -88.44636535644531, + "loss": 0.2757, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.575415134429932, + "rewards/margins": 0.3271803855895996, + "rewards/rejected": 4.248234748840332, + "step": 2605 + }, + { + "epoch": 0.42, + "learning_rate": 9.160995271924185e-06, + "logits/chosen": -1.236938238143921, + "logits/rejected": -1.3582968711853027, + "logps/chosen": -166.64642333984375, + "logps/rejected": -77.75582122802734, + "loss": 0.0342, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.696325778961182, + "rewards/margins": 3.2177772521972656, + "rewards/rejected": 3.478548526763916, + "step": 2606 + }, + { + "epoch": 0.42, + "learning_rate": 9.160266404611206e-06, + "logits/chosen": -0.7010979056358337, + "logits/rejected": -0.7550027966499329, + "logps/chosen": -36.99449157714844, + "logps/rejected": -105.33930969238281, + "loss": 0.7084, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.624884009361267, + "rewards/margins": 0.963000476360321, + "rewards/rejected": 0.661883533000946, + "step": 2607 + }, + { + "epoch": 0.42, + "learning_rate": 9.159537249863182e-06, + "logits/chosen": -1.1758655309677124, + "logits/rejected": -0.9618759751319885, + "logps/chosen": -131.96377563476562, + "logps/rejected": -93.80110168457031, + "loss": 0.8002, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.677646160125732, + "rewards/margins": 2.222574234008789, + "rewards/rejected": 4.455071926116943, + "step": 2608 + }, + { + "epoch": 0.42, + "learning_rate": 9.158807807730493e-06, + "logits/chosen": -1.170865535736084, + "logits/rejected": -1.1435860395431519, + "logps/chosen": -103.93412017822266, + "logps/rejected": -85.10452270507812, + "loss": 0.6185, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.164435863494873, + "rewards/margins": 3.0551042556762695, + "rewards/rejected": 2.1093316078186035, + "step": 2609 + }, + { + "epoch": 0.42, + "learning_rate": 9.158078078263536e-06, + "logits/chosen": -1.0753908157348633, + "logits/rejected": -1.0476154088974, + "logps/chosen": -63.97626495361328, + "logps/rejected": -62.60763931274414, + "loss": 1.1589, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.32867431640625, + "rewards/margins": -0.7962605953216553, + "rewards/rejected": 2.1249349117279053, + "step": 2610 + }, + { + "epoch": 0.42, + "learning_rate": 9.157348061512728e-06, + "logits/chosen": -1.1347293853759766, + "logits/rejected": -0.9675013422966003, + "logps/chosen": -123.49205017089844, + "logps/rejected": -53.51385498046875, + "loss": 0.2241, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.541285753250122, + "rewards/margins": 1.2615141868591309, + "rewards/rejected": 2.279771566390991, + "step": 2611 + }, + { + "epoch": 0.42, + "learning_rate": 9.156617757528505e-06, + "logits/chosen": -0.7262248992919922, + "logits/rejected": -0.7709484696388245, + "logps/chosen": -56.1102294921875, + "logps/rejected": -57.848052978515625, + "loss": 0.6124, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7867813110351562, + "rewards/margins": -0.6574981212615967, + "rewards/rejected": 2.444279432296753, + "step": 2612 + }, + { + "epoch": 0.42, + "learning_rate": 9.155887166361326e-06, + "logits/chosen": -0.9025213122367859, + "logits/rejected": -0.9616957902908325, + "logps/chosen": -90.24227905273438, + "logps/rejected": -56.47332763671875, + "loss": 0.695, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.162493944168091, + "rewards/margins": -0.5120651721954346, + "rewards/rejected": 2.6745591163635254, + "step": 2613 + }, + { + "epoch": 0.42, + "learning_rate": 9.155156288061666e-06, + "logits/chosen": -0.91887366771698, + "logits/rejected": -0.969736635684967, + "logps/chosen": -66.252197265625, + "logps/rejected": -125.25948333740234, + "loss": 2.1244, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.767499566078186, + "rewards/margins": -2.771200656890869, + "rewards/rejected": 4.538700103759766, + "step": 2614 + }, + { + "epoch": 0.42, + "learning_rate": 9.154425122680024e-06, + "logits/chosen": -0.8675822615623474, + "logits/rejected": -0.8510743379592896, + "logps/chosen": -85.46710205078125, + "logps/rejected": -52.006141662597656, + "loss": 0.3926, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2241883277893066, + "rewards/margins": -0.06845617294311523, + "rewards/rejected": 2.292644500732422, + "step": 2615 + }, + { + "epoch": 0.42, + "learning_rate": 9.153693670266915e-06, + "logits/chosen": -1.16313636302948, + "logits/rejected": -1.16313636302948, + "logps/chosen": -12.110481262207031, + "logps/rejected": -12.110481262207031, + "loss": 0.7265, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4519582986831665, + "rewards/margins": 0.0, + "rewards/rejected": 1.4519582986831665, + "step": 2616 + }, + { + "epoch": 0.42, + "learning_rate": 9.152961930872877e-06, + "logits/chosen": -1.23409903049469, + "logits/rejected": -1.1168763637542725, + "logps/chosen": -49.965415954589844, + "logps/rejected": -33.83854675292969, + "loss": 0.3981, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.279294729232788, + "rewards/margins": -0.1848907470703125, + "rewards/rejected": 2.4641854763031006, + "step": 2617 + }, + { + "epoch": 0.42, + "learning_rate": 9.152229904548464e-06, + "logits/chosen": -1.0160220861434937, + "logits/rejected": -1.0191280841827393, + "logps/chosen": -64.50764465332031, + "logps/rejected": -70.04891967773438, + "loss": 1.3998, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.771527886390686, + "rewards/margins": -0.4666863679885864, + "rewards/rejected": 2.2382142543792725, + "step": 2618 + }, + { + "epoch": 0.43, + "learning_rate": 9.151497591344253e-06, + "logits/chosen": -1.2135435342788696, + "logits/rejected": -1.1769137382507324, + "logps/chosen": -61.1713752746582, + "logps/rejected": -83.10295104980469, + "loss": 0.1887, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1076762676239014, + "rewards/margins": 1.5489330291748047, + "rewards/rejected": 0.5587432980537415, + "step": 2619 + }, + { + "epoch": 0.43, + "learning_rate": 9.150764991310843e-06, + "logits/chosen": -1.1216670274734497, + "logits/rejected": -1.0528137683868408, + "logps/chosen": -67.26924133300781, + "logps/rejected": -83.95541381835938, + "loss": 0.0585, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7530410289764404, + "rewards/margins": 2.3074119091033936, + "rewards/rejected": 0.4456291198730469, + "step": 2620 + }, + { + "epoch": 0.43, + "learning_rate": 9.150032104498846e-06, + "logits/chosen": -0.8555387854576111, + "logits/rejected": -0.8206079006195068, + "logps/chosen": -66.38699340820312, + "logps/rejected": -65.95046997070312, + "loss": 0.2921, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.535510301589966, + "rewards/margins": 0.27411341667175293, + "rewards/rejected": 2.261396884918213, + "step": 2621 + }, + { + "epoch": 0.43, + "learning_rate": 9.149298930958896e-06, + "logits/chosen": -1.520132303237915, + "logits/rejected": -1.5456329584121704, + "logps/chosen": -65.9654769897461, + "logps/rejected": -117.51730346679688, + "loss": 0.845, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5564063787460327, + "rewards/margins": -1.4817131757736206, + "rewards/rejected": 3.0381195545196533, + "step": 2622 + }, + { + "epoch": 0.43, + "learning_rate": 9.148565470741652e-06, + "logits/chosen": -0.625126838684082, + "logits/rejected": -0.6422572731971741, + "logps/chosen": -83.98634338378906, + "logps/rejected": -96.41119384765625, + "loss": 0.8666, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8576141595840454, + "rewards/margins": -0.37262117862701416, + "rewards/rejected": 2.2302353382110596, + "step": 2623 + }, + { + "epoch": 0.43, + "learning_rate": 9.147831723897788e-06, + "logits/chosen": -0.8093703985214233, + "logits/rejected": -0.5839498043060303, + "logps/chosen": -76.30960845947266, + "logps/rejected": -12.284064292907715, + "loss": 0.1461, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.265918731689453, + "rewards/margins": 1.2234013080596924, + "rewards/rejected": 1.0425174236297607, + "step": 2624 + }, + { + "epoch": 0.43, + "learning_rate": 9.147097690478002e-06, + "logits/chosen": -1.2091619968414307, + "logits/rejected": -1.1349561214447021, + "logps/chosen": -122.67320251464844, + "logps/rejected": -101.57633972167969, + "loss": 0.2437, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.807445049285889, + "rewards/margins": 0.5936126708984375, + "rewards/rejected": 5.213832378387451, + "step": 2625 + }, + { + "epoch": 0.43, + "learning_rate": 9.146363370533004e-06, + "logits/chosen": -1.013542890548706, + "logits/rejected": -1.0127578973770142, + "logps/chosen": -68.3932113647461, + "logps/rejected": -72.88921356201172, + "loss": 0.3326, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9318153858184814, + "rewards/margins": 0.07264089584350586, + "rewards/rejected": 2.8591744899749756, + "step": 2626 + }, + { + "epoch": 0.43, + "learning_rate": 9.14562876411353e-06, + "logits/chosen": -0.8658761978149414, + "logits/rejected": -0.7865537405014038, + "logps/chosen": -123.51521301269531, + "logps/rejected": -73.5325927734375, + "loss": 0.0547, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.741844177246094, + "rewards/margins": 2.20283579826355, + "rewards/rejected": 2.539008378982544, + "step": 2627 + }, + { + "epoch": 0.43, + "learning_rate": 9.144893871270335e-06, + "logits/chosen": -1.1234339475631714, + "logits/rejected": -1.1498688459396362, + "logps/chosen": -72.50100708007812, + "logps/rejected": -129.40899658203125, + "loss": 1.5516, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4228501319885254, + "rewards/margins": -2.929867744445801, + "rewards/rejected": 5.352717876434326, + "step": 2628 + }, + { + "epoch": 0.43, + "learning_rate": 9.144158692054192e-06, + "logits/chosen": -0.9128024578094482, + "logits/rejected": -0.9346609115600586, + "logps/chosen": -48.140235900878906, + "logps/rejected": -89.87089538574219, + "loss": 0.311, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7395156621932983, + "rewards/margins": 0.3894714117050171, + "rewards/rejected": 1.3500442504882812, + "step": 2629 + }, + { + "epoch": 0.43, + "learning_rate": 9.143423226515894e-06, + "logits/chosen": -0.9256672859191895, + "logits/rejected": -0.9269958734512329, + "logps/chosen": -63.2737922668457, + "logps/rejected": -64.79264068603516, + "loss": 0.4565, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1050167083740234, + "rewards/margins": -0.383652925491333, + "rewards/rejected": 2.4886696338653564, + "step": 2630 + }, + { + "epoch": 0.43, + "learning_rate": 9.14268747470626e-06, + "logits/chosen": -0.9988171458244324, + "logits/rejected": -0.888739287853241, + "logps/chosen": -122.14930725097656, + "logps/rejected": -103.38201904296875, + "loss": 0.4483, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.451704502105713, + "rewards/margins": -0.3552384376525879, + "rewards/rejected": 5.806942939758301, + "step": 2631 + }, + { + "epoch": 0.43, + "learning_rate": 9.141951436676119e-06, + "logits/chosen": -1.0966949462890625, + "logits/rejected": -0.9084857106208801, + "logps/chosen": -150.68893432617188, + "logps/rejected": -189.52223205566406, + "loss": 0.116, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.5186920166015625, + "rewards/margins": 1.3808608055114746, + "rewards/rejected": 3.137831211090088, + "step": 2632 + }, + { + "epoch": 0.43, + "learning_rate": 9.141215112476325e-06, + "logits/chosen": -0.771786093711853, + "logits/rejected": -0.7081595659255981, + "logps/chosen": -71.02310180664062, + "logps/rejected": -100.24649047851562, + "loss": 1.0937, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3939614295959473, + "rewards/margins": -0.611914873123169, + "rewards/rejected": 3.005876302719116, + "step": 2633 + }, + { + "epoch": 0.43, + "learning_rate": 9.14047850215775e-06, + "logits/chosen": -0.787384569644928, + "logits/rejected": -0.8093694448471069, + "logps/chosen": -10.322336196899414, + "logps/rejected": -1.975122332572937, + "loss": 0.5062, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.08102903515100479, + "rewards/margins": -0.20811045169830322, + "rewards/rejected": 0.2891394793987274, + "step": 2634 + }, + { + "epoch": 0.43, + "learning_rate": 9.139741605771291e-06, + "logits/chosen": -0.5846335291862488, + "logits/rejected": -0.5846335291862488, + "logps/chosen": -69.75343322753906, + "logps/rejected": -69.75343322753906, + "loss": 0.6997, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.874424695968628, + "rewards/margins": 0.0, + "rewards/rejected": 2.874424695968628, + "step": 2635 + }, + { + "epoch": 0.43, + "learning_rate": 9.139004423367854e-06, + "logits/chosen": -0.8278061747550964, + "logits/rejected": -0.7769807577133179, + "logps/chosen": -93.34977722167969, + "logps/rejected": -81.78018951416016, + "loss": 0.3222, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.197009325027466, + "rewards/margins": 1.883734941482544, + "rewards/rejected": 0.3132743835449219, + "step": 2636 + }, + { + "epoch": 0.43, + "learning_rate": 9.138266954998378e-06, + "logits/chosen": -1.1970546245574951, + "logits/rejected": -1.3276073932647705, + "logps/chosen": -175.80519104003906, + "logps/rejected": -150.44674682617188, + "loss": 1.22, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.297314643859863, + "rewards/margins": -2.1941699981689453, + "rewards/rejected": 8.491484642028809, + "step": 2637 + }, + { + "epoch": 0.43, + "learning_rate": 9.137529200713811e-06, + "logits/chosen": -0.7665315270423889, + "logits/rejected": -0.744513750076294, + "logps/chosen": -47.02788162231445, + "logps/rejected": -70.59131622314453, + "loss": 0.2764, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.094767451286316, + "rewards/margins": 0.30721938610076904, + "rewards/rejected": 0.7875480651855469, + "step": 2638 + }, + { + "epoch": 0.43, + "learning_rate": 9.136791160565126e-06, + "logits/chosen": -0.6027267575263977, + "logits/rejected": -0.59634929895401, + "logps/chosen": -66.93726348876953, + "logps/rejected": -51.81012725830078, + "loss": 0.603, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2174110412597656, + "rewards/margins": -0.060298919677734375, + "rewards/rejected": 1.2777099609375, + "step": 2639 + }, + { + "epoch": 0.43, + "learning_rate": 9.136052834603314e-06, + "logits/chosen": -0.4311242699623108, + "logits/rejected": -0.4431411623954773, + "logps/chosen": -8.182836532592773, + "logps/rejected": -51.3536491394043, + "loss": 1.2266, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.16061697900295258, + "rewards/margins": -0.10201539099216461, + "rewards/rejected": 0.2626323699951172, + "step": 2640 + }, + { + "epoch": 0.43, + "learning_rate": 9.135314222879388e-06, + "logits/chosen": -1.5008492469787598, + "logits/rejected": -1.4232970476150513, + "logps/chosen": -115.71678161621094, + "logps/rejected": -34.694889068603516, + "loss": 0.0879, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2402267456054688, + "rewards/margins": 2.0474281311035156, + "rewards/rejected": 0.19279861450195312, + "step": 2641 + }, + { + "epoch": 0.43, + "learning_rate": 9.134575325444377e-06, + "logits/chosen": -1.0172110795974731, + "logits/rejected": -0.9999514818191528, + "logps/chosen": -97.71569061279297, + "logps/rejected": -143.82215881347656, + "loss": 2.0714, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3828102350234985, + "rewards/margins": -3.4224772453308105, + "rewards/rejected": 4.8052873611450195, + "step": 2642 + }, + { + "epoch": 0.43, + "learning_rate": 9.133836142349332e-06, + "logits/chosen": -0.9287588000297546, + "logits/rejected": -0.9825677871704102, + "logps/chosen": -119.19717407226562, + "logps/rejected": -149.03485107421875, + "loss": 2.2486, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.678924560546875, + "rewards/margins": -3.7815141677856445, + "rewards/rejected": 5.4604387283325195, + "step": 2643 + }, + { + "epoch": 0.43, + "learning_rate": 9.133096673645325e-06, + "logits/chosen": -1.1265944242477417, + "logits/rejected": -1.03398859500885, + "logps/chosen": -87.79680633544922, + "logps/rejected": -65.9250259399414, + "loss": 0.1733, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.860708713531494, + "rewards/margins": 1.0416131019592285, + "rewards/rejected": 1.8190956115722656, + "step": 2644 + }, + { + "epoch": 0.43, + "learning_rate": 9.132356919383446e-06, + "logits/chosen": -0.9999443292617798, + "logits/rejected": -1.1008049249649048, + "logps/chosen": -104.84209442138672, + "logps/rejected": -159.9849395751953, + "loss": 2.7411, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1813180446624756, + "rewards/margins": -5.26120662689209, + "rewards/rejected": 7.4425249099731445, + "step": 2645 + }, + { + "epoch": 0.43, + "learning_rate": 9.131616879614804e-06, + "logits/chosen": -0.7459420561790466, + "logits/rejected": -0.9956108331680298, + "logps/chosen": -68.1337661743164, + "logps/rejected": -87.375732421875, + "loss": 0.2082, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8566277027130127, + "rewards/margins": 2.1410439014434814, + "rewards/rejected": 0.7155838012695312, + "step": 2646 + }, + { + "epoch": 0.43, + "learning_rate": 9.13087655439053e-06, + "logits/chosen": -0.8228781223297119, + "logits/rejected": -0.9518327116966248, + "logps/chosen": -76.95527648925781, + "logps/rejected": -97.79686737060547, + "loss": 0.4147, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.030673503875732, + "rewards/margins": 0.12519168853759766, + "rewards/rejected": 3.9054818153381348, + "step": 2647 + }, + { + "epoch": 0.43, + "learning_rate": 9.130135943761772e-06, + "logits/chosen": -1.0449244976043701, + "logits/rejected": -0.9571689963340759, + "logps/chosen": -75.95051574707031, + "logps/rejected": -27.079238891601562, + "loss": 0.0575, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.435185194015503, + "rewards/margins": 2.1963462829589844, + "rewards/rejected": 0.23883895576000214, + "step": 2648 + }, + { + "epoch": 0.43, + "learning_rate": 9.129395047779701e-06, + "logits/chosen": -0.862726628780365, + "logits/rejected": -0.7199157476425171, + "logps/chosen": -33.4775390625, + "logps/rejected": -21.299652099609375, + "loss": 0.3353, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.740918755531311, + "rewards/margins": 1.185070514678955, + "rewards/rejected": 0.5558483004570007, + "step": 2649 + }, + { + "epoch": 0.43, + "learning_rate": 9.128653866495504e-06, + "logits/chosen": -0.9607958197593689, + "logits/rejected": -1.0517243146896362, + "logps/chosen": -64.4480209350586, + "logps/rejected": -104.23226928710938, + "loss": 0.2371, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5428475141525269, + "rewards/margins": 0.5751183032989502, + "rewards/rejected": 0.9677292108535767, + "step": 2650 + }, + { + "epoch": 0.43, + "learning_rate": 9.12791239996039e-06, + "logits/chosen": -0.5290257334709167, + "logits/rejected": -0.5285052061080933, + "logps/chosen": -2.770244598388672, + "logps/rejected": -1.4251384735107422, + "loss": 3.0485, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.26929354667663574, + "rewards/margins": -0.14643293619155884, + "rewards/rejected": 0.4157264828681946, + "step": 2651 + }, + { + "epoch": 0.43, + "learning_rate": 9.12717064822559e-06, + "logits/chosen": -0.5165430307388306, + "logits/rejected": -0.5222406387329102, + "logps/chosen": -6.614252090454102, + "logps/rejected": -2.3152639865875244, + "loss": 0.8312, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2980367839336395, + "rewards/margins": -0.017445623874664307, + "rewards/rejected": 0.31548240780830383, + "step": 2652 + }, + { + "epoch": 0.43, + "learning_rate": 9.126428611342348e-06, + "logits/chosen": -1.2165411710739136, + "logits/rejected": -1.2519598007202148, + "logps/chosen": -218.4961700439453, + "logps/rejected": -81.82557678222656, + "loss": 0.0996, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.101539611816406, + "rewards/margins": 1.6334044933319092, + "rewards/rejected": 2.468135118484497, + "step": 2653 + }, + { + "epoch": 0.43, + "learning_rate": 9.125686289361935e-06, + "logits/chosen": -0.577812910079956, + "logits/rejected": -0.6863294243812561, + "logps/chosen": -52.80110168457031, + "logps/rejected": -99.74540710449219, + "loss": 0.7429, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8554828763008118, + "rewards/margins": -1.105637788772583, + "rewards/rejected": 1.96112060546875, + "step": 2654 + }, + { + "epoch": 0.43, + "learning_rate": 9.124943682335635e-06, + "logits/chosen": -0.9800903797149658, + "logits/rejected": -1.0684975385665894, + "logps/chosen": -83.35301208496094, + "logps/rejected": -116.95928192138672, + "loss": 3.1306, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3804153203964233, + "rewards/margins": -2.980135440826416, + "rewards/rejected": 4.360550880432129, + "step": 2655 + }, + { + "epoch": 0.43, + "learning_rate": 9.124200790314759e-06, + "logits/chosen": -1.2165327072143555, + "logits/rejected": -1.1603710651397705, + "logps/chosen": -143.83102416992188, + "logps/rejected": -135.54617309570312, + "loss": 0.8815, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.273756504058838, + "rewards/margins": -1.514643669128418, + "rewards/rejected": 6.788400173187256, + "step": 2656 + }, + { + "epoch": 0.43, + "learning_rate": 9.12345761335063e-06, + "logits/chosen": -0.9253941774368286, + "logits/rejected": -0.9604794383049011, + "logps/chosen": -43.45915222167969, + "logps/rejected": -72.90444946289062, + "loss": 1.2283, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4335304498672485, + "rewards/margins": -2.270928382873535, + "rewards/rejected": 3.704458713531494, + "step": 2657 + }, + { + "epoch": 0.43, + "learning_rate": 9.122714151494599e-06, + "logits/chosen": -0.9601019024848938, + "logits/rejected": -0.9394745826721191, + "logps/chosen": -89.58943939208984, + "logps/rejected": -122.9354248046875, + "loss": 0.9013, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.586225152015686, + "rewards/margins": -0.9074410200119019, + "rewards/rejected": 2.493666172027588, + "step": 2658 + }, + { + "epoch": 0.43, + "learning_rate": 9.121970404798028e-06, + "logits/chosen": -0.8505595922470093, + "logits/rejected": -0.846068263053894, + "logps/chosen": -2.3020131587982178, + "logps/rejected": -42.193016052246094, + "loss": 0.6183, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4825521409511566, + "rewards/margins": -0.07674965262413025, + "rewards/rejected": 0.5593017935752869, + "step": 2659 + }, + { + "epoch": 0.43, + "learning_rate": 9.121226373312303e-06, + "logits/chosen": -1.9931097030639648, + "logits/rejected": -2.0725414752960205, + "logps/chosen": -151.75196838378906, + "logps/rejected": -108.1137466430664, + "loss": 0.3085, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.462687969207764, + "rewards/margins": 0.6696953773498535, + "rewards/rejected": 5.79299259185791, + "step": 2660 + }, + { + "epoch": 0.43, + "learning_rate": 9.120482057088832e-06, + "logits/chosen": -1.1010111570358276, + "logits/rejected": -1.12711763381958, + "logps/chosen": -86.95056915283203, + "logps/rejected": -68.35025024414062, + "loss": 1.3701, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9264060854911804, + "rewards/margins": -2.6234824657440186, + "rewards/rejected": 3.5498886108398438, + "step": 2661 + }, + { + "epoch": 0.43, + "learning_rate": 9.119737456179041e-06, + "logits/chosen": -1.2063488960266113, + "logits/rejected": -1.2809810638427734, + "logps/chosen": -143.4983367919922, + "logps/rejected": -79.86717224121094, + "loss": 0.0343, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.742323398590088, + "rewards/margins": 3.603085517883301, + "rewards/rejected": 1.1392379999160767, + "step": 2662 + }, + { + "epoch": 0.43, + "learning_rate": 9.118992570634374e-06, + "logits/chosen": -1.3314738273620605, + "logits/rejected": -1.3006292581558228, + "logps/chosen": -68.990234375, + "logps/rejected": -75.55601501464844, + "loss": 0.8646, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4529709815979004, + "rewards/margins": 0.14258742332458496, + "rewards/rejected": 2.3103835582733154, + "step": 2663 + }, + { + "epoch": 0.43, + "learning_rate": 9.11824740050629e-06, + "logits/chosen": -1.3114725351333618, + "logits/rejected": -1.267593264579773, + "logps/chosen": -31.46611213684082, + "logps/rejected": -51.75944137573242, + "loss": 0.1835, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.898085117340088, + "rewards/margins": 0.8249523639678955, + "rewards/rejected": 3.0731327533721924, + "step": 2664 + }, + { + "epoch": 0.43, + "learning_rate": 9.117501945846281e-06, + "logits/chosen": -0.960707426071167, + "logits/rejected": -1.0906747579574585, + "logps/chosen": -68.44743347167969, + "logps/rejected": -156.62518310546875, + "loss": 4.0525, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3746719360351562, + "rewards/margins": -2.7923874855041504, + "rewards/rejected": 6.167059421539307, + "step": 2665 + }, + { + "epoch": 0.43, + "learning_rate": 9.116756206705848e-06, + "logits/chosen": -0.657952606678009, + "logits/rejected": -0.7202527523040771, + "logps/chosen": -79.61754608154297, + "logps/rejected": -78.71123504638672, + "loss": 2.0559, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.012668609619140625, + "rewards/margins": -3.4978363513946533, + "rewards/rejected": 3.510504961013794, + "step": 2666 + }, + { + "epoch": 0.43, + "learning_rate": 9.116010183136512e-06, + "logits/chosen": -1.2991825342178345, + "logits/rejected": -1.3505891561508179, + "logps/chosen": -266.31695556640625, + "logps/rejected": -166.04063415527344, + "loss": 1.8483, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.338937282562256, + "rewards/margins": -3.576277256011963, + "rewards/rejected": 7.915214538574219, + "step": 2667 + }, + { + "epoch": 0.43, + "learning_rate": 9.115263875189821e-06, + "logits/chosen": -1.1785550117492676, + "logits/rejected": -1.2639018297195435, + "logps/chosen": -139.55831909179688, + "logps/rejected": -168.60174560546875, + "loss": 0.518, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.277764797210693, + "rewards/margins": -0.5341706275939941, + "rewards/rejected": 5.8119354248046875, + "step": 2668 + }, + { + "epoch": 0.43, + "learning_rate": 9.114517282917335e-06, + "logits/chosen": -0.8241130113601685, + "logits/rejected": -0.8241130113601685, + "logps/chosen": -57.701202392578125, + "logps/rejected": -57.701202392578125, + "loss": 1.0381, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.570453643798828, + "rewards/margins": 0.0, + "rewards/rejected": 2.570453643798828, + "step": 2669 + }, + { + "epoch": 0.43, + "learning_rate": 9.113770406370635e-06, + "logits/chosen": -1.211559772491455, + "logits/rejected": -1.2760106325149536, + "logps/chosen": -90.36846923828125, + "logps/rejected": -98.44802856445312, + "loss": 3.2937, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.334733724594116, + "rewards/margins": -5.517291069030762, + "rewards/rejected": 7.852025032043457, + "step": 2670 + }, + { + "epoch": 0.43, + "learning_rate": 9.113023245601325e-06, + "logits/chosen": -0.6845595836639404, + "logits/rejected": -0.7017070055007935, + "logps/chosen": -56.19496154785156, + "logps/rejected": -41.769004821777344, + "loss": 0.5564, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.313946485519409, + "rewards/margins": 0.6513526439666748, + "rewards/rejected": 1.6625938415527344, + "step": 2671 + }, + { + "epoch": 0.43, + "learning_rate": 9.112275800661026e-06, + "logits/chosen": -0.6881936192512512, + "logits/rejected": -0.7480759024620056, + "logps/chosen": -59.578433990478516, + "logps/rejected": -102.65118408203125, + "loss": 1.4396, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7537670135498047, + "rewards/margins": 0.19321858882904053, + "rewards/rejected": 1.5605484247207642, + "step": 2672 + }, + { + "epoch": 0.43, + "learning_rate": 9.111528071601381e-06, + "logits/chosen": -1.0579290390014648, + "logits/rejected": -1.0370527505874634, + "logps/chosen": -67.30826568603516, + "logps/rejected": -87.10889434814453, + "loss": 0.1869, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1326606273651123, + "rewards/margins": 0.809219241142273, + "rewards/rejected": 1.3234413862228394, + "step": 2673 + }, + { + "epoch": 0.43, + "learning_rate": 9.110780058474052e-06, + "logits/chosen": -0.9922494292259216, + "logits/rejected": -0.9236793518066406, + "logps/chosen": -138.81234741210938, + "logps/rejected": -75.62299346923828, + "loss": 0.7322, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.403001546859741, + "rewards/margins": -0.8011069297790527, + "rewards/rejected": 3.204108476638794, + "step": 2674 + }, + { + "epoch": 0.43, + "learning_rate": 9.110031761330713e-06, + "logits/chosen": -1.089328646659851, + "logits/rejected": -1.0161950588226318, + "logps/chosen": -66.34711456298828, + "logps/rejected": -58.32652282714844, + "loss": 0.5978, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.855182647705078, + "rewards/margins": -0.7770829200744629, + "rewards/rejected": 3.632265567779541, + "step": 2675 + }, + { + "epoch": 0.43, + "learning_rate": 9.109283180223073e-06, + "logits/chosen": -0.9359074831008911, + "logits/rejected": -0.9236019253730774, + "logps/chosen": -44.324188232421875, + "logps/rejected": -38.600311279296875, + "loss": 0.7476, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8261550664901733, + "rewards/margins": -0.5488866567611694, + "rewards/rejected": 2.3750417232513428, + "step": 2676 + }, + { + "epoch": 0.43, + "learning_rate": 9.108534315202844e-06, + "logits/chosen": -1.178515911102295, + "logits/rejected": -1.2296409606933594, + "logps/chosen": -146.23809814453125, + "logps/rejected": -100.1244888305664, + "loss": 0.0539, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.509799480438232, + "rewards/margins": 3.0938456058502197, + "rewards/rejected": 2.4159538745880127, + "step": 2677 + }, + { + "epoch": 0.43, + "learning_rate": 9.107785166321772e-06, + "logits/chosen": -0.8096348643302917, + "logits/rejected": -0.7818166017532349, + "logps/chosen": -76.7027587890625, + "logps/rejected": -89.62110900878906, + "loss": 0.4209, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.072163462638855, + "rewards/margins": -0.22737503051757812, + "rewards/rejected": 1.299538493156433, + "step": 2678 + }, + { + "epoch": 0.43, + "learning_rate": 9.107035733631612e-06, + "logits/chosen": -1.084686040878296, + "logits/rejected": -0.9988218545913696, + "logps/chosen": -128.86195373535156, + "logps/rejected": -69.16358947753906, + "loss": 0.4921, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.3218889236450195, + "rewards/margins": 0.10047769546508789, + "rewards/rejected": 4.221411228179932, + "step": 2679 + }, + { + "epoch": 0.43, + "learning_rate": 9.106286017184143e-06, + "logits/chosen": -1.1510266065597534, + "logits/rejected": -1.0045902729034424, + "logps/chosen": -62.39748001098633, + "logps/rejected": -27.04378890991211, + "loss": 0.0707, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3517391681671143, + "rewards/margins": 1.9640798568725586, + "rewards/rejected": 0.3876592814922333, + "step": 2680 + }, + { + "epoch": 0.44, + "learning_rate": 9.105536017031167e-06, + "logits/chosen": -1.1555806398391724, + "logits/rejected": -1.202495813369751, + "logps/chosen": -174.41339111328125, + "logps/rejected": -117.3310775756836, + "loss": 0.1954, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.868374824523926, + "rewards/margins": 1.622591495513916, + "rewards/rejected": 5.24578332901001, + "step": 2681 + }, + { + "epoch": 0.44, + "learning_rate": 9.104785733224498e-06, + "logits/chosen": -0.8169601559638977, + "logits/rejected": -1.0481115579605103, + "logps/chosen": -75.50401306152344, + "logps/rejected": -26.910663604736328, + "loss": 0.3504, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4222259521484375, + "rewards/margins": 1.9591875076293945, + "rewards/rejected": 0.46303844451904297, + "step": 2682 + }, + { + "epoch": 0.44, + "learning_rate": 9.104035165815971e-06, + "logits/chosen": -1.0161510705947876, + "logits/rejected": -0.9529544711112976, + "logps/chosen": -64.57068634033203, + "logps/rejected": -47.84012985229492, + "loss": 0.2553, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.799896240234375, + "rewards/margins": 0.4084599018096924, + "rewards/rejected": 2.3914363384246826, + "step": 2683 + }, + { + "epoch": 0.44, + "learning_rate": 9.103284314857452e-06, + "logits/chosen": -1.1413975954055786, + "logits/rejected": -1.0805212259292603, + "logps/chosen": -43.04485321044922, + "logps/rejected": -64.06334686279297, + "loss": 0.9379, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1965675354003906, + "rewards/margins": -1.5653488636016846, + "rewards/rejected": 3.761916399002075, + "step": 2684 + }, + { + "epoch": 0.44, + "learning_rate": 9.10253318040081e-06, + "logits/chosen": -1.401682734489441, + "logits/rejected": -1.414987564086914, + "logps/chosen": -114.77713012695312, + "logps/rejected": -59.92448043823242, + "loss": 0.3556, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7907532453536987, + "rewards/margins": 0.1283329725265503, + "rewards/rejected": 1.6624202728271484, + "step": 2685 + }, + { + "epoch": 0.44, + "learning_rate": 9.101781762497944e-06, + "logits/chosen": -0.9029470086097717, + "logits/rejected": -0.926763653755188, + "logps/chosen": -325.7677307128906, + "logps/rejected": -80.87440490722656, + "loss": 0.0181, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.207345485687256, + "rewards/margins": 4.021135330200195, + "rewards/rejected": 2.1862099170684814, + "step": 2686 + }, + { + "epoch": 0.44, + "learning_rate": 9.10103006120077e-06, + "logits/chosen": -1.279180645942688, + "logits/rejected": -1.2741743326187134, + "logps/chosen": -103.86631774902344, + "logps/rejected": -51.48251724243164, + "loss": 0.7568, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1457031965255737, + "rewards/margins": -1.2280734777450562, + "rewards/rejected": 2.37377667427063, + "step": 2687 + }, + { + "epoch": 0.44, + "learning_rate": 9.100278076561222e-06, + "logits/chosen": -0.7784574627876282, + "logits/rejected": -0.7626368403434753, + "logps/chosen": -64.62506103515625, + "logps/rejected": -25.940753936767578, + "loss": 1.5621, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.489875078201294, + "rewards/margins": 0.6181584000587463, + "rewards/rejected": 0.8717166781425476, + "step": 2688 + }, + { + "epoch": 0.44, + "learning_rate": 9.099525808631258e-06, + "logits/chosen": -1.1675041913986206, + "logits/rejected": -1.079735279083252, + "logps/chosen": -182.34132385253906, + "logps/rejected": -57.08180236816406, + "loss": 0.2407, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.296806335449219, + "rewards/margins": 2.4666969776153564, + "rewards/rejected": 2.8301093578338623, + "step": 2689 + }, + { + "epoch": 0.44, + "learning_rate": 9.09877325746285e-06, + "logits/chosen": -1.3678451776504517, + "logits/rejected": -1.4822938442230225, + "logps/chosen": -120.16959381103516, + "logps/rejected": -111.66436767578125, + "loss": 0.7091, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.329761505126953, + "rewards/margins": -1.0455713272094727, + "rewards/rejected": 4.375332832336426, + "step": 2690 + }, + { + "epoch": 0.44, + "learning_rate": 9.098020423107992e-06, + "logits/chosen": -1.2128347158432007, + "logits/rejected": -1.232610821723938, + "logps/chosen": -93.69953155517578, + "logps/rejected": -151.30694580078125, + "loss": 1.2309, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.764962196350098, + "rewards/margins": -2.0163488388061523, + "rewards/rejected": 6.78131103515625, + "step": 2691 + }, + { + "epoch": 0.44, + "learning_rate": 9.0972673056187e-06, + "logits/chosen": -1.001393437385559, + "logits/rejected": -1.0299538373947144, + "logps/chosen": -176.39306640625, + "logps/rejected": -110.54853820800781, + "loss": 0.7246, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.034192085266113, + "rewards/margins": -1.1567792892456055, + "rewards/rejected": 7.190971374511719, + "step": 2692 + }, + { + "epoch": 0.44, + "learning_rate": 9.096513905047004e-06, + "logits/chosen": -0.8624438047409058, + "logits/rejected": -0.877395749092102, + "logps/chosen": -69.990966796875, + "logps/rejected": -127.09149932861328, + "loss": 0.3509, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.441149115562439, + "rewards/margins": 0.4827086925506592, + "rewards/rejected": 0.9584404230117798, + "step": 2693 + }, + { + "epoch": 0.44, + "learning_rate": 9.09576022144496e-06, + "logits/chosen": -0.8617739081382751, + "logits/rejected": -0.8874123096466064, + "logps/chosen": -48.99824523925781, + "logps/rejected": -38.977333068847656, + "loss": 0.3181, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7689605951309204, + "rewards/margins": 1.0851280689239502, + "rewards/rejected": 0.683832585811615, + "step": 2694 + }, + { + "epoch": 0.44, + "learning_rate": 9.09500625486464e-06, + "logits/chosen": -1.099429726600647, + "logits/rejected": -1.09457528591156, + "logps/chosen": -102.73747253417969, + "logps/rejected": -63.98750305175781, + "loss": 1.4326, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2329682111740112, + "rewards/margins": -1.0984679460525513, + "rewards/rejected": 2.3314361572265625, + "step": 2695 + }, + { + "epoch": 0.44, + "learning_rate": 9.094252005358133e-06, + "logits/chosen": -1.0418848991394043, + "logits/rejected": -1.0034916400909424, + "logps/chosen": -45.350006103515625, + "logps/rejected": -16.211320877075195, + "loss": 1.3738, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0201596021652222, + "rewards/margins": 0.4669625163078308, + "rewards/rejected": 0.5531970858573914, + "step": 2696 + }, + { + "epoch": 0.44, + "learning_rate": 9.093497472977554e-06, + "logits/chosen": -1.1634056568145752, + "logits/rejected": -1.1641837358474731, + "logps/chosen": -92.514404296875, + "logps/rejected": -44.46879196166992, + "loss": 0.566, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.665657877922058, + "rewards/margins": -0.6179302930831909, + "rewards/rejected": 2.283588171005249, + "step": 2697 + }, + { + "epoch": 0.44, + "learning_rate": 9.092742657775031e-06, + "logits/chosen": -0.9223672747612, + "logits/rejected": -0.8689063787460327, + "logps/chosen": -65.46855926513672, + "logps/rejected": -100.87838745117188, + "loss": 1.7675, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7028679251670837, + "rewards/margins": -3.208414316177368, + "rewards/rejected": 3.9112823009490967, + "step": 2698 + }, + { + "epoch": 0.44, + "learning_rate": 9.091987559802718e-06, + "logits/chosen": -0.9152271747589111, + "logits/rejected": -0.7212046384811401, + "logps/chosen": -154.71060180664062, + "logps/rejected": -75.43531799316406, + "loss": 0.0543, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.497292995452881, + "rewards/margins": 2.673616647720337, + "rewards/rejected": 3.823676347732544, + "step": 2699 + }, + { + "epoch": 0.44, + "learning_rate": 9.091232179112782e-06, + "logits/chosen": -1.4028501510620117, + "logits/rejected": -1.2524763345718384, + "logps/chosen": -160.84339904785156, + "logps/rejected": -72.87982940673828, + "loss": 1.1652, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.204705715179443, + "rewards/margins": 2.5429022312164307, + "rewards/rejected": 3.6618034839630127, + "step": 2700 + }, + { + "epoch": 0.44, + "learning_rate": 9.090476515757416e-06, + "logits/chosen": -1.087482213973999, + "logits/rejected": -1.078995704650879, + "logps/chosen": -31.073835372924805, + "logps/rejected": -59.198570251464844, + "loss": 0.4427, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7318971157073975, + "rewards/margins": -0.13858556747436523, + "rewards/rejected": 2.8704826831817627, + "step": 2701 + }, + { + "epoch": 0.44, + "learning_rate": 9.089720569788825e-06, + "logits/chosen": -1.4685980081558228, + "logits/rejected": -1.2858761548995972, + "logps/chosen": -105.09162902832031, + "logps/rejected": -21.785297393798828, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.748374938964844, + "rewards/margins": 5.225639820098877, + "rewards/rejected": 0.522735059261322, + "step": 2702 + }, + { + "epoch": 0.44, + "learning_rate": 9.088964341259241e-06, + "logits/chosen": -1.2048969268798828, + "logits/rejected": -1.258656620979309, + "logps/chosen": -57.36900329589844, + "logps/rejected": -101.70020294189453, + "loss": 1.928, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9923577308654785, + "rewards/margins": -3.022078037261963, + "rewards/rejected": 6.014435768127441, + "step": 2703 + }, + { + "epoch": 0.44, + "learning_rate": 9.088207830220912e-06, + "logits/chosen": -1.1990036964416504, + "logits/rejected": -1.2638225555419922, + "logps/chosen": -77.77555084228516, + "logps/rejected": -118.29612731933594, + "loss": 2.3936, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.533061981201172, + "rewards/margins": -3.278759002685547, + "rewards/rejected": 5.811820983886719, + "step": 2704 + }, + { + "epoch": 0.44, + "learning_rate": 9.087451036726103e-06, + "logits/chosen": -1.37450110912323, + "logits/rejected": -1.2855408191680908, + "logps/chosen": -149.2466583251953, + "logps/rejected": -79.14552307128906, + "loss": 0.291, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8034988641738892, + "rewards/margins": 0.5586974620819092, + "rewards/rejected": 1.24480140209198, + "step": 2705 + }, + { + "epoch": 0.44, + "learning_rate": 9.086693960827106e-06, + "logits/chosen": -0.9281981587409973, + "logits/rejected": -0.9184233546257019, + "logps/chosen": -81.38323974609375, + "logps/rejected": -79.9705810546875, + "loss": 0.6494, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.553755283355713, + "rewards/margins": 0.7472336292266846, + "rewards/rejected": 1.8065216541290283, + "step": 2706 + }, + { + "epoch": 0.44, + "learning_rate": 9.085936602576222e-06, + "logits/chosen": -1.1261221170425415, + "logits/rejected": -1.1557341814041138, + "logps/chosen": -89.57450866699219, + "logps/rejected": -50.4097785949707, + "loss": 0.6826, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.808274984359741, + "rewards/margins": -0.02554798126220703, + "rewards/rejected": 2.8338229656219482, + "step": 2707 + }, + { + "epoch": 0.44, + "learning_rate": 9.085178962025783e-06, + "logits/chosen": -0.8717837929725647, + "logits/rejected": -0.8900392651557922, + "logps/chosen": -89.42926025390625, + "logps/rejected": -152.69683837890625, + "loss": 0.1774, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2545249462127686, + "rewards/margins": 0.9468756914138794, + "rewards/rejected": 1.3076492547988892, + "step": 2708 + }, + { + "epoch": 0.44, + "learning_rate": 9.08442103922813e-06, + "logits/chosen": -1.1529457569122314, + "logits/rejected": -1.0952836275100708, + "logps/chosen": -127.41810607910156, + "logps/rejected": -129.54171752929688, + "loss": 0.7894, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.7673659324646, + "rewards/margins": -1.2811541557312012, + "rewards/rejected": 8.0485200881958, + "step": 2709 + }, + { + "epoch": 0.44, + "learning_rate": 9.08366283423563e-06, + "logits/chosen": -0.9791221022605896, + "logits/rejected": -0.9991861581802368, + "logps/chosen": -82.1179428100586, + "logps/rejected": -73.87389373779297, + "loss": 0.509, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7590394020080566, + "rewards/margins": 0.967440128326416, + "rewards/rejected": 1.7915992736816406, + "step": 2710 + }, + { + "epoch": 0.44, + "learning_rate": 9.08290434710067e-06, + "logits/chosen": -0.6178016662597656, + "logits/rejected": -0.6246060132980347, + "logps/chosen": -0.48297181725502014, + "logps/rejected": -29.67676544189453, + "loss": 0.4091, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20649421215057373, + "rewards/margins": 0.18946024775505066, + "rewards/rejected": 0.017033958807587624, + "step": 2711 + }, + { + "epoch": 0.44, + "learning_rate": 9.08214557787565e-06, + "logits/chosen": -1.257987141609192, + "logits/rejected": -1.2755508422851562, + "logps/chosen": -36.91838836669922, + "logps/rejected": -84.59504699707031, + "loss": 0.7357, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7707459926605225, + "rewards/margins": -1.113800048828125, + "rewards/rejected": 3.8845460414886475, + "step": 2712 + }, + { + "epoch": 0.44, + "learning_rate": 9.081386526612998e-06, + "logits/chosen": -0.6331204175949097, + "logits/rejected": -0.687220573425293, + "logps/chosen": -28.806690216064453, + "logps/rejected": -53.94526672363281, + "loss": 0.6148, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4914791584014893, + "rewards/margins": -0.011469602584838867, + "rewards/rejected": 1.5029487609863281, + "step": 2713 + }, + { + "epoch": 0.44, + "learning_rate": 9.080627193365155e-06, + "logits/chosen": -1.1753538846969604, + "logits/rejected": -1.0943135023117065, + "logps/chosen": -72.12673950195312, + "logps/rejected": -61.99463653564453, + "loss": 1.2974, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.108721971511841, + "rewards/margins": 0.23315513134002686, + "rewards/rejected": 1.875566840171814, + "step": 2714 + }, + { + "epoch": 0.44, + "learning_rate": 9.079867578184585e-06, + "logits/chosen": -1.013015627861023, + "logits/rejected": -1.0441443920135498, + "logps/chosen": -57.991722106933594, + "logps/rejected": -37.6444091796875, + "loss": 2.535, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.952918291091919, + "rewards/margins": -0.7117965221405029, + "rewards/rejected": 2.664714813232422, + "step": 2715 + }, + { + "epoch": 0.44, + "learning_rate": 9.079107681123767e-06, + "logits/chosen": -0.8334051370620728, + "logits/rejected": -0.8220812082290649, + "logps/chosen": -42.58887481689453, + "logps/rejected": -32.436798095703125, + "loss": 1.0643, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.655043125152588, + "rewards/margins": -0.2123711109161377, + "rewards/rejected": 2.8674142360687256, + "step": 2716 + }, + { + "epoch": 0.44, + "learning_rate": 9.078347502235207e-06, + "logits/chosen": -0.8692598938941956, + "logits/rejected": -0.8239935040473938, + "logps/chosen": -52.964134216308594, + "logps/rejected": -30.1754207611084, + "loss": 0.9159, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0887839794158936, + "rewards/margins": -0.26718783378601074, + "rewards/rejected": 2.3559718132019043, + "step": 2717 + }, + { + "epoch": 0.44, + "learning_rate": 9.077587041571425e-06, + "logits/chosen": -0.7193015813827515, + "logits/rejected": -0.7074863314628601, + "logps/chosen": -58.19731903076172, + "logps/rejected": -33.647579193115234, + "loss": 0.8726, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4422508478164673, + "rewards/margins": -0.6223872900009155, + "rewards/rejected": 2.064638137817383, + "step": 2718 + }, + { + "epoch": 0.44, + "learning_rate": 9.07682629918496e-06, + "logits/chosen": -1.0175613164901733, + "logits/rejected": -1.1026455163955688, + "logps/chosen": -109.13243865966797, + "logps/rejected": -144.28277587890625, + "loss": 0.8568, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8813438415527344, + "rewards/margins": -0.40508508682250977, + "rewards/rejected": 3.286428928375244, + "step": 2719 + }, + { + "epoch": 0.44, + "learning_rate": 9.076065275128372e-06, + "logits/chosen": -1.0599524974822998, + "logits/rejected": -1.0424363613128662, + "logps/chosen": -79.7130126953125, + "logps/rejected": -59.08631134033203, + "loss": 2.0662, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6303116083145142, + "rewards/margins": -3.0787758827209473, + "rewards/rejected": 4.709087371826172, + "step": 2720 + }, + { + "epoch": 0.44, + "learning_rate": 9.075303969454244e-06, + "logits/chosen": -0.8468753099441528, + "logits/rejected": -0.8678369522094727, + "logps/chosen": -114.60350036621094, + "logps/rejected": -91.83155059814453, + "loss": 0.8877, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.551838755607605, + "rewards/margins": -1.4197396039962769, + "rewards/rejected": 2.971578359603882, + "step": 2721 + }, + { + "epoch": 0.44, + "learning_rate": 9.07454238221517e-06, + "logits/chosen": -0.8503095507621765, + "logits/rejected": -0.7292295694351196, + "logps/chosen": -69.9913330078125, + "logps/rejected": -38.20049285888672, + "loss": 0.2809, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.215502977371216, + "rewards/margins": 0.34778285026550293, + "rewards/rejected": 2.867720127105713, + "step": 2722 + }, + { + "epoch": 0.44, + "learning_rate": 9.073780513463773e-06, + "logits/chosen": -0.8193199634552002, + "logits/rejected": -0.8042075634002686, + "logps/chosen": -61.25926971435547, + "logps/rejected": -49.866050720214844, + "loss": 0.3519, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.388833612203598, + "rewards/margins": 0.34536436200141907, + "rewards/rejected": 0.04346923902630806, + "step": 2723 + }, + { + "epoch": 0.44, + "learning_rate": 9.073018363252689e-06, + "logits/chosen": -0.4428602457046509, + "logits/rejected": -0.4428602457046509, + "logps/chosen": -1.157314419746399, + "logps/rejected": -1.157314419746399, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.21119724214076996, + "rewards/margins": 0.0, + "rewards/rejected": 0.21119724214076996, + "step": 2724 + }, + { + "epoch": 0.44, + "learning_rate": 9.072255931634572e-06, + "logits/chosen": -0.7778173089027405, + "logits/rejected": -0.7778173089027405, + "logps/chosen": -16.671289443969727, + "logps/rejected": -16.671289443969727, + "loss": 0.4969, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3683494329452515, + "rewards/margins": 0.0, + "rewards/rejected": 1.3683494329452515, + "step": 2725 + }, + { + "epoch": 0.44, + "learning_rate": 9.071493218662106e-06, + "logits/chosen": -1.1015739440917969, + "logits/rejected": -1.116012454032898, + "logps/chosen": -161.71336364746094, + "logps/rejected": -124.05754852294922, + "loss": 0.8891, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.5593109130859375, + "rewards/margins": -1.4602041244506836, + "rewards/rejected": 8.019515037536621, + "step": 2726 + }, + { + "epoch": 0.44, + "learning_rate": 9.070730224387982e-06, + "logits/chosen": -1.0924192667007446, + "logits/rejected": -1.0470337867736816, + "logps/chosen": -136.9384765625, + "logps/rejected": -178.22137451171875, + "loss": 0.0299, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.020564556121826, + "rewards/margins": 2.7963380813598633, + "rewards/rejected": 3.224226474761963, + "step": 2727 + }, + { + "epoch": 0.44, + "learning_rate": 9.069966948864917e-06, + "logits/chosen": -1.0141345262527466, + "logits/rejected": -1.0223360061645508, + "logps/chosen": -35.08590316772461, + "logps/rejected": -71.78571319580078, + "loss": 0.2656, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6138477325439453, + "rewards/margins": 0.4541560411453247, + "rewards/rejected": 1.1596916913986206, + "step": 2728 + }, + { + "epoch": 0.44, + "learning_rate": 9.069203392145647e-06, + "logits/chosen": -1.030450701713562, + "logits/rejected": -1.055376410484314, + "logps/chosen": -51.48343276977539, + "logps/rejected": -64.44303894042969, + "loss": 1.732, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.572866439819336, + "rewards/margins": 0.5094996690750122, + "rewards/rejected": 1.0633667707443237, + "step": 2729 + }, + { + "epoch": 0.44, + "learning_rate": 9.068439554282924e-06, + "logits/chosen": -1.0906084775924683, + "logits/rejected": -1.044967770576477, + "logps/chosen": -129.420654296875, + "logps/rejected": -83.14962005615234, + "loss": 0.4315, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.719752550125122, + "rewards/margins": -0.2760772705078125, + "rewards/rejected": 1.9958298206329346, + "step": 2730 + }, + { + "epoch": 0.44, + "learning_rate": 9.067675435329527e-06, + "logits/chosen": -1.342039942741394, + "logits/rejected": -1.4061802625656128, + "logps/chosen": -162.455322265625, + "logps/rejected": -179.10537719726562, + "loss": 1.5887, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.70363187789917, + "rewards/margins": -3.1228909492492676, + "rewards/rejected": 9.826522827148438, + "step": 2731 + }, + { + "epoch": 0.44, + "learning_rate": 9.066911035338243e-06, + "logits/chosen": -0.688718318939209, + "logits/rejected": -0.660461962223053, + "logps/chosen": -40.21091079711914, + "logps/rejected": -44.36274337768555, + "loss": 0.4542, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0854419469833374, + "rewards/margins": 0.1132926344871521, + "rewards/rejected": 0.9721493124961853, + "step": 2732 + }, + { + "epoch": 0.44, + "learning_rate": 9.066146354361889e-06, + "logits/chosen": -0.8191035985946655, + "logits/rejected": -0.8191035985946655, + "logps/chosen": -16.358688354492188, + "logps/rejected": -16.358688354492188, + "loss": 1.2199, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3446117341518402, + "rewards/margins": 0.0, + "rewards/rejected": 0.3446117341518402, + "step": 2733 + }, + { + "epoch": 0.44, + "learning_rate": 9.065381392453296e-06, + "logits/chosen": -0.7988123893737793, + "logits/rejected": -0.8299576044082642, + "logps/chosen": -66.46439361572266, + "logps/rejected": -100.00784301757812, + "loss": 0.7312, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5313827395439148, + "rewards/margins": -0.878852903842926, + "rewards/rejected": 1.4102356433868408, + "step": 2734 + }, + { + "epoch": 0.44, + "learning_rate": 9.064616149665314e-06, + "logits/chosen": -0.2676602005958557, + "logits/rejected": -0.26541653275489807, + "logps/chosen": -5.506505489349365, + "logps/rejected": -5.465539932250977, + "loss": 0.7318, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3473146855831146, + "rewards/margins": -0.1490013599395752, + "rewards/rejected": 0.4963160455226898, + "step": 2735 + }, + { + "epoch": 0.44, + "learning_rate": 9.06385062605082e-06, + "logits/chosen": -1.317460060119629, + "logits/rejected": -1.2434451580047607, + "logps/chosen": -52.759178161621094, + "logps/rejected": -27.83257293701172, + "loss": 0.3695, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6514495611190796, + "rewards/margins": 1.7381219863891602, + "rewards/rejected": -0.08667240291833878, + "step": 2736 + }, + { + "epoch": 0.44, + "learning_rate": 9.063084821662697e-06, + "logits/chosen": -0.6657377481460571, + "logits/rejected": -0.7027717232704163, + "logps/chosen": -54.13710021972656, + "logps/rejected": -39.85494613647461, + "loss": 0.5635, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5506553649902344, + "rewards/margins": 0.2629363536834717, + "rewards/rejected": 2.2877190113067627, + "step": 2737 + }, + { + "epoch": 0.44, + "learning_rate": 9.06231873655386e-06, + "logits/chosen": -0.8823354244232178, + "logits/rejected": -0.7932889461517334, + "logps/chosen": -70.8860855102539, + "logps/rejected": -35.332550048828125, + "loss": 2.426, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4635292291641235, + "rewards/margins": 1.244293212890625, + "rewards/rejected": 0.21923600137233734, + "step": 2738 + }, + { + "epoch": 0.44, + "learning_rate": 9.061552370777236e-06, + "logits/chosen": -0.7106794714927673, + "logits/rejected": -0.776618242263794, + "logps/chosen": -64.2663345336914, + "logps/rejected": -93.58380126953125, + "loss": 0.7793, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1346938610076904, + "rewards/margins": -1.2922592163085938, + "rewards/rejected": 3.426953077316284, + "step": 2739 + }, + { + "epoch": 0.44, + "learning_rate": 9.060785724385772e-06, + "logits/chosen": -0.7765650749206543, + "logits/rejected": -0.7805537581443787, + "logps/chosen": -65.29847717285156, + "logps/rejected": -69.41888427734375, + "loss": 0.4522, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4072463512420654, + "rewards/margins": 0.2435622215270996, + "rewards/rejected": 2.163684129714966, + "step": 2740 + }, + { + "epoch": 0.44, + "learning_rate": 9.06001879743244e-06, + "logits/chosen": -0.9623813033103943, + "logits/rejected": -0.9400763511657715, + "logps/chosen": -126.74447631835938, + "logps/rejected": -61.27488708496094, + "loss": 0.6363, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6127532720565796, + "rewards/margins": 0.052864789962768555, + "rewards/rejected": 1.559888482093811, + "step": 2741 + }, + { + "epoch": 0.45, + "learning_rate": 9.059251589970224e-06, + "logits/chosen": -0.7804642915725708, + "logits/rejected": -0.7995630502700806, + "logps/chosen": -76.58486938476562, + "logps/rejected": -114.83824157714844, + "loss": 2.8873, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3886711597442627, + "rewards/margins": -1.2407431602478027, + "rewards/rejected": 3.6294143199920654, + "step": 2742 + }, + { + "epoch": 0.45, + "learning_rate": 9.058484102052133e-06, + "logits/chosen": -1.08419668674469, + "logits/rejected": -1.084248661994934, + "logps/chosen": -41.51997375488281, + "logps/rejected": -29.990463256835938, + "loss": 0.6171, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2770977020263672, + "rewards/margins": -0.3182453513145447, + "rewards/rejected": 0.5953430533409119, + "step": 2743 + }, + { + "epoch": 0.45, + "learning_rate": 9.057716333731193e-06, + "logits/chosen": -0.8368192315101624, + "logits/rejected": 0.19810542464256287, + "logps/chosen": -59.991947174072266, + "logps/rejected": -38.66684341430664, + "loss": 0.2237, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.190063238143921, + "rewards/margins": 1.924860954284668, + "rewards/rejected": 0.2652023434638977, + "step": 2744 + }, + { + "epoch": 0.45, + "learning_rate": 9.056948285060447e-06, + "logits/chosen": -1.276634693145752, + "logits/rejected": -1.2333623170852661, + "logps/chosen": -101.02265167236328, + "logps/rejected": -90.4710693359375, + "loss": 0.115, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.779134511947632, + "rewards/margins": 1.3726495504379272, + "rewards/rejected": 1.4064849615097046, + "step": 2745 + }, + { + "epoch": 0.45, + "learning_rate": 9.056179956092961e-06, + "logits/chosen": -1.0413572788238525, + "logits/rejected": -1.2819290161132812, + "logps/chosen": -79.97216033935547, + "logps/rejected": -118.27745819091797, + "loss": 0.4877, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.312984466552734, + "rewards/margins": -0.3729720115661621, + "rewards/rejected": 4.6859564781188965, + "step": 2746 + }, + { + "epoch": 0.45, + "learning_rate": 9.055411346881823e-06, + "logits/chosen": -0.889205276966095, + "logits/rejected": -0.9151307344436646, + "logps/chosen": -144.3756103515625, + "logps/rejected": -81.6800537109375, + "loss": 0.3486, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8444840908050537, + "rewards/margins": 0.0612335205078125, + "rewards/rejected": 3.783250570297241, + "step": 2747 + }, + { + "epoch": 0.45, + "learning_rate": 9.054642457480131e-06, + "logits/chosen": -0.7599835395812988, + "logits/rejected": -0.6512318849563599, + "logps/chosen": -61.98621368408203, + "logps/rejected": -19.577287673950195, + "loss": 0.2578, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1982529163360596, + "rewards/margins": 0.9512558579444885, + "rewards/rejected": 0.24699707329273224, + "step": 2748 + }, + { + "epoch": 0.45, + "learning_rate": 9.053873287941013e-06, + "logits/chosen": -1.164231777191162, + "logits/rejected": -1.1007354259490967, + "logps/chosen": -154.17462158203125, + "logps/rejected": -96.34182739257812, + "loss": 0.3482, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0702760219573975, + "rewards/margins": 0.12848520278930664, + "rewards/rejected": 1.9417908191680908, + "step": 2749 + }, + { + "epoch": 0.45, + "learning_rate": 9.053103838317608e-06, + "logits/chosen": -0.9970227479934692, + "logits/rejected": -0.9807589054107666, + "logps/chosen": -45.516334533691406, + "logps/rejected": -81.92759704589844, + "loss": 0.3125, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.978870391845703, + "rewards/margins": 0.6571457386016846, + "rewards/rejected": 2.3217246532440186, + "step": 2750 + }, + { + "epoch": 0.45, + "learning_rate": 9.052334108663076e-06, + "logits/chosen": -0.8949490785598755, + "logits/rejected": -0.8949490785598755, + "logps/chosen": -23.169519424438477, + "logps/rejected": -23.169519424438477, + "loss": 0.3487, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5585482120513916, + "rewards/margins": 0.0, + "rewards/rejected": 1.5585482120513916, + "step": 2751 + }, + { + "epoch": 0.45, + "learning_rate": 9.051564099030604e-06, + "logits/chosen": -0.6202903985977173, + "logits/rejected": -0.6736775636672974, + "logps/chosen": -76.176025390625, + "logps/rejected": -97.11955261230469, + "loss": 1.6622, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.136215925216675, + "rewards/margins": -3.2327616214752197, + "rewards/rejected": 5.3689775466918945, + "step": 2752 + }, + { + "epoch": 0.45, + "learning_rate": 9.050793809473388e-06, + "logits/chosen": -0.7877982258796692, + "logits/rejected": -0.7816951274871826, + "logps/chosen": -74.5009765625, + "logps/rejected": -52.365943908691406, + "loss": 1.038, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.73565673828125, + "rewards/margins": -0.7606148719787598, + "rewards/rejected": 2.4962716102600098, + "step": 2753 + }, + { + "epoch": 0.45, + "learning_rate": 9.050023240044649e-06, + "logits/chosen": -0.5230185985565186, + "logits/rejected": -0.5230185985565186, + "logps/chosen": -17.92888641357422, + "logps/rejected": -17.92888641357422, + "loss": 0.3866, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2507064938545227, + "rewards/margins": 0.0, + "rewards/rejected": 0.2507064938545227, + "step": 2754 + }, + { + "epoch": 0.45, + "learning_rate": 9.049252390797624e-06, + "logits/chosen": -0.8665316104888916, + "logits/rejected": -0.7798551917076111, + "logps/chosen": -41.5313720703125, + "logps/rejected": -94.34498596191406, + "loss": 0.5557, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5240578651428223, + "rewards/margins": 0.6554871797561646, + "rewards/rejected": 1.8685706853866577, + "step": 2755 + }, + { + "epoch": 0.45, + "learning_rate": 9.048481261785576e-06, + "logits/chosen": -1.1501166820526123, + "logits/rejected": -1.1823585033416748, + "logps/chosen": -87.19795227050781, + "logps/rejected": -108.51029968261719, + "loss": 3.0291, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.514400005340576, + "rewards/margins": -2.596487045288086, + "rewards/rejected": 7.110887050628662, + "step": 2756 + }, + { + "epoch": 0.45, + "learning_rate": 9.047709853061778e-06, + "logits/chosen": -0.9918769598007202, + "logits/rejected": -0.9584556818008423, + "logps/chosen": -88.15355682373047, + "logps/rejected": -57.39464569091797, + "loss": 0.2422, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7219154834747314, + "rewards/margins": 0.48946237564086914, + "rewards/rejected": 3.2324531078338623, + "step": 2757 + }, + { + "epoch": 0.45, + "learning_rate": 9.04693816467953e-06, + "logits/chosen": -0.8558734059333801, + "logits/rejected": -0.8558734059333801, + "logps/chosen": -83.77867126464844, + "logps/rejected": -83.77867126464844, + "loss": 0.3684, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6826348304748535, + "rewards/margins": 0.0, + "rewards/rejected": 2.6826348304748535, + "step": 2758 + }, + { + "epoch": 0.45, + "learning_rate": 9.046166196692145e-06, + "logits/chosen": -0.8887861371040344, + "logits/rejected": -0.9143281579017639, + "logps/chosen": -69.86714172363281, + "logps/rejected": -101.65647888183594, + "loss": 1.7332, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3656128644943237, + "rewards/margins": -2.191812038421631, + "rewards/rejected": 3.557425022125244, + "step": 2759 + }, + { + "epoch": 0.45, + "learning_rate": 9.045393949152963e-06, + "logits/chosen": -0.9688484072685242, + "logits/rejected": -0.8531200885772705, + "logps/chosen": -33.52663803100586, + "logps/rejected": -57.4736328125, + "loss": 1.1996, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.164386510848999, + "rewards/margins": -2.16546893119812, + "rewards/rejected": 4.329855442047119, + "step": 2760 + }, + { + "epoch": 0.45, + "learning_rate": 9.044621422115338e-06, + "logits/chosen": -1.3614968061447144, + "logits/rejected": -1.3132448196411133, + "logps/chosen": -69.42390441894531, + "logps/rejected": -34.24930953979492, + "loss": 0.115, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.177464246749878, + "rewards/margins": 1.6309947967529297, + "rewards/rejected": 0.546469509601593, + "step": 2761 + }, + { + "epoch": 0.45, + "learning_rate": 9.043848615632643e-06, + "logits/chosen": -1.1901313066482544, + "logits/rejected": -1.349772572517395, + "logps/chosen": -271.05767822265625, + "logps/rejected": -119.7306900024414, + "loss": 1.5846, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.99542236328125, + "rewards/margins": -2.882110118865967, + "rewards/rejected": 7.877532482147217, + "step": 2762 + }, + { + "epoch": 0.45, + "learning_rate": 9.04307552975827e-06, + "logits/chosen": -0.9935187697410583, + "logits/rejected": -1.0668253898620605, + "logps/chosen": -62.26264953613281, + "logps/rejected": -123.634765625, + "loss": 1.8079, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.087214708328247, + "rewards/margins": -3.2243149280548096, + "rewards/rejected": 4.311529636383057, + "step": 2763 + }, + { + "epoch": 0.45, + "learning_rate": 9.042302164545634e-06, + "logits/chosen": -1.046194076538086, + "logits/rejected": -0.8167155981063843, + "logps/chosen": -113.35208892822266, + "logps/rejected": -49.32050704956055, + "loss": 0.0666, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.397129058837891, + "rewards/margins": 2.73456072807312, + "rewards/rejected": 2.6625683307647705, + "step": 2764 + }, + { + "epoch": 0.45, + "learning_rate": 9.041528520048168e-06, + "logits/chosen": -0.775484561920166, + "logits/rejected": -0.802021861076355, + "logps/chosen": -51.79380798339844, + "logps/rejected": -65.84600067138672, + "loss": 1.2598, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2346458435058594, + "rewards/margins": 0.5835357308387756, + "rewards/rejected": 0.6511101126670837, + "step": 2765 + }, + { + "epoch": 0.45, + "learning_rate": 9.040754596319322e-06, + "logits/chosen": -0.9510620832443237, + "logits/rejected": -0.826024055480957, + "logps/chosen": -71.48755645751953, + "logps/rejected": -31.427410125732422, + "loss": 0.076, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9418182373046875, + "rewards/margins": 1.8453651666641235, + "rewards/rejected": 0.09645309299230576, + "step": 2766 + }, + { + "epoch": 0.45, + "learning_rate": 9.039980393412567e-06, + "logits/chosen": -1.3876978158950806, + "logits/rejected": -1.3986692428588867, + "logps/chosen": -105.78739166259766, + "logps/rejected": -101.97534942626953, + "loss": 1.0369, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6795997619628906, + "rewards/margins": -1.7521271705627441, + "rewards/rejected": 3.4317269325256348, + "step": 2767 + }, + { + "epoch": 0.45, + "learning_rate": 9.039205911381395e-06, + "logits/chosen": -1.2134896516799927, + "logits/rejected": -1.0673692226409912, + "logps/chosen": -75.05331420898438, + "logps/rejected": -100.57599639892578, + "loss": 2.4395, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.265967845916748, + "rewards/margins": -0.18644380569458008, + "rewards/rejected": 4.452411651611328, + "step": 2768 + }, + { + "epoch": 0.45, + "learning_rate": 9.038431150279313e-06, + "logits/chosen": -0.9111588597297668, + "logits/rejected": -0.9163168668746948, + "logps/chosen": -7.6589202880859375, + "logps/rejected": -19.46068572998047, + "loss": 0.5844, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3326702117919922, + "rewards/margins": -0.04495200514793396, + "rewards/rejected": 0.37762221693992615, + "step": 2769 + }, + { + "epoch": 0.45, + "learning_rate": 9.03765611015985e-06, + "logits/chosen": -1.4293276071548462, + "logits/rejected": -1.3741216659545898, + "logps/chosen": -74.77020263671875, + "logps/rejected": -16.238229751586914, + "loss": 0.1043, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.254847764968872, + "rewards/margins": 1.6980397701263428, + "rewards/rejected": 0.5568079352378845, + "step": 2770 + }, + { + "epoch": 0.45, + "learning_rate": 9.036880791076555e-06, + "logits/chosen": -1.0949786901474, + "logits/rejected": -1.0816715955734253, + "logps/chosen": -78.71275329589844, + "logps/rejected": -52.32791519165039, + "loss": 0.527, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0318849086761475, + "rewards/margins": 0.4381139278411865, + "rewards/rejected": 2.593770980834961, + "step": 2771 + }, + { + "epoch": 0.45, + "learning_rate": 9.036105193082995e-06, + "logits/chosen": -1.0331841707229614, + "logits/rejected": -0.769874632358551, + "logps/chosen": -77.77958679199219, + "logps/rejected": -24.580114364624023, + "loss": 0.5021, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0011138916015625, + "rewards/margins": 2.346409320831299, + "rewards/rejected": 0.6547046899795532, + "step": 2772 + }, + { + "epoch": 0.45, + "learning_rate": 9.035329316232755e-06, + "logits/chosen": -0.9085951447486877, + "logits/rejected": -0.9085951447486877, + "logps/chosen": -16.668249130249023, + "logps/rejected": -16.668249130249023, + "loss": 0.3648, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.70675528049469, + "rewards/margins": 0.0, + "rewards/rejected": 1.70675528049469, + "step": 2773 + }, + { + "epoch": 0.45, + "learning_rate": 9.034553160579444e-06, + "logits/chosen": -0.9222357273101807, + "logits/rejected": -0.9728245139122009, + "logps/chosen": -85.30121612548828, + "logps/rejected": -97.31100463867188, + "loss": 0.8332, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.9753777980804443, + "rewards/margins": -0.835913896560669, + "rewards/rejected": 4.811291694641113, + "step": 2774 + }, + { + "epoch": 0.45, + "learning_rate": 9.033776726176681e-06, + "logits/chosen": -1.3376946449279785, + "logits/rejected": -1.3554011583328247, + "logps/chosen": -88.34336853027344, + "logps/rejected": -68.94129180908203, + "loss": 2.6466, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3793411254882812, + "rewards/margins": 0.23505473136901855, + "rewards/rejected": 1.1442863941192627, + "step": 2775 + }, + { + "epoch": 0.45, + "learning_rate": 9.033000013078117e-06, + "logits/chosen": -0.9702839851379395, + "logits/rejected": -0.9222963452339172, + "logps/chosen": -78.7891845703125, + "logps/rejected": -50.07749938964844, + "loss": 0.6545, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8570098876953125, + "rewards/margins": 0.08860468864440918, + "rewards/rejected": 1.7684051990509033, + "step": 2776 + }, + { + "epoch": 0.45, + "learning_rate": 9.032223021337415e-06, + "logits/chosen": -0.8718878030776978, + "logits/rejected": -0.8886677622795105, + "logps/chosen": -48.948970794677734, + "logps/rejected": -70.95744323730469, + "loss": 0.2922, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.278954029083252, + "rewards/margins": 0.4497867822647095, + "rewards/rejected": 1.8291672468185425, + "step": 2777 + }, + { + "epoch": 0.45, + "learning_rate": 9.031445751008252e-06, + "logits/chosen": -0.36816418170928955, + "logits/rejected": -0.36816418170928955, + "logps/chosen": -9.858972549438477, + "logps/rejected": -9.858972549438477, + "loss": 0.5372, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5503107309341431, + "rewards/margins": 0.0, + "rewards/rejected": 0.5503107309341431, + "step": 2778 + }, + { + "epoch": 0.45, + "learning_rate": 9.030668202144334e-06, + "logits/chosen": -0.8516542911529541, + "logits/rejected": -0.7069472670555115, + "logps/chosen": -87.61038208007812, + "logps/rejected": -37.384742736816406, + "loss": 0.3797, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.362762451171875, + "rewards/margins": -0.04529118537902832, + "rewards/rejected": 2.4080536365509033, + "step": 2779 + }, + { + "epoch": 0.45, + "learning_rate": 9.029890374799381e-06, + "logits/chosen": -1.2558122873306274, + "logits/rejected": -1.2340061664581299, + "logps/chosen": -96.30915069580078, + "logps/rejected": -43.003509521484375, + "loss": 0.2153, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9435417652130127, + "rewards/margins": 1.0923171043395996, + "rewards/rejected": 2.851224660873413, + "step": 2780 + }, + { + "epoch": 0.45, + "learning_rate": 9.029112269027136e-06, + "logits/chosen": -1.0602744817733765, + "logits/rejected": -1.0140873193740845, + "logps/chosen": -89.48230743408203, + "logps/rejected": -154.3873291015625, + "loss": 0.5337, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.359747290611267, + "rewards/margins": -0.6403716802597046, + "rewards/rejected": 2.0001189708709717, + "step": 2781 + }, + { + "epoch": 0.45, + "learning_rate": 9.028333884881357e-06, + "logits/chosen": -1.224732756614685, + "logits/rejected": -1.3523317575454712, + "logps/chosen": -106.61632537841797, + "logps/rejected": -128.87808227539062, + "loss": 1.6526, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.548433780670166, + "rewards/margins": -3.1900124549865723, + "rewards/rejected": 6.738446235656738, + "step": 2782 + }, + { + "epoch": 0.45, + "learning_rate": 9.027555222415822e-06, + "logits/chosen": -0.6824508905410767, + "logits/rejected": -0.6824508905410767, + "logps/chosen": -0.609778881072998, + "logps/rejected": -0.609778881072998, + "loss": 0.656, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1500740796327591, + "rewards/margins": 0.0, + "rewards/rejected": 0.1500740796327591, + "step": 2783 + }, + { + "epoch": 0.45, + "learning_rate": 9.026776281684331e-06, + "logits/chosen": -0.392031192779541, + "logits/rejected": -0.4010132849216461, + "logps/chosen": -1.4387950897216797, + "logps/rejected": -22.28255271911621, + "loss": 0.709, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2826606333255768, + "rewards/margins": 0.08770766854286194, + "rewards/rejected": 0.19495296478271484, + "step": 2784 + }, + { + "epoch": 0.45, + "learning_rate": 9.025997062740701e-06, + "logits/chosen": -1.3754162788391113, + "logits/rejected": -1.463051676750183, + "logps/chosen": -46.504764556884766, + "logps/rejected": -64.6196060180664, + "loss": 1.4974, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6552319526672363, + "rewards/margins": 0.5830159187316895, + "rewards/rejected": 2.072216033935547, + "step": 2785 + }, + { + "epoch": 0.45, + "learning_rate": 9.025217565638766e-06, + "logits/chosen": -1.136189579963684, + "logits/rejected": -1.1401643753051758, + "logps/chosen": -69.47933959960938, + "logps/rejected": -117.50615692138672, + "loss": 0.1934, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9430320262908936, + "rewards/margins": 1.3954452276229858, + "rewards/rejected": 1.5475867986679077, + "step": 2786 + }, + { + "epoch": 0.45, + "learning_rate": 9.024437790432387e-06, + "logits/chosen": -0.8409844040870667, + "logits/rejected": -0.8581089377403259, + "logps/chosen": -98.18218994140625, + "logps/rejected": -94.14620971679688, + "loss": 1.1682, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6265877485275269, + "rewards/margins": -0.19504928588867188, + "rewards/rejected": 1.8216370344161987, + "step": 2787 + }, + { + "epoch": 0.45, + "learning_rate": 9.023657737175436e-06, + "logits/chosen": -1.3424623012542725, + "logits/rejected": -1.378342866897583, + "logps/chosen": -55.4747314453125, + "logps/rejected": -102.0934829711914, + "loss": 2.7978, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.231170654296875, + "rewards/margins": -2.447661876678467, + "rewards/rejected": 5.678832530975342, + "step": 2788 + }, + { + "epoch": 0.45, + "learning_rate": 9.022877405921805e-06, + "logits/chosen": -1.4257749319076538, + "logits/rejected": -1.425201177597046, + "logps/chosen": -95.95632934570312, + "logps/rejected": -59.154144287109375, + "loss": 0.4378, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.000715732574463, + "rewards/margins": -0.3192627429962158, + "rewards/rejected": 2.3199784755706787, + "step": 2789 + }, + { + "epoch": 0.45, + "learning_rate": 9.022096796725413e-06, + "logits/chosen": -0.7346746325492859, + "logits/rejected": -0.6380069255828857, + "logps/chosen": -60.88434600830078, + "logps/rejected": -67.627685546875, + "loss": 1.7443, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0287818908691406, + "rewards/margins": -0.37868738174438477, + "rewards/rejected": 2.4074692726135254, + "step": 2790 + }, + { + "epoch": 0.45, + "learning_rate": 9.021315909640187e-06, + "logits/chosen": -1.232170820236206, + "logits/rejected": -1.191631555557251, + "logps/chosen": -57.51447296142578, + "logps/rejected": -48.156211853027344, + "loss": 0.6558, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6681221723556519, + "rewards/margins": 0.1777176856994629, + "rewards/rejected": 1.490404486656189, + "step": 2791 + }, + { + "epoch": 0.45, + "learning_rate": 9.020534744720084e-06, + "logits/chosen": -0.8878380060195923, + "logits/rejected": -0.8852300047874451, + "logps/chosen": -5.58382511138916, + "logps/rejected": -7.842417240142822, + "loss": 0.3648, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.29985132813453674, + "rewards/margins": -0.05295222997665405, + "rewards/rejected": 0.3528035581111908, + "step": 2792 + }, + { + "epoch": 0.45, + "learning_rate": 9.019753302019072e-06, + "logits/chosen": -1.3299142122268677, + "logits/rejected": -1.3535958528518677, + "logps/chosen": -84.90339660644531, + "logps/rejected": -77.31629943847656, + "loss": 2.1322, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6229248046875, + "rewards/margins": -0.7660300731658936, + "rewards/rejected": 3.3889548778533936, + "step": 2793 + }, + { + "epoch": 0.45, + "learning_rate": 9.018971581591141e-06, + "logits/chosen": -1.5791491270065308, + "logits/rejected": -1.3977261781692505, + "logps/chosen": -70.029541015625, + "logps/rejected": -82.88606262207031, + "loss": 0.1592, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.075216770172119, + "rewards/margins": 1.062556266784668, + "rewards/rejected": 5.012660503387451, + "step": 2794 + }, + { + "epoch": 0.45, + "learning_rate": 9.018189583490304e-06, + "logits/chosen": -0.5237699747085571, + "logits/rejected": -0.4893624186515808, + "logps/chosen": -15.992140769958496, + "logps/rejected": -1.071143627166748, + "loss": 1.8564, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6777850389480591, + "rewards/margins": 0.22457033395767212, + "rewards/rejected": 0.45321470499038696, + "step": 2795 + }, + { + "epoch": 0.45, + "learning_rate": 9.017407307770585e-06, + "logits/chosen": -0.8965132832527161, + "logits/rejected": -0.8894830346107483, + "logps/chosen": -29.63159942626953, + "logps/rejected": -65.05156707763672, + "loss": 1.8535, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0093876123428345, + "rewards/margins": -1.8099108934402466, + "rewards/rejected": 2.819298505783081, + "step": 2796 + }, + { + "epoch": 0.45, + "learning_rate": 9.016624754486034e-06, + "logits/chosen": -1.0428088903427124, + "logits/rejected": -1.0693626403808594, + "logps/chosen": -115.36512756347656, + "logps/rejected": -59.04856872558594, + "loss": 1.2988, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8758522272109985, + "rewards/margins": -1.747937798500061, + "rewards/rejected": 2.6237900257110596, + "step": 2797 + }, + { + "epoch": 0.45, + "learning_rate": 9.01584192369072e-06, + "logits/chosen": -0.6713929772377014, + "logits/rejected": -0.6713929772377014, + "logps/chosen": -58.70361328125, + "logps/rejected": -58.70361328125, + "loss": 0.6441, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1387726068496704, + "rewards/margins": 0.0, + "rewards/rejected": 1.1387726068496704, + "step": 2798 + }, + { + "epoch": 0.45, + "learning_rate": 9.015058815438726e-06, + "logits/chosen": -0.8505740761756897, + "logits/rejected": -0.7598766088485718, + "logps/chosen": -81.14494323730469, + "logps/rejected": -73.37467956542969, + "loss": 0.5064, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.078176975250244, + "rewards/margins": 2.6010727882385254, + "rewards/rejected": 2.4771041870117188, + "step": 2799 + }, + { + "epoch": 0.45, + "learning_rate": 9.01427542978416e-06, + "logits/chosen": -1.4410964250564575, + "logits/rejected": -1.350857138633728, + "logps/chosen": -130.7029571533203, + "logps/rejected": -35.93843078613281, + "loss": 0.434, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.362361192703247, + "rewards/margins": 0.7361141443252563, + "rewards/rejected": 0.6262470483779907, + "step": 2800 + }, + { + "epoch": 0.45, + "learning_rate": 9.013491766781144e-06, + "logits/chosen": -1.0443207025527954, + "logits/rejected": -1.0443207025527954, + "logps/chosen": -50.929100036621094, + "logps/rejected": -50.929100036621094, + "loss": 0.7555, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1721580028533936, + "rewards/margins": 0.0, + "rewards/rejected": 2.1721580028533936, + "step": 2801 + }, + { + "epoch": 0.45, + "learning_rate": 9.012707826483823e-06, + "logits/chosen": -1.2756373882293701, + "logits/rejected": -1.2032678127288818, + "logps/chosen": -63.76103973388672, + "logps/rejected": -47.35353088378906, + "loss": 0.4393, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2233864068984985, + "rewards/margins": -0.33614540100097656, + "rewards/rejected": 1.559531807899475, + "step": 2802 + }, + { + "epoch": 0.45, + "learning_rate": 9.01192360894636e-06, + "logits/chosen": -1.0774728059768677, + "logits/rejected": -1.0774728059768677, + "logps/chosen": -63.91147232055664, + "logps/rejected": -63.91147232055664, + "loss": 0.3573, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7612767219543457, + "rewards/margins": 0.0, + "rewards/rejected": 2.7612767219543457, + "step": 2803 + }, + { + "epoch": 0.46, + "learning_rate": 9.011139114222938e-06, + "logits/chosen": -1.2844351530075073, + "logits/rejected": -1.0811835527420044, + "logps/chosen": -145.78311157226562, + "logps/rejected": -65.32916259765625, + "loss": 0.029, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.719053745269775, + "rewards/margins": 3.061671495437622, + "rewards/rejected": 3.6573822498321533, + "step": 2804 + }, + { + "epoch": 0.46, + "learning_rate": 9.010354342367755e-06, + "logits/chosen": -1.0782939195632935, + "logits/rejected": -0.8075804114341736, + "logps/chosen": -99.03242492675781, + "logps/rejected": -85.76890563964844, + "loss": 0.0468, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.856428623199463, + "rewards/margins": 3.4252259731292725, + "rewards/rejected": 2.4312026500701904, + "step": 2805 + }, + { + "epoch": 0.46, + "learning_rate": 9.009569293435035e-06, + "logits/chosen": -0.9214832782745361, + "logits/rejected": -0.9214832782745361, + "logps/chosen": -48.07359313964844, + "logps/rejected": -48.07359313964844, + "loss": 0.6465, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9046036005020142, + "rewards/margins": 0.0, + "rewards/rejected": 1.9046036005020142, + "step": 2806 + }, + { + "epoch": 0.46, + "learning_rate": 9.008783967479014e-06, + "logits/chosen": -1.4122356176376343, + "logits/rejected": -1.413517713546753, + "logps/chosen": -72.64421081542969, + "logps/rejected": -96.76191711425781, + "loss": 0.8897, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.804450988769531, + "rewards/margins": -0.5897078514099121, + "rewards/rejected": 5.394158840179443, + "step": 2807 + }, + { + "epoch": 0.46, + "learning_rate": 9.007998364553951e-06, + "logits/chosen": -1.089253306388855, + "logits/rejected": -1.0989676713943481, + "logps/chosen": -75.80970001220703, + "logps/rejected": -77.54598999023438, + "loss": 0.8252, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3554595708847046, + "rewards/margins": -0.3623948097229004, + "rewards/rejected": 1.717854380607605, + "step": 2808 + }, + { + "epoch": 0.46, + "learning_rate": 9.007212484714128e-06, + "logits/chosen": -1.5209593772888184, + "logits/rejected": -1.4660457372665405, + "logps/chosen": -56.37129211425781, + "logps/rejected": -34.75887680053711, + "loss": 0.3729, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.20928955078125, + "rewards/margins": 2.915980100631714, + "rewards/rejected": 0.29330942034721375, + "step": 2809 + }, + { + "epoch": 0.46, + "learning_rate": 9.006426328013838e-06, + "logits/chosen": -1.010554552078247, + "logits/rejected": -1.198486566543579, + "logps/chosen": -255.8131103515625, + "logps/rejected": -200.32969665527344, + "loss": 0.25, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.921075344085693, + "rewards/margins": 0.5289287567138672, + "rewards/rejected": 7.392146587371826, + "step": 2810 + }, + { + "epoch": 0.46, + "learning_rate": 9.005639894507398e-06, + "logits/chosen": -0.845449686050415, + "logits/rejected": -0.845449686050415, + "logps/chosen": -9.310954093933105, + "logps/rejected": -9.310954093933105, + "loss": 1.0411, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9454819560050964, + "rewards/margins": 0.0, + "rewards/rejected": 0.9454819560050964, + "step": 2811 + }, + { + "epoch": 0.46, + "learning_rate": 9.004853184249143e-06, + "logits/chosen": -1.1134451627731323, + "logits/rejected": -1.135155200958252, + "logps/chosen": -52.2092399597168, + "logps/rejected": -34.13714599609375, + "loss": 2.7634, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.700796127319336, + "rewards/margins": -0.33683276176452637, + "rewards/rejected": 2.0376288890838623, + "step": 2812 + }, + { + "epoch": 0.46, + "learning_rate": 9.004066197293429e-06, + "logits/chosen": -0.9665987491607666, + "logits/rejected": -0.9665987491607666, + "logps/chosen": -47.79901885986328, + "logps/rejected": -47.79901885986328, + "loss": 0.4004, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2407920360565186, + "rewards/margins": 0.0, + "rewards/rejected": 2.2407920360565186, + "step": 2813 + }, + { + "epoch": 0.46, + "learning_rate": 9.003278933694625e-06, + "logits/chosen": -0.8496865630149841, + "logits/rejected": -0.6900425553321838, + "logps/chosen": -52.068851470947266, + "logps/rejected": -47.0277099609375, + "loss": 0.5756, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.770496129989624, + "rewards/margins": 0.5428311824798584, + "rewards/rejected": 2.2276649475097656, + "step": 2814 + }, + { + "epoch": 0.46, + "learning_rate": 9.002491393507127e-06, + "logits/chosen": -0.9130786061286926, + "logits/rejected": -0.9188826084136963, + "logps/chosen": -49.853302001953125, + "logps/rejected": -74.81259155273438, + "loss": 0.2643, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3485145568847656, + "rewards/margins": 1.123814344406128, + "rewards/rejected": 1.2247002124786377, + "step": 2815 + }, + { + "epoch": 0.46, + "learning_rate": 9.001703576785344e-06, + "logits/chosen": -1.140884280204773, + "logits/rejected": -1.1524295806884766, + "logps/chosen": -33.94533920288086, + "logps/rejected": -35.041831970214844, + "loss": 1.054, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.566812515258789, + "rewards/margins": -0.09444475173950195, + "rewards/rejected": 2.661257266998291, + "step": 2816 + }, + { + "epoch": 0.46, + "learning_rate": 9.00091548358371e-06, + "logits/chosen": -1.2394678592681885, + "logits/rejected": -1.2653312683105469, + "logps/chosen": -176.02886962890625, + "logps/rejected": -49.093994140625, + "loss": 0.4377, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.755941867828369, + "rewards/margins": 3.273904800415039, + "rewards/rejected": 1.4820369482040405, + "step": 2817 + }, + { + "epoch": 0.46, + "learning_rate": 9.000127113956673e-06, + "logits/chosen": -0.5051687955856323, + "logits/rejected": -0.617741584777832, + "logps/chosen": -48.76844787597656, + "logps/rejected": -68.17958068847656, + "loss": 0.6915, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1211551427841187, + "rewards/margins": -0.6034039258956909, + "rewards/rejected": 1.7245590686798096, + "step": 2818 + }, + { + "epoch": 0.46, + "learning_rate": 8.999338467958703e-06, + "logits/chosen": -1.050820231437683, + "logits/rejected": -0.8436483144760132, + "logps/chosen": -405.73486328125, + "logps/rejected": -114.72883605957031, + "loss": 0.0713, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.026986598968506, + "rewards/margins": 2.1386489868164062, + "rewards/rejected": 1.8883377313613892, + "step": 2819 + }, + { + "epoch": 0.46, + "learning_rate": 8.998549545644285e-06, + "logits/chosen": -0.9531111121177673, + "logits/rejected": -0.9490797519683838, + "logps/chosen": -107.72201538085938, + "logps/rejected": -89.35627746582031, + "loss": 0.1988, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7401825189590454, + "rewards/margins": 0.8225067257881165, + "rewards/rejected": 0.917675793170929, + "step": 2820 + }, + { + "epoch": 0.46, + "learning_rate": 8.99776034706793e-06, + "logits/chosen": -1.2389839887619019, + "logits/rejected": -1.0885037183761597, + "logps/chosen": -136.1552734375, + "logps/rejected": -57.801971435546875, + "loss": 0.3507, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.876744270324707, + "rewards/margins": 2.3313393592834473, + "rewards/rejected": 3.5454049110412598, + "step": 2821 + }, + { + "epoch": 0.46, + "learning_rate": 8.996970872284158e-06, + "logits/chosen": -1.1540477275848389, + "logits/rejected": -1.139763593673706, + "logps/chosen": -74.14129638671875, + "logps/rejected": -123.14134216308594, + "loss": 0.1162, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0192573070526123, + "rewards/margins": 2.366142988204956, + "rewards/rejected": 0.6531143188476562, + "step": 2822 + }, + { + "epoch": 0.46, + "learning_rate": 8.996181121347522e-06, + "logits/chosen": -0.6939056515693665, + "logits/rejected": -0.83513343334198, + "logps/chosen": -71.14212036132812, + "logps/rejected": -61.81017303466797, + "loss": 1.0696, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3814407587051392, + "rewards/margins": -1.5259782075881958, + "rewards/rejected": 2.907418966293335, + "step": 2823 + }, + { + "epoch": 0.46, + "learning_rate": 8.995391094312583e-06, + "logits/chosen": -1.1730167865753174, + "logits/rejected": -1.2206897735595703, + "logps/chosen": -36.926300048828125, + "logps/rejected": -48.92692184448242, + "loss": 0.4173, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4493820667266846, + "rewards/margins": 0.2402033805847168, + "rewards/rejected": 2.2091786861419678, + "step": 2824 + }, + { + "epoch": 0.46, + "learning_rate": 8.994600791233924e-06, + "logits/chosen": -0.4722420275211334, + "logits/rejected": -0.40280619263648987, + "logps/chosen": -36.570335388183594, + "logps/rejected": -42.51119613647461, + "loss": 1.0677, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9235267639160156, + "rewards/margins": -0.2426280975341797, + "rewards/rejected": 2.1661548614501953, + "step": 2825 + }, + { + "epoch": 0.46, + "learning_rate": 8.993810212166147e-06, + "logits/chosen": -0.9963010549545288, + "logits/rejected": -0.981191873550415, + "logps/chosen": -94.3676986694336, + "logps/rejected": -48.692108154296875, + "loss": 2.4685, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6660149097442627, + "rewards/margins": 0.26389622688293457, + "rewards/rejected": 1.4021186828613281, + "step": 2826 + }, + { + "epoch": 0.46, + "learning_rate": 8.993019357163872e-06, + "logits/chosen": -1.3026642799377441, + "logits/rejected": -1.2422677278518677, + "logps/chosen": -53.82404327392578, + "logps/rejected": -35.274009704589844, + "loss": 1.1658, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4948402643203735, + "rewards/margins": 1.2519066333770752, + "rewards/rejected": 0.24293366074562073, + "step": 2827 + }, + { + "epoch": 0.46, + "learning_rate": 8.992228226281745e-06, + "logits/chosen": -0.7434683442115784, + "logits/rejected": -0.7276577949523926, + "logps/chosen": -48.390045166015625, + "logps/rejected": -33.3250617980957, + "loss": 0.4636, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0946807861328125, + "rewards/margins": 0.05636405944824219, + "rewards/rejected": 3.0383167266845703, + "step": 2828 + }, + { + "epoch": 0.46, + "learning_rate": 8.991436819574421e-06, + "logits/chosen": -1.150006651878357, + "logits/rejected": -1.0983307361602783, + "logps/chosen": -55.850345611572266, + "logps/rejected": -58.406593322753906, + "loss": 0.1542, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8397634029388428, + "rewards/margins": 1.0463956594467163, + "rewards/rejected": 1.7933677434921265, + "step": 2829 + }, + { + "epoch": 0.46, + "learning_rate": 8.99064513709658e-06, + "logits/chosen": -1.1546427011489868, + "logits/rejected": -1.1415101289749146, + "logps/chosen": -98.22148132324219, + "logps/rejected": -38.24233627319336, + "loss": 1.7867, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4412918090820312, + "rewards/margins": 1.072327733039856, + "rewards/rejected": 1.3689640760421753, + "step": 2830 + }, + { + "epoch": 0.46, + "learning_rate": 8.989853178902921e-06, + "logits/chosen": -0.7507598400115967, + "logits/rejected": -0.6473782062530518, + "logps/chosen": -99.71548461914062, + "logps/rejected": -26.993209838867188, + "loss": 0.1397, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5134658813476562, + "rewards/margins": 2.8019204139709473, + "rewards/rejected": -0.28845444321632385, + "step": 2831 + }, + { + "epoch": 0.46, + "learning_rate": 8.989060945048158e-06, + "logits/chosen": -0.7070825099945068, + "logits/rejected": -0.5920942425727844, + "logps/chosen": -100.86486053466797, + "logps/rejected": -27.336029052734375, + "loss": 2.8431, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3966057300567627, + "rewards/margins": 1.8776428699493408, + "rewards/rejected": 0.5189628601074219, + "step": 2832 + }, + { + "epoch": 0.46, + "learning_rate": 8.98826843558703e-06, + "logits/chosen": -1.0363974571228027, + "logits/rejected": -1.0849530696868896, + "logps/chosen": -102.29456329345703, + "logps/rejected": -106.66059875488281, + "loss": 0.3894, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0018670558929443, + "rewards/margins": 0.08542251586914062, + "rewards/rejected": 2.9164445400238037, + "step": 2833 + }, + { + "epoch": 0.46, + "learning_rate": 8.987475650574289e-06, + "logits/chosen": -0.9724906086921692, + "logits/rejected": -0.9725701808929443, + "logps/chosen": -90.4132080078125, + "logps/rejected": -124.91271209716797, + "loss": 0.1649, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7128822803497314, + "rewards/margins": 0.969563364982605, + "rewards/rejected": 1.7433189153671265, + "step": 2834 + }, + { + "epoch": 0.46, + "learning_rate": 8.98668259006471e-06, + "logits/chosen": -0.6601648330688477, + "logits/rejected": -0.6601648330688477, + "logps/chosen": -1.223179817199707, + "logps/rejected": -1.223179817199707, + "loss": 0.7473, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.14738915860652924, + "rewards/margins": 0.0, + "rewards/rejected": 0.14738915860652924, + "step": 2835 + }, + { + "epoch": 0.46, + "learning_rate": 8.985889254113087e-06, + "logits/chosen": -0.7378827333450317, + "logits/rejected": -0.738671064376831, + "logps/chosen": -58.647857666015625, + "logps/rejected": -62.020172119140625, + "loss": 0.6778, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6477257013320923, + "rewards/margins": 0.062250494956970215, + "rewards/rejected": 1.585475206375122, + "step": 2836 + }, + { + "epoch": 0.46, + "learning_rate": 8.985095642774233e-06, + "logits/chosen": -0.7962009310722351, + "logits/rejected": -0.7334416508674622, + "logps/chosen": -68.05126953125, + "logps/rejected": -242.96450805664062, + "loss": 2.5333, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9113960266113281, + "rewards/margins": -3.896636486053467, + "rewards/rejected": 5.808032512664795, + "step": 2837 + }, + { + "epoch": 0.46, + "learning_rate": 8.984301756102977e-06, + "logits/chosen": -0.7240849733352661, + "logits/rejected": -0.7614006400108337, + "logps/chosen": -56.77473831176758, + "logps/rejected": -53.48748779296875, + "loss": 0.7955, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9473240375518799, + "rewards/margins": -0.374467134475708, + "rewards/rejected": 2.321791172027588, + "step": 2838 + }, + { + "epoch": 0.46, + "learning_rate": 8.983507594154168e-06, + "logits/chosen": -1.1912256479263306, + "logits/rejected": -1.1106704473495483, + "logps/chosen": -183.99032592773438, + "logps/rejected": -77.72445678710938, + "loss": 0.2015, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.196539402008057, + "rewards/margins": 1.332963466644287, + "rewards/rejected": 4.8635759353637695, + "step": 2839 + }, + { + "epoch": 0.46, + "learning_rate": 8.982713156982678e-06, + "logits/chosen": -1.0318183898925781, + "logits/rejected": -1.06716787815094, + "logps/chosen": -142.56002807617188, + "logps/rejected": -110.1364517211914, + "loss": 0.9091, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.181085109710693, + "rewards/margins": 2.3214759826660156, + "rewards/rejected": 1.8596092462539673, + "step": 2840 + }, + { + "epoch": 0.46, + "learning_rate": 8.981918444643393e-06, + "logits/chosen": -0.8725177645683289, + "logits/rejected": -0.9431917667388916, + "logps/chosen": -30.211023330688477, + "logps/rejected": -65.55003356933594, + "loss": 2.0129, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6257559061050415, + "rewards/margins": -2.173215866088867, + "rewards/rejected": 3.798971652984619, + "step": 2841 + }, + { + "epoch": 0.46, + "learning_rate": 8.98112345719122e-06, + "logits/chosen": -1.0399630069732666, + "logits/rejected": -1.019428014755249, + "logps/chosen": -124.17449188232422, + "logps/rejected": -87.13038635253906, + "loss": 0.2651, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.535677433013916, + "rewards/margins": 0.8457512855529785, + "rewards/rejected": 1.6899261474609375, + "step": 2842 + }, + { + "epoch": 0.46, + "learning_rate": 8.980328194681088e-06, + "logits/chosen": -0.691768229007721, + "logits/rejected": -0.584290087223053, + "logps/chosen": -123.46261596679688, + "logps/rejected": -37.04007339477539, + "loss": 0.2444, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.188210964202881, + "rewards/margins": 1.927776575088501, + "rewards/rejected": 2.26043438911438, + "step": 2843 + }, + { + "epoch": 0.46, + "learning_rate": 8.979532657167937e-06, + "logits/chosen": -0.7458242177963257, + "logits/rejected": -0.7637706995010376, + "logps/chosen": -71.95919799804688, + "logps/rejected": -98.04679107666016, + "loss": 1.7547, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4534707069396973, + "rewards/margins": 2.3115057945251465, + "rewards/rejected": 1.1419647932052612, + "step": 2844 + }, + { + "epoch": 0.46, + "learning_rate": 8.978736844706735e-06, + "logits/chosen": -1.09701406955719, + "logits/rejected": -1.0312923192977905, + "logps/chosen": -54.25434875488281, + "logps/rejected": -43.0850715637207, + "loss": 0.9122, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.9230849742889404, + "rewards/margins": -0.7738721370697021, + "rewards/rejected": 4.696957111358643, + "step": 2845 + }, + { + "epoch": 0.46, + "learning_rate": 8.977940757352465e-06, + "logits/chosen": -1.2024809122085571, + "logits/rejected": -1.0805937051773071, + "logps/chosen": -137.11050415039062, + "logps/rejected": -70.80474090576172, + "loss": 0.1374, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.054785251617432, + "rewards/margins": 1.3368103504180908, + "rewards/rejected": 3.717974901199341, + "step": 2846 + }, + { + "epoch": 0.46, + "learning_rate": 8.97714439516013e-06, + "logits/chosen": -0.6999064683914185, + "logits/rejected": -0.6999064683914185, + "logps/chosen": -0.9850645661354065, + "logps/rejected": -0.9850645661354065, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.27269282937049866, + "rewards/margins": 0.0, + "rewards/rejected": 0.27269282937049866, + "step": 2847 + }, + { + "epoch": 0.46, + "learning_rate": 8.976347758184745e-06, + "logits/chosen": -0.8723025918006897, + "logits/rejected": -1.0521775484085083, + "logps/chosen": -72.59062194824219, + "logps/rejected": -122.46575927734375, + "loss": 1.3434, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9219635725021362, + "rewards/margins": -2.5921082496643066, + "rewards/rejected": 4.514071941375732, + "step": 2848 + }, + { + "epoch": 0.46, + "learning_rate": 8.975550846481359e-06, + "logits/chosen": -1.1425108909606934, + "logits/rejected": -1.0762138366699219, + "logps/chosen": -48.93604278564453, + "logps/rejected": -63.26224899291992, + "loss": 1.2027, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8781098127365112, + "rewards/margins": 1.472127914428711, + "rewards/rejected": 0.4059818387031555, + "step": 2849 + }, + { + "epoch": 0.46, + "learning_rate": 8.974753660105023e-06, + "logits/chosen": -0.6454949378967285, + "logits/rejected": -0.6552102565765381, + "logps/chosen": -8.91451644897461, + "logps/rejected": -3.983281135559082, + "loss": 0.762, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0932827964425087, + "rewards/margins": -0.16110444068908691, + "rewards/rejected": 0.254387229681015, + "step": 2850 + }, + { + "epoch": 0.46, + "learning_rate": 8.97395619911082e-06, + "logits/chosen": -0.8371603488922119, + "logits/rejected": -0.86065673828125, + "logps/chosen": -88.216064453125, + "logps/rejected": -87.8876953125, + "loss": 0.4844, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9025055170059204, + "rewards/margins": -0.33694469928741455, + "rewards/rejected": 2.239450216293335, + "step": 2851 + }, + { + "epoch": 0.46, + "learning_rate": 8.973158463553847e-06, + "logits/chosen": -0.9932941198348999, + "logits/rejected": -0.8591756820678711, + "logps/chosen": -112.51211547851562, + "logps/rejected": -133.86773681640625, + "loss": 0.1476, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.591870307922363, + "rewards/margins": 3.8407914638519287, + "rewards/rejected": 1.7510788440704346, + "step": 2852 + }, + { + "epoch": 0.46, + "learning_rate": 8.972360453489215e-06, + "logits/chosen": -1.1072545051574707, + "logits/rejected": -1.095119833946228, + "logps/chosen": -50.18998718261719, + "logps/rejected": -88.33560180664062, + "loss": 0.2979, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3099822998046875, + "rewards/margins": 0.4435318112373352, + "rewards/rejected": 0.8664504885673523, + "step": 2853 + }, + { + "epoch": 0.46, + "learning_rate": 8.971562168972065e-06, + "logits/chosen": -0.9490060210227966, + "logits/rejected": -0.9678670763969421, + "logps/chosen": -96.63168334960938, + "logps/rejected": -121.15876770019531, + "loss": 0.6308, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6373627185821533, + "rewards/margins": 2.2657501697540283, + "rewards/rejected": 1.371612548828125, + "step": 2854 + }, + { + "epoch": 0.46, + "learning_rate": 8.970763610057546e-06, + "logits/chosen": -1.3374983072280884, + "logits/rejected": -1.3209401369094849, + "logps/chosen": -89.52253723144531, + "logps/rejected": -68.3750228881836, + "loss": 1.0697, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6818466186523438, + "rewards/margins": -1.062432050704956, + "rewards/rejected": 2.7442786693573, + "step": 2855 + }, + { + "epoch": 0.46, + "learning_rate": 8.969964776800836e-06, + "logits/chosen": -1.1124403476715088, + "logits/rejected": -1.0202581882476807, + "logps/chosen": -141.45553588867188, + "logps/rejected": -137.6501007080078, + "loss": 0.0868, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.369691371917725, + "rewards/margins": 1.739715576171875, + "rewards/rejected": 4.62997579574585, + "step": 2856 + }, + { + "epoch": 0.46, + "learning_rate": 8.969165669257123e-06, + "logits/chosen": -0.876624345779419, + "logits/rejected": -0.8691173195838928, + "logps/chosen": -76.3641357421875, + "logps/rejected": -96.46954345703125, + "loss": 1.4744, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3581405878067017, + "rewards/margins": -2.714573860168457, + "rewards/rejected": 4.072714328765869, + "step": 2857 + }, + { + "epoch": 0.46, + "learning_rate": 8.968366287481621e-06, + "logits/chosen": -1.3029470443725586, + "logits/rejected": -1.2354894876480103, + "logps/chosen": -146.009521484375, + "logps/rejected": -292.46942138671875, + "loss": 1.0685, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.584967017173767, + "rewards/margins": -1.2093933820724487, + "rewards/rejected": 2.794360399246216, + "step": 2858 + }, + { + "epoch": 0.46, + "learning_rate": 8.967566631529553e-06, + "logits/chosen": -1.2318341732025146, + "logits/rejected": -1.0173395872116089, + "logps/chosen": -121.49718475341797, + "logps/rejected": -57.891239166259766, + "loss": 0.2952, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.4304633140563965, + "rewards/margins": 5.592418670654297, + "rewards/rejected": 1.8380444049835205, + "step": 2859 + }, + { + "epoch": 0.46, + "learning_rate": 8.966766701456177e-06, + "logits/chosen": -0.9519075155258179, + "logits/rejected": -1.0004078149795532, + "logps/chosen": -34.72753143310547, + "logps/rejected": -115.13894653320312, + "loss": 1.0646, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.552775263786316, + "rewards/margins": -0.2759082317352295, + "rewards/rejected": 1.8286834955215454, + "step": 2860 + }, + { + "epoch": 0.46, + "learning_rate": 8.965966497316754e-06, + "logits/chosen": -1.0480610132217407, + "logits/rejected": -1.0480610132217407, + "logps/chosen": -69.0875473022461, + "logps/rejected": -69.0875473022461, + "loss": 0.3637, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5770394802093506, + "rewards/margins": 0.0, + "rewards/rejected": 3.5770394802093506, + "step": 2861 + }, + { + "epoch": 0.46, + "learning_rate": 8.96516601916657e-06, + "logits/chosen": -0.8047402501106262, + "logits/rejected": -0.8762883543968201, + "logps/chosen": -44.337249755859375, + "logps/rejected": -95.08665466308594, + "loss": 0.9446, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6272525787353516, + "rewards/margins": 0.12353247404098511, + "rewards/rejected": 0.5037201046943665, + "step": 2862 + }, + { + "epoch": 0.46, + "learning_rate": 8.964365267060936e-06, + "logits/chosen": -0.8134000897407532, + "logits/rejected": -0.708666205406189, + "logps/chosen": -49.69473648071289, + "logps/rejected": -80.2509994506836, + "loss": 0.237, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9543042182922363, + "rewards/margins": 1.361865758895874, + "rewards/rejected": 2.5924384593963623, + "step": 2863 + }, + { + "epoch": 0.46, + "learning_rate": 8.963564241055172e-06, + "logits/chosen": -1.0946682691574097, + "logits/rejected": -1.1952279806137085, + "logps/chosen": -71.71398162841797, + "logps/rejected": -83.86553192138672, + "loss": 1.6351, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7215713262557983, + "rewards/margins": -3.2145347595214844, + "rewards/rejected": 4.936106204986572, + "step": 2864 + }, + { + "epoch": 0.47, + "learning_rate": 8.962762941204623e-06, + "logits/chosen": -0.8516660928726196, + "logits/rejected": -0.6932076215744019, + "logps/chosen": -41.19510269165039, + "logps/rejected": -36.37513732910156, + "loss": 0.2315, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8872768878936768, + "rewards/margins": 0.8893020749092102, + "rewards/rejected": 0.9979748129844666, + "step": 2865 + }, + { + "epoch": 0.47, + "learning_rate": 8.961961367564652e-06, + "logits/chosen": -0.8615169525146484, + "logits/rejected": -0.8136593699455261, + "logps/chosen": -25.575336456298828, + "logps/rejected": -86.04947662353516, + "loss": 2.9425, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8687286376953125, + "rewards/margins": -4.904937267303467, + "rewards/rejected": 5.773665904998779, + "step": 2866 + }, + { + "epoch": 0.47, + "learning_rate": 8.961159520190637e-06, + "logits/chosen": -1.04767644405365, + "logits/rejected": -0.9685817956924438, + "logps/chosen": -56.606964111328125, + "logps/rejected": -27.484567642211914, + "loss": 0.938, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.501715064048767, + "rewards/margins": 0.8172376155853271, + "rewards/rejected": 0.6844774484634399, + "step": 2867 + }, + { + "epoch": 0.47, + "learning_rate": 8.96035739913798e-06, + "logits/chosen": -0.8848148584365845, + "logits/rejected": -0.9059154987335205, + "logps/chosen": -76.40724182128906, + "logps/rejected": -143.6439666748047, + "loss": 2.1895, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.636834740638733, + "rewards/margins": -4.295292854309082, + "rewards/rejected": 5.932127475738525, + "step": 2868 + }, + { + "epoch": 0.47, + "learning_rate": 8.9595550044621e-06, + "logits/chosen": -1.3113412857055664, + "logits/rejected": -1.2267124652862549, + "logps/chosen": -65.61174011230469, + "logps/rejected": -44.015541076660156, + "loss": 0.7035, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8900253772735596, + "rewards/margins": -0.27590489387512207, + "rewards/rejected": 2.1659302711486816, + "step": 2869 + }, + { + "epoch": 0.47, + "learning_rate": 8.958752336218434e-06, + "logits/chosen": -1.3782628774642944, + "logits/rejected": -1.2470245361328125, + "logps/chosen": -84.89767456054688, + "logps/rejected": -57.69304656982422, + "loss": 0.2595, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2426345348358154, + "rewards/margins": 1.512925624847412, + "rewards/rejected": 0.7297088503837585, + "step": 2870 + }, + { + "epoch": 0.47, + "learning_rate": 8.957949394462442e-06, + "logits/chosen": -1.0105851888656616, + "logits/rejected": -1.0716837644577026, + "logps/chosen": -132.1944122314453, + "logps/rejected": -141.97323608398438, + "loss": 2.479, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9044387936592102, + "rewards/margins": -4.921656608581543, + "rewards/rejected": 5.8260955810546875, + "step": 2871 + }, + { + "epoch": 0.47, + "learning_rate": 8.957146179249597e-06, + "logits/chosen": -1.0521438121795654, + "logits/rejected": -1.1911201477050781, + "logps/chosen": -64.20016479492188, + "logps/rejected": -27.251052856445312, + "loss": 2.0657, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.260033369064331, + "rewards/margins": 0.7406352758407593, + "rewards/rejected": 1.5193980932235718, + "step": 2872 + }, + { + "epoch": 0.47, + "learning_rate": 8.95634269063539e-06, + "logits/chosen": -1.3394055366516113, + "logits/rejected": -1.153971552848816, + "logps/chosen": -132.70559692382812, + "logps/rejected": -51.43941116333008, + "loss": 0.9593, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.874871850013733, + "rewards/margins": -1.4903613328933716, + "rewards/rejected": 3.3652331829071045, + "step": 2873 + }, + { + "epoch": 0.47, + "learning_rate": 8.955538928675343e-06, + "logits/chosen": -0.8183132410049438, + "logits/rejected": -0.8002040982246399, + "logps/chosen": -26.716127395629883, + "logps/rejected": -21.4383544921875, + "loss": 0.3492, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9660493731498718, + "rewards/margins": 0.18008077144622803, + "rewards/rejected": 0.7859686017036438, + "step": 2874 + }, + { + "epoch": 0.47, + "learning_rate": 8.954734893424981e-06, + "logits/chosen": -1.083910584449768, + "logits/rejected": -0.9950361847877502, + "logps/chosen": -80.78704833984375, + "logps/rejected": -61.490394592285156, + "loss": 0.5091, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.648537516593933, + "rewards/margins": 0.28275835514068604, + "rewards/rejected": 1.365779161453247, + "step": 2875 + }, + { + "epoch": 0.47, + "learning_rate": 8.95393058493986e-06, + "logits/chosen": -0.9714767336845398, + "logits/rejected": -0.9381241798400879, + "logps/chosen": -68.97135925292969, + "logps/rejected": -56.14393615722656, + "loss": 0.6596, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.036625623703003, + "rewards/margins": -0.3972504138946533, + "rewards/rejected": 2.4338760375976562, + "step": 2876 + }, + { + "epoch": 0.47, + "learning_rate": 8.953126003275548e-06, + "logits/chosen": -1.1539417505264282, + "logits/rejected": -1.1614731550216675, + "logps/chosen": -70.25642395019531, + "logps/rejected": -34.74107360839844, + "loss": 1.0173, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.559565782546997, + "rewards/margins": -1.4380149841308594, + "rewards/rejected": 2.9975807666778564, + "step": 2877 + }, + { + "epoch": 0.47, + "learning_rate": 8.952321148487632e-06, + "logits/chosen": -1.0052121877670288, + "logits/rejected": -0.9728103280067444, + "logps/chosen": -85.79537963867188, + "logps/rejected": -57.88817596435547, + "loss": 0.2271, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.973391056060791, + "rewards/margins": 0.5641839504241943, + "rewards/rejected": 2.4092071056365967, + "step": 2878 + }, + { + "epoch": 0.47, + "learning_rate": 8.951516020631723e-06, + "logits/chosen": -0.43856194615364075, + "logits/rejected": -0.4902304708957672, + "logps/chosen": -6.996115684509277, + "logps/rejected": -55.577693939208984, + "loss": 0.4413, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3744846284389496, + "rewards/margins": -0.3450336754322052, + "rewards/rejected": 0.7195183038711548, + "step": 2879 + }, + { + "epoch": 0.47, + "learning_rate": 8.950710619763445e-06, + "logits/chosen": -0.7190291881561279, + "logits/rejected": -0.7190291881561279, + "logps/chosen": -85.03857421875, + "logps/rejected": -85.03857421875, + "loss": 0.358, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.491352915763855, + "rewards/margins": 0.0, + "rewards/rejected": 1.491352915763855, + "step": 2880 + }, + { + "epoch": 0.47, + "learning_rate": 8.949904945938448e-06, + "logits/chosen": -1.1226261854171753, + "logits/rejected": -1.101433515548706, + "logps/chosen": -74.30427551269531, + "logps/rejected": -74.02326965332031, + "loss": 1.0425, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.828159332275391, + "rewards/margins": 1.4702880382537842, + "rewards/rejected": 3.3578712940216064, + "step": 2881 + }, + { + "epoch": 0.47, + "learning_rate": 8.94909899921239e-06, + "logits/chosen": -0.7886596918106079, + "logits/rejected": -0.7433167099952698, + "logps/chosen": -99.36909484863281, + "logps/rejected": -45.04609680175781, + "loss": 0.4595, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1328247785568237, + "rewards/margins": -0.38636279106140137, + "rewards/rejected": 1.519187569618225, + "step": 2882 + }, + { + "epoch": 0.47, + "learning_rate": 8.948292779640961e-06, + "logits/chosen": -0.8635314106941223, + "logits/rejected": -0.8433211445808411, + "logps/chosen": -67.7188949584961, + "logps/rejected": -128.41429138183594, + "loss": 0.4581, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.658532619476318, + "rewards/margins": -0.31762266159057617, + "rewards/rejected": 4.9761552810668945, + "step": 2883 + }, + { + "epoch": 0.47, + "learning_rate": 8.947486287279859e-06, + "logits/chosen": -0.9349709153175354, + "logits/rejected": -0.8808729648590088, + "logps/chosen": -71.93244934082031, + "logps/rejected": -118.17610931396484, + "loss": 0.7772, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.14347243309021, + "rewards/margins": -1.3132758140563965, + "rewards/rejected": 3.4567482471466064, + "step": 2884 + }, + { + "epoch": 0.47, + "learning_rate": 8.946679522184807e-06, + "logits/chosen": -0.9237927794456482, + "logits/rejected": -0.9894058704376221, + "logps/chosen": -135.12173461914062, + "logps/rejected": -129.21661376953125, + "loss": 0.2004, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3486924171447754, + "rewards/margins": 1.4665513038635254, + "rewards/rejected": 1.88214111328125, + "step": 2885 + }, + { + "epoch": 0.47, + "learning_rate": 8.945872484411543e-06, + "logits/chosen": -0.7944230437278748, + "logits/rejected": -0.7561151385307312, + "logps/chosen": -143.8537139892578, + "logps/rejected": -45.89556121826172, + "loss": 0.4793, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.171520948410034, + "rewards/margins": -0.322206974029541, + "rewards/rejected": 2.493727922439575, + "step": 2886 + }, + { + "epoch": 0.47, + "learning_rate": 8.945065174015825e-06, + "logits/chosen": -0.6660142540931702, + "logits/rejected": -0.65256267786026, + "logps/chosen": -92.32763671875, + "logps/rejected": -64.65980529785156, + "loss": 1.6612, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.722921848297119, + "rewards/margins": -0.2709808349609375, + "rewards/rejected": 2.9939026832580566, + "step": 2887 + }, + { + "epoch": 0.47, + "learning_rate": 8.944257591053433e-06, + "logits/chosen": -1.51012122631073, + "logits/rejected": -1.376443862915039, + "logps/chosen": -163.16641235351562, + "logps/rejected": -127.53550720214844, + "loss": 0.4396, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.1519775390625, + "rewards/margins": 0.5577802658081055, + "rewards/rejected": 4.5941972732543945, + "step": 2888 + }, + { + "epoch": 0.47, + "learning_rate": 8.943449735580163e-06, + "logits/chosen": -0.6761447787284851, + "logits/rejected": -0.6761447787284851, + "logps/chosen": -48.45842361450195, + "logps/rejected": -48.45842361450195, + "loss": 0.4441, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1971698999404907, + "rewards/margins": 0.0, + "rewards/rejected": 1.1971698999404907, + "step": 2889 + }, + { + "epoch": 0.47, + "learning_rate": 8.94264160765183e-06, + "logits/chosen": -0.9182702302932739, + "logits/rejected": -0.8995966911315918, + "logps/chosen": -106.74006652832031, + "logps/rejected": -132.35324096679688, + "loss": 2.4018, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.080587863922119, + "rewards/margins": -4.481951713562012, + "rewards/rejected": 7.562539577484131, + "step": 2890 + }, + { + "epoch": 0.47, + "learning_rate": 8.941833207324267e-06, + "logits/chosen": -0.5415586829185486, + "logits/rejected": -0.5429473519325256, + "logps/chosen": -52.09851837158203, + "logps/rejected": -40.90770721435547, + "loss": 1.5738, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7204673290252686, + "rewards/margins": -0.042884111404418945, + "rewards/rejected": 2.7633514404296875, + "step": 2891 + }, + { + "epoch": 0.47, + "learning_rate": 8.941024534653326e-06, + "logits/chosen": -0.7663821578025818, + "logits/rejected": -0.7276403903961182, + "logps/chosen": -41.9017333984375, + "logps/rejected": -47.81591796875, + "loss": 2.2231, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9224518537521362, + "rewards/margins": -0.5299254655838013, + "rewards/rejected": 2.4523773193359375, + "step": 2892 + }, + { + "epoch": 0.47, + "learning_rate": 8.940215589694882e-06, + "logits/chosen": -1.2102795839309692, + "logits/rejected": -1.212607502937317, + "logps/chosen": -102.98312377929688, + "logps/rejected": -51.61890411376953, + "loss": 0.6735, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.100383758544922, + "rewards/margins": 0.08008718490600586, + "rewards/rejected": 2.020296573638916, + "step": 2893 + }, + { + "epoch": 0.47, + "learning_rate": 8.939406372504823e-06, + "logits/chosen": -0.9987138509750366, + "logits/rejected": -0.9199525713920593, + "logps/chosen": -178.76654052734375, + "logps/rejected": -158.3575439453125, + "loss": 0.7868, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.593911647796631, + "rewards/margins": -0.6959781646728516, + "rewards/rejected": 6.289889812469482, + "step": 2894 + }, + { + "epoch": 0.47, + "learning_rate": 8.938596883139058e-06, + "logits/chosen": -0.7380850315093994, + "logits/rejected": -0.7470872402191162, + "logps/chosen": -4.432803153991699, + "logps/rejected": -1.7330174446105957, + "loss": 0.4252, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.16414670646190643, + "rewards/margins": -0.1403772085905075, + "rewards/rejected": 0.30452391505241394, + "step": 2895 + }, + { + "epoch": 0.47, + "learning_rate": 8.937787121653517e-06, + "logits/chosen": -1.1191486120224, + "logits/rejected": -1.1426869630813599, + "logps/chosen": -76.08872985839844, + "logps/rejected": -103.20646667480469, + "loss": 0.3628, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2029807567596436, + "rewards/margins": 0.0018987655639648438, + "rewards/rejected": 3.2010819911956787, + "step": 2896 + }, + { + "epoch": 0.47, + "learning_rate": 8.936977088104144e-06, + "logits/chosen": -1.0996968746185303, + "logits/rejected": -1.1701680421829224, + "logps/chosen": -128.73690795898438, + "logps/rejected": -166.50701904296875, + "loss": 1.7783, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.64145040512085, + "rewards/margins": -3.4444918632507324, + "rewards/rejected": 8.085942268371582, + "step": 2897 + }, + { + "epoch": 0.47, + "learning_rate": 8.936166782546907e-06, + "logits/chosen": -1.2926435470581055, + "logits/rejected": -1.0472265481948853, + "logps/chosen": -278.2098388671875, + "logps/rejected": -129.58236694335938, + "loss": 1.3018, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.840634346008301, + "rewards/margins": 1.4323487281799316, + "rewards/rejected": 5.408285617828369, + "step": 2898 + }, + { + "epoch": 0.47, + "learning_rate": 8.935356205037788e-06, + "logits/chosen": -1.026669979095459, + "logits/rejected": -1.1263433694839478, + "logps/chosen": -68.99467468261719, + "logps/rejected": -88.25035095214844, + "loss": 1.1516, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2146774530410767, + "rewards/margins": -2.1764283180236816, + "rewards/rejected": 3.3911056518554688, + "step": 2899 + }, + { + "epoch": 0.47, + "learning_rate": 8.934545355632794e-06, + "logits/chosen": -1.0722174644470215, + "logits/rejected": -0.680452287197113, + "logps/chosen": -102.75617980957031, + "logps/rejected": -32.775787353515625, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.940687656402588, + "rewards/margins": 6.444726467132568, + "rewards/rejected": 0.49596139788627625, + "step": 2900 + }, + { + "epoch": 0.47, + "learning_rate": 8.933734234387944e-06, + "logits/chosen": -0.7988722920417786, + "logits/rejected": -0.7283421754837036, + "logps/chosen": -63.97315216064453, + "logps/rejected": -66.26500701904297, + "loss": 0.4847, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7728355526924133, + "rewards/margins": -0.4772239327430725, + "rewards/rejected": 1.2500594854354858, + "step": 2901 + }, + { + "epoch": 0.47, + "learning_rate": 8.93292284135928e-06, + "logits/chosen": -0.7036992907524109, + "logits/rejected": -0.7036992907524109, + "logps/chosen": -60.24802017211914, + "logps/rejected": -60.24802017211914, + "loss": 2.1643, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.068419337272644, + "rewards/margins": 0.0, + "rewards/rejected": 1.068419337272644, + "step": 2902 + }, + { + "epoch": 0.47, + "learning_rate": 8.932111176602862e-06, + "logits/chosen": -0.5158720016479492, + "logits/rejected": -0.5158720016479492, + "logps/chosen": -5.5901994705200195, + "logps/rejected": -5.5901994705200195, + "loss": 0.3469, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7855088114738464, + "rewards/margins": 0.0, + "rewards/rejected": 0.7855088114738464, + "step": 2903 + }, + { + "epoch": 0.47, + "learning_rate": 8.931299240174767e-06, + "logits/chosen": -1.1240851879119873, + "logits/rejected": -0.9891506433486938, + "logps/chosen": -162.34591674804688, + "logps/rejected": -75.31832885742188, + "loss": 0.2409, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.071142673492432, + "rewards/margins": 1.8228964805603027, + "rewards/rejected": 4.248246192932129, + "step": 2904 + }, + { + "epoch": 0.47, + "learning_rate": 8.930487032131092e-06, + "logits/chosen": -1.0144668817520142, + "logits/rejected": -1.0144668817520142, + "logps/chosen": -57.280696868896484, + "logps/rejected": -57.280696868896484, + "loss": 0.8424, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.745482325553894, + "rewards/margins": 0.0, + "rewards/rejected": 1.745482325553894, + "step": 2905 + }, + { + "epoch": 0.47, + "learning_rate": 8.929674552527956e-06, + "logits/chosen": -0.5416520237922668, + "logits/rejected": -0.5580481290817261, + "logps/chosen": -49.25517654418945, + "logps/rejected": -45.24370193481445, + "loss": 0.9834, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3869892358779907, + "rewards/margins": -0.8006125688552856, + "rewards/rejected": 2.1876018047332764, + "step": 2906 + }, + { + "epoch": 0.47, + "learning_rate": 8.928861801421489e-06, + "logits/chosen": -0.7724618315696716, + "logits/rejected": -0.6903312802314758, + "logps/chosen": -73.90794372558594, + "logps/rejected": -61.467124938964844, + "loss": 0.7815, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8130699396133423, + "rewards/margins": 0.11102747917175293, + "rewards/rejected": 1.7020424604415894, + "step": 2907 + }, + { + "epoch": 0.47, + "learning_rate": 8.928048778867848e-06, + "logits/chosen": -0.9084827303886414, + "logits/rejected": -0.948406457901001, + "logps/chosen": -61.793182373046875, + "logps/rejected": -50.45613098144531, + "loss": 0.6833, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3328263759613037, + "rewards/margins": 0.02350783348083496, + "rewards/rejected": 3.3093185424804688, + "step": 2908 + }, + { + "epoch": 0.47, + "learning_rate": 8.927235484923203e-06, + "logits/chosen": -0.8064321279525757, + "logits/rejected": -0.9630995392799377, + "logps/chosen": -99.40318298339844, + "logps/rejected": -128.5574951171875, + "loss": 1.7363, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6053695678710938, + "rewards/margins": -3.370466709136963, + "rewards/rejected": 4.975836277008057, + "step": 2909 + }, + { + "epoch": 0.47, + "learning_rate": 8.926421919643746e-06, + "logits/chosen": -1.0885685682296753, + "logits/rejected": -1.1858294010162354, + "logps/chosen": -107.90453338623047, + "logps/rejected": -131.4913787841797, + "loss": 3.4054, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6998131275177, + "rewards/margins": -6.682839393615723, + "rewards/rejected": 9.382652282714844, + "step": 2910 + }, + { + "epoch": 0.47, + "learning_rate": 8.925608083085689e-06, + "logits/chosen": -0.7815772891044617, + "logits/rejected": -0.7804787158966064, + "logps/chosen": -0.5710437893867493, + "logps/rejected": -36.61487579345703, + "loss": 0.3874, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1024436503648758, + "rewards/margins": -0.03072667121887207, + "rewards/rejected": 0.13317032158374786, + "step": 2911 + }, + { + "epoch": 0.47, + "learning_rate": 8.924793975305254e-06, + "logits/chosen": -0.6933062076568604, + "logits/rejected": -0.7582195997238159, + "logps/chosen": -72.82917785644531, + "logps/rejected": -86.73049926757812, + "loss": 0.2657, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.707783579826355, + "rewards/margins": 0.77818763256073, + "rewards/rejected": 0.929595947265625, + "step": 2912 + }, + { + "epoch": 0.47, + "learning_rate": 8.923979596358694e-06, + "logits/chosen": -1.0856938362121582, + "logits/rejected": -1.012813925743103, + "logps/chosen": -50.286033630371094, + "logps/rejected": -68.06562805175781, + "loss": 0.4672, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.077500104904175, + "rewards/margins": 1.9652899503707886, + "rewards/rejected": 1.1122101545333862, + "step": 2913 + }, + { + "epoch": 0.47, + "learning_rate": 8.923164946302274e-06, + "logits/chosen": -0.8438593745231628, + "logits/rejected": -0.92452472448349, + "logps/chosen": -67.853515625, + "logps/rejected": -67.52019500732422, + "loss": 1.2318, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.34281325340271, + "rewards/margins": -1.904792070388794, + "rewards/rejected": 4.247605323791504, + "step": 2914 + }, + { + "epoch": 0.47, + "learning_rate": 8.922350025192275e-06, + "logits/chosen": -1.4179385900497437, + "logits/rejected": -1.4609156847000122, + "logps/chosen": -83.76022338867188, + "logps/rejected": -75.34832763671875, + "loss": 2.3825, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3854362964630127, + "rewards/margins": -0.17712020874023438, + "rewards/rejected": 2.562556505203247, + "step": 2915 + }, + { + "epoch": 0.47, + "learning_rate": 8.921534833085005e-06, + "logits/chosen": -0.765129566192627, + "logits/rejected": -0.766234815120697, + "logps/chosen": -3.0743303298950195, + "logps/rejected": -2.061157464981079, + "loss": 0.3819, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.28511843085289, + "rewards/margins": -0.1359037160873413, + "rewards/rejected": 0.4210221469402313, + "step": 2916 + }, + { + "epoch": 0.47, + "learning_rate": 8.920719370036783e-06, + "logits/chosen": -1.0064204931259155, + "logits/rejected": -1.1608638763427734, + "logps/chosen": -219.40032958984375, + "logps/rejected": -147.62220764160156, + "loss": 0.6084, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.551849365234375, + "rewards/margins": -0.8520889282226562, + "rewards/rejected": 5.403938293457031, + "step": 2917 + }, + { + "epoch": 0.47, + "learning_rate": 8.919903636103951e-06, + "logits/chosen": -1.2215341329574585, + "logits/rejected": -1.2178879976272583, + "logps/chosen": -130.63763427734375, + "logps/rejected": -40.60121154785156, + "loss": 0.0179, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.7503342628479, + "rewards/margins": 5.214400768280029, + "rewards/rejected": 0.5359336733818054, + "step": 2918 + }, + { + "epoch": 0.47, + "learning_rate": 8.919087631342868e-06, + "logits/chosen": -1.1915838718414307, + "logits/rejected": -1.1059515476226807, + "logps/chosen": -127.90907287597656, + "logps/rejected": -92.88650512695312, + "loss": 0.0568, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.401721477508545, + "rewards/margins": 2.721855401992798, + "rewards/rejected": 3.679866075515747, + "step": 2919 + }, + { + "epoch": 0.47, + "learning_rate": 8.918271355809913e-06, + "logits/chosen": -1.062922716140747, + "logits/rejected": -1.166961669921875, + "logps/chosen": -412.189208984375, + "logps/rejected": -147.73672485351562, + "loss": 0.9853, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.579644680023193, + "rewards/margins": 0.3271923065185547, + "rewards/rejected": 6.252452373504639, + "step": 2920 + }, + { + "epoch": 0.47, + "learning_rate": 8.917454809561482e-06, + "logits/chosen": -1.257836103439331, + "logits/rejected": -1.2009814977645874, + "logps/chosen": -50.50242614746094, + "logps/rejected": -17.168209075927734, + "loss": 0.3519, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9162604808807373, + "rewards/margins": 2.1756293773651123, + "rewards/rejected": 0.740631103515625, + "step": 2921 + }, + { + "epoch": 0.47, + "learning_rate": 8.91663799265399e-06, + "logits/chosen": -1.2478139400482178, + "logits/rejected": -1.3597559928894043, + "logps/chosen": -70.81324768066406, + "logps/rejected": -100.90899658203125, + "loss": 1.8777, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5045669078826904, + "rewards/margins": -3.7113559246063232, + "rewards/rejected": 7.215922832489014, + "step": 2922 + }, + { + "epoch": 0.47, + "learning_rate": 8.915820905143873e-06, + "logits/chosen": -1.1306577920913696, + "logits/rejected": -1.1245440244674683, + "logps/chosen": -133.35739135742188, + "logps/rejected": -133.14886474609375, + "loss": 1.1178, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9620391726493835, + "rewards/margins": -1.622288465499878, + "rewards/rejected": 2.5843276977539062, + "step": 2923 + }, + { + "epoch": 0.47, + "learning_rate": 8.915003547087585e-06, + "logits/chosen": -0.9185686707496643, + "logits/rejected": -1.09084951877594, + "logps/chosen": -28.533126831054688, + "logps/rejected": -49.59029006958008, + "loss": 1.2995, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.970706582069397, + "rewards/margins": -2.5198936462402344, + "rewards/rejected": 3.490600347518921, + "step": 2924 + }, + { + "epoch": 0.47, + "learning_rate": 8.914185918541594e-06, + "logits/chosen": -1.1155427694320679, + "logits/rejected": -1.0031628608703613, + "logps/chosen": -258.10845947265625, + "logps/rejected": -75.50895690917969, + "loss": 0.1496, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8697540760040283, + "rewards/margins": 3.558976888656616, + "rewards/rejected": 0.3107772767543793, + "step": 2925 + }, + { + "epoch": 0.47, + "learning_rate": 8.913368019562391e-06, + "logits/chosen": -1.34454345703125, + "logits/rejected": -1.277600884437561, + "logps/chosen": -122.84093475341797, + "logps/rejected": -94.83702850341797, + "loss": 0.6963, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.530666351318359, + "rewards/margins": -0.6482729911804199, + "rewards/rejected": 7.178939342498779, + "step": 2926 + }, + { + "epoch": 0.48, + "learning_rate": 8.91254985020649e-06, + "logits/chosen": -0.9778342843055725, + "logits/rejected": -0.8986379504203796, + "logps/chosen": -91.08261108398438, + "logps/rejected": -43.95466232299805, + "loss": 0.1573, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.604478597640991, + "rewards/margins": 2.1070644855499268, + "rewards/rejected": 0.4974140226840973, + "step": 2927 + }, + { + "epoch": 0.48, + "learning_rate": 8.911731410530413e-06, + "logits/chosen": -0.7752100229263306, + "logits/rejected": -0.7965887188911438, + "logps/chosen": -5.993288040161133, + "logps/rejected": -25.31894302368164, + "loss": 0.5116, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7626908421516418, + "rewards/margins": -0.05123203992843628, + "rewards/rejected": 0.8139228820800781, + "step": 2928 + }, + { + "epoch": 0.48, + "learning_rate": 8.910912700590711e-06, + "logits/chosen": -1.018187403678894, + "logits/rejected": -0.9029567837715149, + "logps/chosen": -99.90388488769531, + "logps/rejected": -17.984018325805664, + "loss": 0.6781, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9175933599472046, + "rewards/margins": 1.7720649242401123, + "rewards/rejected": 0.1455284208059311, + "step": 2929 + }, + { + "epoch": 0.48, + "learning_rate": 8.910093720443945e-06, + "logits/chosen": -0.4915396571159363, + "logits/rejected": -0.5199034214019775, + "logps/chosen": -117.53205871582031, + "logps/rejected": -70.21939849853516, + "loss": 0.2064, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.929521322250366, + "rewards/margins": 1.0838509798049927, + "rewards/rejected": 1.8456703424453735, + "step": 2930 + }, + { + "epoch": 0.48, + "learning_rate": 8.909274470146698e-06, + "logits/chosen": -1.2012708187103271, + "logits/rejected": -1.1710294485092163, + "logps/chosen": -96.09244537353516, + "logps/rejected": -88.2865982055664, + "loss": 0.0799, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.707210540771484, + "rewards/margins": 1.7538909912109375, + "rewards/rejected": 3.953319549560547, + "step": 2931 + }, + { + "epoch": 0.48, + "learning_rate": 8.908454949755579e-06, + "logits/chosen": -1.256797194480896, + "logits/rejected": -1.2657761573791504, + "logps/chosen": -132.86233520507812, + "logps/rejected": -104.27328491210938, + "loss": 0.3804, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.668984889984131, + "rewards/margins": -0.09990692138671875, + "rewards/rejected": 5.76889181137085, + "step": 2932 + }, + { + "epoch": 0.48, + "learning_rate": 8.907635159327204e-06, + "logits/chosen": -0.7645596861839294, + "logits/rejected": -0.7619702816009521, + "logps/chosen": -3.8347933292388916, + "logps/rejected": -15.195005416870117, + "loss": 0.3559, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5506635904312134, + "rewards/margins": -0.028527796268463135, + "rewards/rejected": 0.5791913866996765, + "step": 2933 + }, + { + "epoch": 0.48, + "learning_rate": 8.906815098918214e-06, + "logits/chosen": -0.9055977463722229, + "logits/rejected": -1.09286367893219, + "logps/chosen": -62.61716842651367, + "logps/rejected": -88.01290130615234, + "loss": 3.0654, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.145151138305664, + "rewards/margins": -5.342954158782959, + "rewards/rejected": 6.488105297088623, + "step": 2934 + }, + { + "epoch": 0.48, + "learning_rate": 8.905994768585266e-06, + "logits/chosen": -1.3255224227905273, + "logits/rejected": -1.2937180995941162, + "logps/chosen": -59.868751525878906, + "logps/rejected": -49.99246597290039, + "loss": 1.1134, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9442275762557983, + "rewards/margins": 0.46442604064941406, + "rewards/rejected": 1.4798015356063843, + "step": 2935 + }, + { + "epoch": 0.48, + "learning_rate": 8.905174168385039e-06, + "logits/chosen": -1.1345535516738892, + "logits/rejected": -1.0363067388534546, + "logps/chosen": -44.455039978027344, + "logps/rejected": -68.43357849121094, + "loss": 0.3846, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.21606969833374, + "rewards/margins": 1.1316168308258057, + "rewards/rejected": 3.0844528675079346, + "step": 2936 + }, + { + "epoch": 0.48, + "learning_rate": 8.904353298374229e-06, + "logits/chosen": -1.3136439323425293, + "logits/rejected": -1.4034700393676758, + "logps/chosen": -256.51739501953125, + "logps/rejected": -92.88088989257812, + "loss": 0.0626, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.517480373382568, + "rewards/margins": 2.017306327819824, + "rewards/rejected": 3.500174045562744, + "step": 2937 + }, + { + "epoch": 0.48, + "learning_rate": 8.903532158609548e-06, + "logits/chosen": -1.2311513423919678, + "logits/rejected": -1.1773781776428223, + "logps/chosen": -87.98777770996094, + "logps/rejected": -73.64566040039062, + "loss": 0.5263, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7639694213867188, + "rewards/margins": -0.5193405151367188, + "rewards/rejected": 3.2833099365234375, + "step": 2938 + }, + { + "epoch": 0.48, + "learning_rate": 8.902710749147732e-06, + "logits/chosen": -1.3049737215042114, + "logits/rejected": -0.890466034412384, + "logps/chosen": -187.8240966796875, + "logps/rejected": -15.813497543334961, + "loss": 0.7595, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.88671875, + "rewards/margins": 6.084644794464111, + "rewards/rejected": 0.8020738959312439, + "step": 2939 + }, + { + "epoch": 0.48, + "learning_rate": 8.90188907004553e-06, + "logits/chosen": -0.874139130115509, + "logits/rejected": -0.720678985118866, + "logps/chosen": -96.64906311035156, + "logps/rejected": -52.60695266723633, + "loss": 0.0332, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.005931377410889, + "rewards/margins": 3.4791784286499023, + "rewards/rejected": 1.5267528295516968, + "step": 2940 + }, + { + "epoch": 0.48, + "learning_rate": 8.901067121359713e-06, + "logits/chosen": -0.9308049082756042, + "logits/rejected": -0.904573917388916, + "logps/chosen": -79.98100280761719, + "logps/rejected": -89.72970581054688, + "loss": 0.4142, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6670258045196533, + "rewards/margins": 0.3981132507324219, + "rewards/rejected": 2.2689125537872314, + "step": 2941 + }, + { + "epoch": 0.48, + "learning_rate": 8.900244903147071e-06, + "logits/chosen": -1.4551496505737305, + "logits/rejected": -1.4563299417495728, + "logps/chosen": -120.03363037109375, + "logps/rejected": -102.17586517333984, + "loss": 1.1151, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.363583564758301, + "rewards/margins": -2.0872349739074707, + "rewards/rejected": 7.4508185386657715, + "step": 2942 + }, + { + "epoch": 0.48, + "learning_rate": 8.899422415464409e-06, + "logits/chosen": -0.9434458613395691, + "logits/rejected": -1.0585732460021973, + "logps/chosen": -115.99803924560547, + "logps/rejected": -69.68321228027344, + "loss": 0.7283, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.314621686935425, + "rewards/margins": 0.9290335178375244, + "rewards/rejected": 2.3855881690979004, + "step": 2943 + }, + { + "epoch": 0.48, + "learning_rate": 8.898599658368556e-06, + "logits/chosen": -1.0891516208648682, + "logits/rejected": -1.019158959388733, + "logps/chosen": -86.58216857910156, + "logps/rejected": -33.38287353515625, + "loss": 0.8621, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5813286304473877, + "rewards/margins": 1.5529828071594238, + "rewards/rejected": 0.028345871716737747, + "step": 2944 + }, + { + "epoch": 0.48, + "learning_rate": 8.897776631916356e-06, + "logits/chosen": -0.8856082558631897, + "logits/rejected": -0.9221553802490234, + "logps/chosen": -93.13410949707031, + "logps/rejected": -117.12952423095703, + "loss": 1.0251, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.710932970046997, + "rewards/margins": -1.672574758529663, + "rewards/rejected": 4.38350772857666, + "step": 2945 + }, + { + "epoch": 0.48, + "learning_rate": 8.89695333616467e-06, + "logits/chosen": -0.9304258823394775, + "logits/rejected": -0.8300792574882507, + "logps/chosen": -178.501220703125, + "logps/rejected": -103.58079528808594, + "loss": 0.0224, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.774450778961182, + "rewards/margins": 3.242872714996338, + "rewards/rejected": 3.5315780639648438, + "step": 2946 + }, + { + "epoch": 0.48, + "learning_rate": 8.896129771170385e-06, + "logits/chosen": -1.0009043216705322, + "logits/rejected": -0.9572518467903137, + "logps/chosen": -75.32911682128906, + "logps/rejected": -63.49618911743164, + "loss": 0.6263, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8270813226699829, + "rewards/margins": -0.9159221649169922, + "rewards/rejected": 1.743003487586975, + "step": 2947 + }, + { + "epoch": 0.48, + "learning_rate": 8.895305936990397e-06, + "logits/chosen": -1.1432472467422485, + "logits/rejected": -1.3046103715896606, + "logps/chosen": -134.22042846679688, + "logps/rejected": -127.05989074707031, + "loss": 1.6489, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.798031806945801, + "rewards/margins": -3.249422073364258, + "rewards/rejected": 9.047453880310059, + "step": 2948 + }, + { + "epoch": 0.48, + "learning_rate": 8.894481833681625e-06, + "logits/chosen": -0.9571754932403564, + "logits/rejected": -1.0091019868850708, + "logps/chosen": -50.341880798339844, + "logps/rejected": -81.92912292480469, + "loss": 0.2177, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.3968658447265625, + "rewards/margins": 1.257683515548706, + "rewards/rejected": 3.1391823291778564, + "step": 2949 + }, + { + "epoch": 0.48, + "learning_rate": 8.893657461301009e-06, + "logits/chosen": -0.8744904398918152, + "logits/rejected": -0.9155749678611755, + "logps/chosen": -95.11723327636719, + "logps/rejected": -112.04546356201172, + "loss": 1.1386, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8067810535430908, + "rewards/margins": -0.13762128353118896, + "rewards/rejected": 1.9444023370742798, + "step": 2950 + }, + { + "epoch": 0.48, + "learning_rate": 8.892832819905506e-06, + "logits/chosen": -1.0906211137771606, + "logits/rejected": -1.1105417013168335, + "logps/chosen": -46.234066009521484, + "logps/rejected": -54.38177490234375, + "loss": 0.8897, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2911083698272705, + "rewards/margins": 1.6511669158935547, + "rewards/rejected": 0.639941394329071, + "step": 2951 + }, + { + "epoch": 0.48, + "learning_rate": 8.892007909552088e-06, + "logits/chosen": -1.0974940061569214, + "logits/rejected": -0.9920036792755127, + "logps/chosen": -34.47907638549805, + "logps/rejected": -6.64113712310791, + "loss": 0.1864, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.115208148956299, + "rewards/margins": 2.2014360427856445, + "rewards/rejected": 0.9137721061706543, + "step": 2952 + }, + { + "epoch": 0.48, + "learning_rate": 8.89118273029775e-06, + "logits/chosen": -1.092518925666809, + "logits/rejected": -1.0383920669555664, + "logps/chosen": -156.4777374267578, + "logps/rejected": -79.45774841308594, + "loss": 1.3398, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.860404968261719, + "rewards/margins": -0.08540821075439453, + "rewards/rejected": 4.945813179016113, + "step": 2953 + }, + { + "epoch": 0.48, + "learning_rate": 8.890357282199504e-06, + "logits/chosen": -0.9503058195114136, + "logits/rejected": -0.7494861483573914, + "logps/chosen": -40.34069061279297, + "logps/rejected": -43.75530242919922, + "loss": 0.1281, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.553415060043335, + "rewards/margins": 1.274593472480774, + "rewards/rejected": 1.278821587562561, + "step": 2954 + }, + { + "epoch": 0.48, + "learning_rate": 8.889531565314382e-06, + "logits/chosen": -1.005096673965454, + "logits/rejected": -1.028489112854004, + "logps/chosen": -148.74203491210938, + "logps/rejected": -62.881492614746094, + "loss": 0.3026, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.895031690597534, + "rewards/margins": 1.358816385269165, + "rewards/rejected": 2.536215305328369, + "step": 2955 + }, + { + "epoch": 0.48, + "learning_rate": 8.88870557969943e-06, + "logits/chosen": -1.2654049396514893, + "logits/rejected": -1.2654049396514893, + "logps/chosen": -86.3128433227539, + "logps/rejected": -86.3128433227539, + "loss": 0.4211, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.393611192703247, + "rewards/margins": 0.0, + "rewards/rejected": 3.393611192703247, + "step": 2956 + }, + { + "epoch": 0.48, + "learning_rate": 8.887879325411718e-06, + "logits/chosen": -0.9577073454856873, + "logits/rejected": -1.1245659589767456, + "logps/chosen": -81.28746032714844, + "logps/rejected": -121.8615951538086, + "loss": 2.0489, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1495461463928223, + "rewards/margins": -4.035261631011963, + "rewards/rejected": 7.184807777404785, + "step": 2957 + }, + { + "epoch": 0.48, + "learning_rate": 8.887052802508334e-06, + "logits/chosen": -1.0766979455947876, + "logits/rejected": -1.0283821821212769, + "logps/chosen": -155.32408142089844, + "logps/rejected": -62.28679656982422, + "loss": 1.1545, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3812774419784546, + "rewards/margins": -1.4719475507736206, + "rewards/rejected": 2.853224992752075, + "step": 2958 + }, + { + "epoch": 0.48, + "learning_rate": 8.886226011046378e-06, + "logits/chosen": -0.8776671886444092, + "logits/rejected": -0.6897866129875183, + "logps/chosen": -111.31716918945312, + "logps/rejected": -32.05352020263672, + "loss": 0.1339, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.296765327453613, + "rewards/margins": 1.887965440750122, + "rewards/rejected": 2.408799886703491, + "step": 2959 + }, + { + "epoch": 0.48, + "learning_rate": 8.885398951082979e-06, + "logits/chosen": -1.096103549003601, + "logits/rejected": -0.5577422380447388, + "logps/chosen": -170.44464111328125, + "logps/rejected": -34.91447067260742, + "loss": 0.1152, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.608107089996338, + "rewards/margins": 4.414191722869873, + "rewards/rejected": 1.1939152479171753, + "step": 2960 + }, + { + "epoch": 0.48, + "learning_rate": 8.884571622675275e-06, + "logits/chosen": -1.0620346069335938, + "logits/rejected": -1.05168879032135, + "logps/chosen": -65.22119140625, + "logps/rejected": -124.02330780029297, + "loss": 0.2242, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9734848737716675, + "rewards/margins": 0.7666115760803223, + "rewards/rejected": 1.2068732976913452, + "step": 2961 + }, + { + "epoch": 0.48, + "learning_rate": 8.883744025880429e-06, + "logits/chosen": -1.15735924243927, + "logits/rejected": -1.0980206727981567, + "logps/chosen": -85.59814453125, + "logps/rejected": -43.35575485229492, + "loss": 0.6243, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0758370161056519, + "rewards/margins": -0.8750041723251343, + "rewards/rejected": 1.9508411884307861, + "step": 2962 + }, + { + "epoch": 0.48, + "learning_rate": 8.882916160755618e-06, + "logits/chosen": -1.333379864692688, + "logits/rejected": -1.3372381925582886, + "logps/chosen": -113.25516510009766, + "logps/rejected": -100.69197845458984, + "loss": 1.9729, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.304698944091797, + "rewards/margins": -3.8635268211364746, + "rewards/rejected": 6.1682257652282715, + "step": 2963 + }, + { + "epoch": 0.48, + "learning_rate": 8.882088027358042e-06, + "logits/chosen": -0.8868480920791626, + "logits/rejected": -0.8825287222862244, + "logps/chosen": -0.8728103637695312, + "logps/rejected": -6.443537712097168, + "loss": 1.0338, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36149007081985474, + "rewards/margins": 0.07373028993606567, + "rewards/rejected": 0.28775978088378906, + "step": 2964 + }, + { + "epoch": 0.48, + "learning_rate": 8.881259625744916e-06, + "logits/chosen": -1.4887404441833496, + "logits/rejected": -1.1618927717208862, + "logps/chosen": -42.98121643066406, + "logps/rejected": -94.74773406982422, + "loss": 1.0641, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9101066589355469, + "rewards/margins": -1.9539048671722412, + "rewards/rejected": 3.864011526107788, + "step": 2965 + }, + { + "epoch": 0.48, + "learning_rate": 8.880430955973474e-06, + "logits/chosen": -0.9792781472206116, + "logits/rejected": -0.7569196224212646, + "logps/chosen": -97.9641342163086, + "logps/rejected": -46.41304016113281, + "loss": 0.5205, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.230741024017334, + "rewards/margins": 1.9655241966247559, + "rewards/rejected": 2.265216827392578, + "step": 2966 + }, + { + "epoch": 0.48, + "learning_rate": 8.87960201810097e-06, + "logits/chosen": -1.0525336265563965, + "logits/rejected": -1.0266941785812378, + "logps/chosen": -69.46161651611328, + "logps/rejected": -39.401893615722656, + "loss": 1.0972, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7603371143341064, + "rewards/margins": 0.8454990386962891, + "rewards/rejected": 1.9148380756378174, + "step": 2967 + }, + { + "epoch": 0.48, + "learning_rate": 8.878772812184676e-06, + "logits/chosen": -1.0149270296096802, + "logits/rejected": -1.054585576057434, + "logps/chosen": -79.89934539794922, + "logps/rejected": -55.288822174072266, + "loss": 1.2673, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6124504208564758, + "rewards/margins": -1.628890037536621, + "rewards/rejected": 2.241340398788452, + "step": 2968 + }, + { + "epoch": 0.48, + "learning_rate": 8.87794333828188e-06, + "logits/chosen": -1.084073781967163, + "logits/rejected": -1.1032432317733765, + "logps/chosen": -62.73417282104492, + "logps/rejected": -62.769615173339844, + "loss": 0.6493, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0867847204208374, + "rewards/margins": -0.9676412343978882, + "rewards/rejected": 2.0544259548187256, + "step": 2969 + }, + { + "epoch": 0.48, + "learning_rate": 8.877113596449895e-06, + "logits/chosen": -1.039150357246399, + "logits/rejected": -0.9777925610542297, + "logps/chosen": -64.44773864746094, + "logps/rejected": -7.329958915710449, + "loss": 0.1094, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.74053955078125, + "rewards/margins": 2.227501153945923, + "rewards/rejected": 0.5130383372306824, + "step": 2970 + }, + { + "epoch": 0.48, + "learning_rate": 8.876283586746047e-06, + "logits/chosen": -1.2111905813217163, + "logits/rejected": -1.1701908111572266, + "logps/chosen": -98.09330749511719, + "logps/rejected": -62.18743896484375, + "loss": 2.046, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7576316595077515, + "rewards/margins": -0.19539415836334229, + "rewards/rejected": 1.9530258178710938, + "step": 2971 + }, + { + "epoch": 0.48, + "learning_rate": 8.875453309227678e-06, + "logits/chosen": -0.8178099393844604, + "logits/rejected": -0.838036298751831, + "logps/chosen": -33.6470832824707, + "logps/rejected": -70.85816955566406, + "loss": 1.042, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8687061667442322, + "rewards/margins": -0.4277568459510803, + "rewards/rejected": 1.2964630126953125, + "step": 2972 + }, + { + "epoch": 0.48, + "learning_rate": 8.874622763952156e-06, + "logits/chosen": -0.8457831740379333, + "logits/rejected": -0.8457831740379333, + "logps/chosen": -40.65590286254883, + "logps/rejected": -40.65590286254883, + "loss": 0.3938, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5698963403701782, + "rewards/margins": 0.0, + "rewards/rejected": 1.5698963403701782, + "step": 2973 + }, + { + "epoch": 0.48, + "learning_rate": 8.873791950976865e-06, + "logits/chosen": -0.7201036810874939, + "logits/rejected": -0.6463963985443115, + "logps/chosen": -50.01197814941406, + "logps/rejected": -63.24919128417969, + "loss": 0.3886, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.739558458328247, + "rewards/margins": 1.0548667907714844, + "rewards/rejected": 1.6846916675567627, + "step": 2974 + }, + { + "epoch": 0.48, + "learning_rate": 8.872960870359203e-06, + "logits/chosen": -0.8991904854774475, + "logits/rejected": -0.8917701244354248, + "logps/chosen": -71.6046142578125, + "logps/rejected": -53.67599868774414, + "loss": 0.0905, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4011924266815186, + "rewards/margins": 1.7581202983856201, + "rewards/rejected": 1.6430721282958984, + "step": 2975 + }, + { + "epoch": 0.48, + "learning_rate": 8.872129522156591e-06, + "logits/chosen": -1.0289676189422607, + "logits/rejected": -1.2048845291137695, + "logps/chosen": -148.73194885253906, + "logps/rejected": -138.88145446777344, + "loss": 1.4676, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.635449409484863, + "rewards/margins": 1.1858534812927246, + "rewards/rejected": 5.449595928192139, + "step": 2976 + }, + { + "epoch": 0.48, + "learning_rate": 8.871297906426468e-06, + "logits/chosen": -0.9195768237113953, + "logits/rejected": -0.9935426712036133, + "logps/chosen": -51.960845947265625, + "logps/rejected": -93.16060638427734, + "loss": 0.2999, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2959625720977783, + "rewards/margins": 0.25511693954467773, + "rewards/rejected": 2.0408456325531006, + "step": 2977 + }, + { + "epoch": 0.48, + "learning_rate": 8.87046602322629e-06, + "logits/chosen": -1.1063226461410522, + "logits/rejected": -1.1147502660751343, + "logps/chosen": -67.34861755371094, + "logps/rejected": -66.0655288696289, + "loss": 1.0189, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.310577392578125, + "rewards/margins": -0.9758706092834473, + "rewards/rejected": 2.2864480018615723, + "step": 2978 + }, + { + "epoch": 0.48, + "learning_rate": 8.869633872613533e-06, + "logits/chosen": -1.0470730066299438, + "logits/rejected": -1.0460542440414429, + "logps/chosen": -27.135902404785156, + "logps/rejected": -46.502227783203125, + "loss": 0.8827, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.294607639312744, + "rewards/margins": -0.07882833480834961, + "rewards/rejected": 3.3734359741210938, + "step": 2979 + }, + { + "epoch": 0.48, + "learning_rate": 8.868801454645688e-06, + "logits/chosen": -0.9433117508888245, + "logits/rejected": -0.9365698099136353, + "logps/chosen": -1.5889381170272827, + "logps/rejected": -9.454233169555664, + "loss": 2.9026, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2714803218841553, + "rewards/margins": 0.20788435637950897, + "rewards/rejected": 0.0635959655046463, + "step": 2980 + }, + { + "epoch": 0.48, + "learning_rate": 8.867968769380271e-06, + "logits/chosen": -0.7997124791145325, + "logits/rejected": -0.7349726557731628, + "logps/chosen": -48.015419006347656, + "logps/rejected": -17.675676345825195, + "loss": 1.1243, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8982223272323608, + "rewards/margins": 1.1261003017425537, + "rewards/rejected": 0.7721220254898071, + "step": 2981 + }, + { + "epoch": 0.48, + "learning_rate": 8.867135816874811e-06, + "logits/chosen": -1.3269621133804321, + "logits/rejected": -1.3269621133804321, + "logps/chosen": -128.6822509765625, + "logps/rejected": -128.6822509765625, + "loss": 1.1319, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.858615398406982, + "rewards/margins": 0.0, + "rewards/rejected": 4.858615398406982, + "step": 2982 + }, + { + "epoch": 0.48, + "learning_rate": 8.866302597186858e-06, + "logits/chosen": -0.978873610496521, + "logits/rejected": -1.2632038593292236, + "logps/chosen": -93.31829833984375, + "logps/rejected": -35.47311019897461, + "loss": 0.0805, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1319146156311035, + "rewards/margins": 1.7887623310089111, + "rewards/rejected": 0.34315225481987, + "step": 2983 + }, + { + "epoch": 0.48, + "learning_rate": 8.865469110373979e-06, + "logits/chosen": -0.9647057056427002, + "logits/rejected": -0.7509791851043701, + "logps/chosen": -55.42980194091797, + "logps/rejected": -25.275516510009766, + "loss": 0.3556, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.005894422531128, + "rewards/margins": 1.7876530885696411, + "rewards/rejected": 0.21824131906032562, + "step": 2984 + }, + { + "epoch": 0.48, + "learning_rate": 8.864635356493759e-06, + "logits/chosen": -0.9901809692382812, + "logits/rejected": -0.9194058179855347, + "logps/chosen": -50.07973861694336, + "logps/rejected": -31.01511001586914, + "loss": 0.3922, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4831669330596924, + "rewards/margins": 0.15903019905090332, + "rewards/rejected": 2.324136734008789, + "step": 2985 + }, + { + "epoch": 0.48, + "learning_rate": 8.863801335603802e-06, + "logits/chosen": -0.7051461935043335, + "logits/rejected": -0.6703195571899414, + "logps/chosen": -44.98707962036133, + "logps/rejected": -45.71178436279297, + "loss": 1.2592, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.068171262741089, + "rewards/margins": -1.0823583602905273, + "rewards/rejected": 3.150529623031616, + "step": 2986 + }, + { + "epoch": 0.48, + "learning_rate": 8.862967047761734e-06, + "logits/chosen": -1.050702452659607, + "logits/rejected": -1.0779826641082764, + "logps/chosen": -47.64237976074219, + "logps/rejected": -47.38823318481445, + "loss": 0.4148, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.782915472984314, + "rewards/margins": -0.12006950378417969, + "rewards/rejected": 1.9029849767684937, + "step": 2987 + }, + { + "epoch": 0.48, + "learning_rate": 8.862132493025195e-06, + "logits/chosen": -0.9391381740570068, + "logits/rejected": -1.086121678352356, + "logps/chosen": -50.01292419433594, + "logps/rejected": -172.7759246826172, + "loss": 2.8432, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.570500135421753, + "rewards/margins": -4.996420860290527, + "rewards/rejected": 7.566920757293701, + "step": 2988 + }, + { + "epoch": 0.49, + "learning_rate": 8.861297671451845e-06, + "logits/chosen": -1.0766630172729492, + "logits/rejected": -1.0981988906860352, + "logps/chosen": -58.81641387939453, + "logps/rejected": -51.21592330932617, + "loss": 0.8687, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5766037702560425, + "rewards/margins": -0.8160709142684937, + "rewards/rejected": 2.392674684524536, + "step": 2989 + }, + { + "epoch": 0.49, + "learning_rate": 8.86046258309936e-06, + "logits/chosen": -0.885152280330658, + "logits/rejected": -0.8974677920341492, + "logps/chosen": -93.70183563232422, + "logps/rejected": -141.1999053955078, + "loss": 1.3773, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3289635181427, + "rewards/margins": -2.037856340408325, + "rewards/rejected": 5.366819858551025, + "step": 2990 + }, + { + "epoch": 0.49, + "learning_rate": 8.85962722802544e-06, + "logits/chosen": -1.0523781776428223, + "logits/rejected": -0.9752618670463562, + "logps/chosen": -78.53640747070312, + "logps/rejected": -108.28938293457031, + "loss": 0.4315, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.331075429916382, + "rewards/margins": -0.037395477294921875, + "rewards/rejected": 3.3684709072113037, + "step": 2991 + }, + { + "epoch": 0.49, + "learning_rate": 8.858791606287797e-06, + "logits/chosen": -1.0903657674789429, + "logits/rejected": -1.0795865058898926, + "logps/chosen": -72.29454803466797, + "logps/rejected": -133.16787719726562, + "loss": 0.2566, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3831291198730469, + "rewards/margins": 0.8395652770996094, + "rewards/rejected": 0.5435638427734375, + "step": 2992 + }, + { + "epoch": 0.49, + "learning_rate": 8.857955717944168e-06, + "logits/chosen": -1.001865029335022, + "logits/rejected": -1.034478783607483, + "logps/chosen": -98.184326171875, + "logps/rejected": -86.48562622070312, + "loss": 1.8236, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.943164348602295, + "rewards/margins": -0.7958722114562988, + "rewards/rejected": 6.739036560058594, + "step": 2993 + }, + { + "epoch": 0.49, + "learning_rate": 8.857119563052301e-06, + "logits/chosen": -1.130952000617981, + "logits/rejected": -1.1092650890350342, + "logps/chosen": -76.0953369140625, + "logps/rejected": -74.13322448730469, + "loss": 0.2924, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.210172414779663, + "rewards/margins": 0.7526634931564331, + "rewards/rejected": 1.45750892162323, + "step": 2994 + }, + { + "epoch": 0.49, + "learning_rate": 8.85628314166997e-06, + "logits/chosen": -0.9254301190376282, + "logits/rejected": -1.0059465169906616, + "logps/chosen": -87.1453628540039, + "logps/rejected": -56.17011642456055, + "loss": 2.2317, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2685493528842926, + "rewards/margins": -3.3790225982666016, + "rewards/rejected": 3.6475720405578613, + "step": 2995 + }, + { + "epoch": 0.49, + "learning_rate": 8.855446453854964e-06, + "logits/chosen": -0.9817104339599609, + "logits/rejected": -0.922363817691803, + "logps/chosen": -46.82487869262695, + "logps/rejected": -62.7319221496582, + "loss": 0.2844, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.075413227081299, + "rewards/margins": 0.3913620710372925, + "rewards/rejected": 1.6840511560440063, + "step": 2996 + }, + { + "epoch": 0.49, + "learning_rate": 8.854609499665087e-06, + "logits/chosen": -1.3268884420394897, + "logits/rejected": -1.2494183778762817, + "logps/chosen": -89.95767974853516, + "logps/rejected": -73.94291687011719, + "loss": 2.5616, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2879083156585693, + "rewards/margins": -3.543044328689575, + "rewards/rejected": 5.8309526443481445, + "step": 2997 + }, + { + "epoch": 0.49, + "learning_rate": 8.853772279158166e-06, + "logits/chosen": -0.8841682076454163, + "logits/rejected": -0.8528299927711487, + "logps/chosen": -22.91014862060547, + "logps/rejected": -39.914127349853516, + "loss": 0.397, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5666797757148743, + "rewards/margins": -0.1329193115234375, + "rewards/rejected": 0.6995990872383118, + "step": 2998 + }, + { + "epoch": 0.49, + "learning_rate": 8.852934792392045e-06, + "logits/chosen": -1.0368107557296753, + "logits/rejected": -0.9695829749107361, + "logps/chosen": -63.486968994140625, + "logps/rejected": -54.135589599609375, + "loss": 0.4261, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.665553331375122, + "rewards/margins": 1.191270351409912, + "rewards/rejected": 2.47428297996521, + "step": 2999 + }, + { + "epoch": 0.49, + "learning_rate": 8.852097039424589e-06, + "logits/chosen": -0.8282901644706726, + "logits/rejected": -0.626552402973175, + "logps/chosen": -58.799407958984375, + "logps/rejected": -27.047161102294922, + "loss": 0.2764, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5426461696624756, + "rewards/margins": 2.5315918922424316, + "rewards/rejected": 1.011054277420044, + "step": 3000 + }, + { + "epoch": 0.49, + "learning_rate": 8.851259020313674e-06, + "logits/chosen": -1.6039444208145142, + "logits/rejected": -1.4486180543899536, + "logps/chosen": -114.22940063476562, + "logps/rejected": -55.40373229980469, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.795575141906738, + "rewards/margins": 4.191403388977051, + "rewards/rejected": 3.6041717529296875, + "step": 3001 + }, + { + "epoch": 0.49, + "learning_rate": 8.850420735117202e-06, + "logits/chosen": -0.8347593545913696, + "logits/rejected": -1.0652562379837036, + "logps/chosen": -60.199310302734375, + "logps/rejected": -41.69712829589844, + "loss": 1.0024, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7702263593673706, + "rewards/margins": -1.4437836408615112, + "rewards/rejected": 3.214010000228882, + "step": 3002 + }, + { + "epoch": 0.49, + "learning_rate": 8.84958218389309e-06, + "logits/chosen": -0.6660688519477844, + "logits/rejected": -0.6678577065467834, + "logps/chosen": -2.0316755771636963, + "logps/rejected": -4.858249187469482, + "loss": 0.3721, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2314785271883011, + "rewards/margins": -0.09585906565189362, + "rewards/rejected": 0.3273375928401947, + "step": 3003 + }, + { + "epoch": 0.49, + "learning_rate": 8.848743366699275e-06, + "logits/chosen": -0.6526154279708862, + "logits/rejected": -0.6526154279708862, + "logps/chosen": -15.196013450622559, + "logps/rejected": -15.196013450622559, + "loss": 1.4636, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.26135683059692383, + "rewards/margins": 0.0, + "rewards/rejected": 0.26135683059692383, + "step": 3004 + }, + { + "epoch": 0.49, + "learning_rate": 8.847904283593712e-06, + "logits/chosen": -0.859060525894165, + "logits/rejected": -0.8266457915306091, + "logps/chosen": -74.04879760742188, + "logps/rejected": -126.92987060546875, + "loss": 0.9899, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2248162031173706, + "rewards/margins": -0.7808936834335327, + "rewards/rejected": 2.0057098865509033, + "step": 3005 + }, + { + "epoch": 0.49, + "learning_rate": 8.84706493463437e-06, + "logits/chosen": -1.1960362195968628, + "logits/rejected": -1.2320061922073364, + "logps/chosen": -126.1302490234375, + "logps/rejected": -203.07467651367188, + "loss": 0.5477, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.835568428039551, + "rewards/margins": -0.6867184638977051, + "rewards/rejected": 6.522286891937256, + "step": 3006 + }, + { + "epoch": 0.49, + "learning_rate": 8.846225319879243e-06, + "logits/chosen": -0.9001498818397522, + "logits/rejected": -0.599003255367279, + "logps/chosen": -119.56825256347656, + "logps/rejected": -45.57037353515625, + "loss": 0.0989, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.391673564910889, + "rewards/margins": 3.001084327697754, + "rewards/rejected": 3.3905892372131348, + "step": 3007 + }, + { + "epoch": 0.49, + "learning_rate": 8.845385439386338e-06, + "logits/chosen": -1.0603324174880981, + "logits/rejected": -0.69066321849823, + "logps/chosen": -103.17604064941406, + "logps/rejected": -91.65414428710938, + "loss": 0.478, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.893035888671875, + "rewards/margins": -0.44370126724243164, + "rewards/rejected": 2.3367371559143066, + "step": 3008 + }, + { + "epoch": 0.49, + "learning_rate": 8.844545293213687e-06, + "logits/chosen": -0.8919684886932373, + "logits/rejected": -0.8919684886932373, + "logps/chosen": -79.41967010498047, + "logps/rejected": -79.41967010498047, + "loss": 0.4757, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8759636878967285, + "rewards/margins": 0.0, + "rewards/rejected": 2.8759636878967285, + "step": 3009 + }, + { + "epoch": 0.49, + "learning_rate": 8.843704881419333e-06, + "logits/chosen": -1.3774735927581787, + "logits/rejected": -1.3329510688781738, + "logps/chosen": -92.67431640625, + "logps/rejected": -42.882240295410156, + "loss": 0.1102, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6946747303009033, + "rewards/margins": 1.6264362335205078, + "rewards/rejected": 1.0682384967803955, + "step": 3010 + }, + { + "epoch": 0.49, + "learning_rate": 8.842864204061341e-06, + "logits/chosen": -0.9998778700828552, + "logits/rejected": -0.8164469599723816, + "logps/chosen": -56.776084899902344, + "logps/rejected": -41.77940368652344, + "loss": 0.7544, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.215807318687439, + "rewards/margins": -1.0739952325820923, + "rewards/rejected": 2.2898025512695312, + "step": 3011 + }, + { + "epoch": 0.49, + "learning_rate": 8.842023261197794e-06, + "logits/chosen": -1.0512975454330444, + "logits/rejected": -0.9941706657409668, + "logps/chosen": -51.43650817871094, + "logps/rejected": -63.01072692871094, + "loss": 0.1278, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1283326148986816, + "rewards/margins": 1.2465797662734985, + "rewards/rejected": 1.881752848625183, + "step": 3012 + }, + { + "epoch": 0.49, + "learning_rate": 8.841182052886793e-06, + "logits/chosen": -1.035287857055664, + "logits/rejected": -0.973122239112854, + "logps/chosen": -68.33026123046875, + "logps/rejected": -48.51441192626953, + "loss": 0.3968, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3299691677093506, + "rewards/margins": 2.4338834285736084, + "rewards/rejected": 0.8960857391357422, + "step": 3013 + }, + { + "epoch": 0.49, + "learning_rate": 8.840340579186457e-06, + "logits/chosen": -1.3530000448226929, + "logits/rejected": -1.3828614950180054, + "logps/chosen": -106.28108978271484, + "logps/rejected": -135.87644958496094, + "loss": 0.6716, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9046363830566406, + "rewards/margins": -0.9332969188690186, + "rewards/rejected": 2.837933301925659, + "step": 3014 + }, + { + "epoch": 0.49, + "learning_rate": 8.839498840154925e-06, + "logits/chosen": -0.7402127981185913, + "logits/rejected": -0.6961296796798706, + "logps/chosen": -48.17761993408203, + "logps/rejected": -15.355435371398926, + "loss": 0.2351, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.172775983810425, + "rewards/margins": 1.6106739044189453, + "rewards/rejected": 0.5621020197868347, + "step": 3015 + }, + { + "epoch": 0.49, + "learning_rate": 8.838656835850354e-06, + "logits/chosen": -1.0168458223342896, + "logits/rejected": -0.8817301988601685, + "logps/chosen": -133.68057250976562, + "logps/rejected": -53.840484619140625, + "loss": 0.1676, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.706531047821045, + "rewards/margins": 0.9307253360748291, + "rewards/rejected": 3.775805711746216, + "step": 3016 + }, + { + "epoch": 0.49, + "learning_rate": 8.837814566330916e-06, + "logits/chosen": -0.9984648823738098, + "logits/rejected": -1.1470991373062134, + "logps/chosen": -67.39875030517578, + "logps/rejected": -111.83403015136719, + "loss": 1.5764, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1914665699005127, + "rewards/margins": -3.09626841545105, + "rewards/rejected": 6.2877349853515625, + "step": 3017 + }, + { + "epoch": 0.49, + "learning_rate": 8.836972031654807e-06, + "logits/chosen": -0.9217196106910706, + "logits/rejected": -0.3873019218444824, + "logps/chosen": -124.10621643066406, + "logps/rejected": -35.763916015625, + "loss": 0.1538, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.555690288543701, + "rewards/margins": 6.464145660400391, + "rewards/rejected": 1.0915447473526, + "step": 3018 + }, + { + "epoch": 0.49, + "learning_rate": 8.836129231880236e-06, + "logits/chosen": -1.003101110458374, + "logits/rejected": -1.017014980316162, + "logps/chosen": -71.64881896972656, + "logps/rejected": -64.67951202392578, + "loss": 0.392, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.390554904937744, + "rewards/margins": -0.07206273078918457, + "rewards/rejected": 2.4626176357269287, + "step": 3019 + }, + { + "epoch": 0.49, + "learning_rate": 8.835286167065431e-06, + "logits/chosen": -0.6470839381217957, + "logits/rejected": -0.64261394739151, + "logps/chosen": -10.92845630645752, + "logps/rejected": -7.80496883392334, + "loss": 0.5702, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0664712190628052, + "rewards/margins": 0.09944325685501099, + "rewards/rejected": 0.9670279622077942, + "step": 3020 + }, + { + "epoch": 0.49, + "learning_rate": 8.834442837268642e-06, + "logits/chosen": -0.7594160437583923, + "logits/rejected": -0.6204272508621216, + "logps/chosen": -53.43421173095703, + "logps/rejected": -91.14787292480469, + "loss": 1.0174, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8516578674316406, + "rewards/margins": -1.6454048156738281, + "rewards/rejected": 4.497062683105469, + "step": 3021 + }, + { + "epoch": 0.49, + "learning_rate": 8.833599242548137e-06, + "logits/chosen": -0.9035097360610962, + "logits/rejected": -1.18771493434906, + "logps/chosen": -78.10551452636719, + "logps/rejected": -51.69877243041992, + "loss": 0.6558, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1707763671875, + "rewards/margins": 0.12262368202209473, + "rewards/rejected": 3.0481526851654053, + "step": 3022 + }, + { + "epoch": 0.49, + "learning_rate": 8.832755382962196e-06, + "logits/chosen": -0.9499961733818054, + "logits/rejected": -0.8614532351493835, + "logps/chosen": -65.59428405761719, + "logps/rejected": -74.19789123535156, + "loss": 0.3465, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.950646162033081, + "rewards/margins": 0.2730245590209961, + "rewards/rejected": 2.677621603012085, + "step": 3023 + }, + { + "epoch": 0.49, + "learning_rate": 8.831911258569124e-06, + "logits/chosen": -1.2531023025512695, + "logits/rejected": -1.3180149793624878, + "logps/chosen": -87.19187927246094, + "logps/rejected": -72.05792236328125, + "loss": 0.9837, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.03739333152771, + "rewards/margins": -0.5577070713043213, + "rewards/rejected": 2.5951004028320312, + "step": 3024 + }, + { + "epoch": 0.49, + "learning_rate": 8.831066869427243e-06, + "logits/chosen": -0.9241101741790771, + "logits/rejected": -0.835781991481781, + "logps/chosen": -222.42916870117188, + "logps/rejected": -96.82518005371094, + "loss": 0.0175, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.823367595672607, + "rewards/margins": 3.7857866287231445, + "rewards/rejected": 1.0375808477401733, + "step": 3025 + }, + { + "epoch": 0.49, + "learning_rate": 8.83022221559489e-06, + "logits/chosen": -1.3195459842681885, + "logits/rejected": -1.3294557332992554, + "logps/chosen": -43.85515213012695, + "logps/rejected": -81.50146484375, + "loss": 0.5816, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.716268539428711, + "rewards/margins": 0.5105509757995605, + "rewards/rejected": 2.2057175636291504, + "step": 3026 + }, + { + "epoch": 0.49, + "learning_rate": 8.829377297130426e-06, + "logits/chosen": -0.8938084244728088, + "logits/rejected": -0.7848413586616516, + "logps/chosen": -60.99198913574219, + "logps/rejected": -27.454235076904297, + "loss": 0.2666, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0230872631073, + "rewards/margins": 2.216778039932251, + "rewards/rejected": -0.193690687417984, + "step": 3027 + }, + { + "epoch": 0.49, + "learning_rate": 8.828532114092224e-06, + "logits/chosen": -1.2637161016464233, + "logits/rejected": -1.2181072235107422, + "logps/chosen": -69.00643157958984, + "logps/rejected": -19.415637969970703, + "loss": 0.234, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2740753889083862, + "rewards/margins": 1.1672451496124268, + "rewards/rejected": 0.10683021694421768, + "step": 3028 + }, + { + "epoch": 0.49, + "learning_rate": 8.827686666538678e-06, + "logits/chosen": -0.7198803424835205, + "logits/rejected": -0.713880717754364, + "logps/chosen": -8.691843032836914, + "logps/rejected": -16.973024368286133, + "loss": 0.7682, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.28336963057518005, + "rewards/margins": -0.28058966994285583, + "rewards/rejected": 0.5639593005180359, + "step": 3029 + }, + { + "epoch": 0.49, + "learning_rate": 8.8268409545282e-06, + "logits/chosen": -0.9886541366577148, + "logits/rejected": -0.9789320826530457, + "logps/chosen": -53.0970344543457, + "logps/rejected": -94.9840316772461, + "loss": 1.1678, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7985150814056396, + "rewards/margins": -1.6898829936981201, + "rewards/rejected": 4.48839807510376, + "step": 3030 + }, + { + "epoch": 0.49, + "learning_rate": 8.825994978119224e-06, + "logits/chosen": -0.717557430267334, + "logits/rejected": -0.7508876323699951, + "logps/chosen": -70.1478271484375, + "logps/rejected": -67.85243225097656, + "loss": 2.405, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8871735334396362, + "rewards/margins": -1.2999380826950073, + "rewards/rejected": 3.1871116161346436, + "step": 3031 + }, + { + "epoch": 0.49, + "learning_rate": 8.825148737370195e-06, + "logits/chosen": -1.238034725189209, + "logits/rejected": -1.1890108585357666, + "logps/chosen": -103.10115051269531, + "logps/rejected": -78.21971130371094, + "loss": 1.216, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5899949073791504, + "rewards/margins": -2.227522850036621, + "rewards/rejected": 4.8175177574157715, + "step": 3032 + }, + { + "epoch": 0.49, + "learning_rate": 8.824302232339584e-06, + "logits/chosen": -1.4924359321594238, + "logits/rejected": -1.3552112579345703, + "logps/chosen": -66.23249816894531, + "logps/rejected": -17.692005157470703, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.341325283050537, + "rewards/margins": 5.561880111694336, + "rewards/rejected": 0.7794452905654907, + "step": 3033 + }, + { + "epoch": 0.49, + "learning_rate": 8.823455463085873e-06, + "logits/chosen": -0.787063717842102, + "logits/rejected": -0.48550647497177124, + "logps/chosen": -56.99156188964844, + "logps/rejected": -42.42027282714844, + "loss": 0.3969, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4195313453674316, + "rewards/margins": 1.1836117506027222, + "rewards/rejected": 1.2359195947647095, + "step": 3034 + }, + { + "epoch": 0.49, + "learning_rate": 8.82260842966757e-06, + "logits/chosen": -1.239796757698059, + "logits/rejected": -1.4842634201049805, + "logps/chosen": -92.92929077148438, + "logps/rejected": -47.592796325683594, + "loss": 0.3715, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9513565301895142, + "rewards/margins": -0.017109692096710205, + "rewards/rejected": 0.9684662222862244, + "step": 3035 + }, + { + "epoch": 0.49, + "learning_rate": 8.821761132143192e-06, + "logits/chosen": -0.9522799849510193, + "logits/rejected": -0.8512731194496155, + "logps/chosen": -56.70720672607422, + "logps/rejected": -43.54151153564453, + "loss": 1.3279, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7621208429336548, + "rewards/margins": -1.298073649406433, + "rewards/rejected": 3.060194492340088, + "step": 3036 + }, + { + "epoch": 0.49, + "learning_rate": 8.820913570571283e-06, + "logits/chosen": -0.8746605515480042, + "logits/rejected": -0.7453556656837463, + "logps/chosen": -61.52783203125, + "logps/rejected": -45.230079650878906, + "loss": 0.2398, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1526589393615723, + "rewards/margins": 0.5342670679092407, + "rewards/rejected": 1.6183918714523315, + "step": 3037 + }, + { + "epoch": 0.49, + "learning_rate": 8.820065745010398e-06, + "logits/chosen": -1.1235166788101196, + "logits/rejected": -1.1338287591934204, + "logps/chosen": -72.86158752441406, + "logps/rejected": -178.08926391601562, + "loss": 0.3521, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1489670276641846, + "rewards/margins": 0.08648836612701416, + "rewards/rejected": 1.0624786615371704, + "step": 3038 + }, + { + "epoch": 0.49, + "learning_rate": 8.819217655519118e-06, + "logits/chosen": -1.0654771327972412, + "logits/rejected": -1.319993019104004, + "logps/chosen": -45.667808532714844, + "logps/rejected": -63.0662956237793, + "loss": 3.5145, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.685645341873169, + "rewards/margins": -5.488162040710449, + "rewards/rejected": 8.173807144165039, + "step": 3039 + }, + { + "epoch": 0.49, + "learning_rate": 8.818369302156034e-06, + "logits/chosen": -1.045373558998108, + "logits/rejected": -0.8597756624221802, + "logps/chosen": -136.04840087890625, + "logps/rejected": -107.64906311035156, + "loss": 0.206, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.287045478820801, + "rewards/margins": 1.719587802886963, + "rewards/rejected": 5.567457675933838, + "step": 3040 + }, + { + "epoch": 0.49, + "learning_rate": 8.817520684979762e-06, + "logits/chosen": -0.5226727724075317, + "logits/rejected": -0.8265761733055115, + "logps/chosen": -81.44229125976562, + "logps/rejected": -51.0666618347168, + "loss": 0.9287, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.009930372238159, + "rewards/margins": 0.14826393127441406, + "rewards/rejected": 2.861666440963745, + "step": 3041 + }, + { + "epoch": 0.49, + "learning_rate": 8.816671804048933e-06, + "logits/chosen": -1.144666314125061, + "logits/rejected": -1.3344889879226685, + "logps/chosen": -107.76313781738281, + "logps/rejected": -119.16515350341797, + "loss": 2.1542, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6442062854766846, + "rewards/margins": -2.2269446849823, + "rewards/rejected": 4.871150970458984, + "step": 3042 + }, + { + "epoch": 0.49, + "learning_rate": 8.815822659422195e-06, + "logits/chosen": -0.9086403250694275, + "logits/rejected": -0.9086403250694275, + "logps/chosen": -20.304779052734375, + "logps/rejected": -20.304779052734375, + "loss": 0.9147, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.15930214524269104, + "rewards/margins": 0.0, + "rewards/rejected": 0.15930214524269104, + "step": 3043 + }, + { + "epoch": 0.49, + "learning_rate": 8.814973251158217e-06, + "logits/chosen": -0.7313661575317383, + "logits/rejected": -0.7344601154327393, + "logps/chosen": -74.86367797851562, + "logps/rejected": -48.620235443115234, + "loss": 0.8246, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9289619326591492, + "rewards/margins": -1.3011813163757324, + "rewards/rejected": 2.2301433086395264, + "step": 3044 + }, + { + "epoch": 0.49, + "learning_rate": 8.814123579315686e-06, + "logits/chosen": -0.8561168313026428, + "logits/rejected": -0.9041190147399902, + "logps/chosen": -100.03463745117188, + "logps/rejected": -69.47726440429688, + "loss": 1.3082, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.194421410560608, + "rewards/margins": -1.5424615144729614, + "rewards/rejected": 2.7368829250335693, + "step": 3045 + }, + { + "epoch": 0.49, + "learning_rate": 8.813273643953304e-06, + "logits/chosen": -0.7564297318458557, + "logits/rejected": -0.5521406531333923, + "logps/chosen": -162.02212524414062, + "logps/rejected": -36.263919830322266, + "loss": 0.9697, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1228013038635254, + "rewards/margins": 2.3292171955108643, + "rewards/rejected": 0.7935840487480164, + "step": 3046 + }, + { + "epoch": 0.49, + "learning_rate": 8.812423445129795e-06, + "logits/chosen": -0.7501633763313293, + "logits/rejected": -0.7524002194404602, + "logps/chosen": -75.14091491699219, + "logps/rejected": -124.75917053222656, + "loss": 0.2604, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.032729387283325, + "rewards/margins": 0.4246619939804077, + "rewards/rejected": 1.6080673933029175, + "step": 3047 + }, + { + "epoch": 0.49, + "learning_rate": 8.8115729829039e-06, + "logits/chosen": -0.7786397337913513, + "logits/rejected": -0.8464875221252441, + "logps/chosen": -63.54637908935547, + "logps/rejected": -68.94326782226562, + "loss": 0.2528, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6010987758636475, + "rewards/margins": 0.6141419410705566, + "rewards/rejected": 1.9869568347930908, + "step": 3048 + }, + { + "epoch": 0.49, + "learning_rate": 8.810722257334376e-06, + "logits/chosen": -0.9172277450561523, + "logits/rejected": -0.8221109509468079, + "logps/chosen": -84.01081848144531, + "logps/rejected": -41.61894226074219, + "loss": 1.9808, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.66782546043396, + "rewards/margins": -0.8979873657226562, + "rewards/rejected": 3.565812826156616, + "step": 3049 + }, + { + "epoch": 0.5, + "learning_rate": 8.809871268480004e-06, + "logits/chosen": -1.0160422325134277, + "logits/rejected": -1.0180490016937256, + "logps/chosen": -80.31624603271484, + "logps/rejected": -84.16976928710938, + "loss": 0.8008, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.256112098693848, + "rewards/margins": 0.307863712310791, + "rewards/rejected": 5.948248386383057, + "step": 3050 + }, + { + "epoch": 0.5, + "learning_rate": 8.809020016399574e-06, + "logits/chosen": -1.2619168758392334, + "logits/rejected": -1.2139676809310913, + "logps/chosen": -321.9124755859375, + "logps/rejected": -109.3397216796875, + "loss": 0.045, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.051660060882568, + "rewards/margins": 4.853473663330078, + "rewards/rejected": 1.1981865167617798, + "step": 3051 + }, + { + "epoch": 0.5, + "learning_rate": 8.808168501151904e-06, + "logits/chosen": -0.5324645638465881, + "logits/rejected": -0.5324645638465881, + "logps/chosen": -4.615113735198975, + "logps/rejected": -4.615113735198975, + "loss": 0.5414, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.33029866218566895, + "rewards/margins": 0.0, + "rewards/rejected": 0.33029866218566895, + "step": 3052 + }, + { + "epoch": 0.5, + "learning_rate": 8.807316722795823e-06, + "logits/chosen": -1.2966742515563965, + "logits/rejected": -1.287598967552185, + "logps/chosen": -91.77548217773438, + "logps/rejected": -83.31111145019531, + "loss": 1.2527, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.636785864830017, + "rewards/margins": -0.7448333501815796, + "rewards/rejected": 2.3816192150115967, + "step": 3053 + }, + { + "epoch": 0.5, + "learning_rate": 8.806464681390182e-06, + "logits/chosen": -1.1117020845413208, + "logits/rejected": -1.1059073209762573, + "logps/chosen": -60.206581115722656, + "logps/rejected": -68.57697296142578, + "loss": 0.55, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6866142749786377, + "rewards/margins": 0.44431912899017334, + "rewards/rejected": 1.2422951459884644, + "step": 3054 + }, + { + "epoch": 0.5, + "learning_rate": 8.805612376993847e-06, + "logits/chosen": -0.8626819849014282, + "logits/rejected": -0.9216857552528381, + "logps/chosen": -63.94970703125, + "logps/rejected": -82.54063415527344, + "loss": 0.248, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8539215326309204, + "rewards/margins": 0.5478241443634033, + "rewards/rejected": 1.306097388267517, + "step": 3055 + }, + { + "epoch": 0.5, + "learning_rate": 8.804759809665708e-06, + "logits/chosen": -0.9623993635177612, + "logits/rejected": -0.982117772102356, + "logps/chosen": -58.32518768310547, + "logps/rejected": -74.80435943603516, + "loss": 0.4645, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3508354425430298, + "rewards/margins": -0.09931492805480957, + "rewards/rejected": 1.4501503705978394, + "step": 3056 + }, + { + "epoch": 0.5, + "learning_rate": 8.803906979464665e-06, + "logits/chosen": -1.3074288368225098, + "logits/rejected": -1.2696948051452637, + "logps/chosen": -31.846500396728516, + "logps/rejected": -90.44441223144531, + "loss": 0.4268, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.716238021850586, + "rewards/margins": -0.2739444971084595, + "rewards/rejected": 1.9901825189590454, + "step": 3057 + }, + { + "epoch": 0.5, + "learning_rate": 8.803053886449644e-06, + "logits/chosen": -0.9861083030700684, + "logits/rejected": -1.0357613563537598, + "logps/chosen": -60.22649002075195, + "logps/rejected": -110.35612487792969, + "loss": 1.747, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.011953353881836, + "rewards/margins": 0.6644107103347778, + "rewards/rejected": 1.347542643547058, + "step": 3058 + }, + { + "epoch": 0.5, + "learning_rate": 8.802200530679584e-06, + "logits/chosen": -0.9822863936424255, + "logits/rejected": -0.9460336565971375, + "logps/chosen": -84.50537109375, + "logps/rejected": -43.03363800048828, + "loss": 0.5308, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3254486322402954, + "rewards/margins": -0.18551373481750488, + "rewards/rejected": 1.5109623670578003, + "step": 3059 + }, + { + "epoch": 0.5, + "learning_rate": 8.801346912213445e-06, + "logits/chosen": -0.9095962047576904, + "logits/rejected": -0.9042643904685974, + "logps/chosen": -54.28776168823242, + "logps/rejected": -88.15846252441406, + "loss": 1.4562, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8322086334228516, + "rewards/margins": 0.9411777257919312, + "rewards/rejected": 0.8910309076309204, + "step": 3060 + }, + { + "epoch": 0.5, + "learning_rate": 8.800493031110203e-06, + "logits/chosen": -1.0175387859344482, + "logits/rejected": -0.9391933083534241, + "logps/chosen": -79.05392456054688, + "logps/rejected": -82.90943908691406, + "loss": 0.2242, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2350127696990967, + "rewards/margins": 0.7917678356170654, + "rewards/rejected": 1.4432449340820312, + "step": 3061 + }, + { + "epoch": 0.5, + "learning_rate": 8.79963888742885e-06, + "logits/chosen": -1.0043292045593262, + "logits/rejected": -0.7324211597442627, + "logps/chosen": -112.5743408203125, + "logps/rejected": -53.38649368286133, + "loss": 0.9799, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.659656047821045, + "rewards/margins": 2.2575788497924805, + "rewards/rejected": 3.4020771980285645, + "step": 3062 + }, + { + "epoch": 0.5, + "learning_rate": 8.798784481228405e-06, + "logits/chosen": -1.4298899173736572, + "logits/rejected": -1.4554365873336792, + "logps/chosen": -107.90972900390625, + "logps/rejected": -78.36865997314453, + "loss": 0.1864, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.547471761703491, + "rewards/margins": 0.8141976594924927, + "rewards/rejected": 1.7332741022109985, + "step": 3063 + }, + { + "epoch": 0.5, + "learning_rate": 8.797929812567897e-06, + "logits/chosen": -0.9401819109916687, + "logits/rejected": -0.7517083287239075, + "logps/chosen": -137.7515869140625, + "logps/rejected": -72.3238296508789, + "loss": 0.3575, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.449611186981201, + "rewards/margins": 3.246776819229126, + "rewards/rejected": 3.202834367752075, + "step": 3064 + }, + { + "epoch": 0.5, + "learning_rate": 8.797074881506375e-06, + "logits/chosen": -1.3834936618804932, + "logits/rejected": -1.262893557548523, + "logps/chosen": -112.3154296875, + "logps/rejected": -70.65676879882812, + "loss": 0.3306, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.749081611633301, + "rewards/margins": 1.2332825660705566, + "rewards/rejected": 4.515799045562744, + "step": 3065 + }, + { + "epoch": 0.5, + "learning_rate": 8.796219688102906e-06, + "logits/chosen": -1.2247155904769897, + "logits/rejected": -1.2282154560089111, + "logps/chosen": -191.23825073242188, + "logps/rejected": -48.167388916015625, + "loss": 0.1113, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.275216579437256, + "rewards/margins": 1.397948980331421, + "rewards/rejected": 2.877267599105835, + "step": 3066 + }, + { + "epoch": 0.5, + "learning_rate": 8.795364232416578e-06, + "logits/chosen": -1.0659490823745728, + "logits/rejected": -1.0447725057601929, + "logps/chosen": -100.88619232177734, + "logps/rejected": -97.57206726074219, + "loss": 2.0223, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4191521406173706, + "rewards/margins": 0.017446160316467285, + "rewards/rejected": 1.4017059803009033, + "step": 3067 + }, + { + "epoch": 0.5, + "learning_rate": 8.794508514506493e-06, + "logits/chosen": -1.131477952003479, + "logits/rejected": -1.1285051107406616, + "logps/chosen": -59.131446838378906, + "logps/rejected": -54.604209899902344, + "loss": 2.6654, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2129013538360596, + "rewards/margins": -2.075493574142456, + "rewards/rejected": 3.2883949279785156, + "step": 3068 + }, + { + "epoch": 0.5, + "learning_rate": 8.793652534431774e-06, + "logits/chosen": -1.0628751516342163, + "logits/rejected": -1.0831831693649292, + "logps/chosen": -42.56230926513672, + "logps/rejected": -53.68916320800781, + "loss": 0.9744, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6327431201934814, + "rewards/margins": -1.7474119663238525, + "rewards/rejected": 4.380155086517334, + "step": 3069 + }, + { + "epoch": 0.5, + "learning_rate": 8.79279629225156e-06, + "logits/chosen": -1.1423465013504028, + "logits/rejected": -1.1423465013504028, + "logps/chosen": -61.40972137451172, + "logps/rejected": -61.40972137451172, + "loss": 0.564, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4229226112365723, + "rewards/margins": 0.0, + "rewards/rejected": 2.4229226112365723, + "step": 3070 + }, + { + "epoch": 0.5, + "learning_rate": 8.79193978802501e-06, + "logits/chosen": -1.0932495594024658, + "logits/rejected": -0.9818858504295349, + "logps/chosen": -51.284332275390625, + "logps/rejected": -28.652080535888672, + "loss": 0.0717, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7883431911468506, + "rewards/margins": 1.9130539894104004, + "rewards/rejected": 1.8752892017364502, + "step": 3071 + }, + { + "epoch": 0.5, + "learning_rate": 8.791083021811301e-06, + "logits/chosen": -0.8747981190681458, + "logits/rejected": -0.819014847278595, + "logps/chosen": -85.09132385253906, + "logps/rejected": -57.03120040893555, + "loss": 0.3696, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8177307844161987, + "rewards/margins": -0.08956718444824219, + "rewards/rejected": 1.907297968864441, + "step": 3072 + }, + { + "epoch": 0.5, + "learning_rate": 8.790225993669625e-06, + "logits/chosen": -0.8271474242210388, + "logits/rejected": -0.815858781337738, + "logps/chosen": -18.39427947998047, + "logps/rejected": -19.67128562927246, + "loss": 1.0383, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4003008008003235, + "rewards/margins": -0.8721931576728821, + "rewards/rejected": 1.2724939584732056, + "step": 3073 + }, + { + "epoch": 0.5, + "learning_rate": 8.789368703659199e-06, + "logits/chosen": -1.1771056652069092, + "logits/rejected": -1.1448431015014648, + "logps/chosen": -126.81139373779297, + "logps/rejected": -113.33349609375, + "loss": 0.8427, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.603574275970459, + "rewards/margins": -0.9267740249633789, + "rewards/rejected": 7.530348300933838, + "step": 3074 + }, + { + "epoch": 0.5, + "learning_rate": 8.788511151839249e-06, + "logits/chosen": -0.7638033032417297, + "logits/rejected": -0.6525939702987671, + "logps/chosen": -85.87738800048828, + "logps/rejected": -15.679146766662598, + "loss": 0.3171, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7916244864463806, + "rewards/margins": 0.16710656881332397, + "rewards/rejected": 0.6245179176330566, + "step": 3075 + }, + { + "epoch": 0.5, + "learning_rate": 8.787653338269027e-06, + "logits/chosen": -0.9715599417686462, + "logits/rejected": -0.9715599417686462, + "logps/chosen": -58.96846008300781, + "logps/rejected": -58.96846008300781, + "loss": 2.2271, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.183462619781494, + "rewards/margins": 0.0, + "rewards/rejected": 3.183462619781494, + "step": 3076 + }, + { + "epoch": 0.5, + "learning_rate": 8.786795263007798e-06, + "logits/chosen": -1.0344891548156738, + "logits/rejected": -1.0519026517868042, + "logps/chosen": -125.32112121582031, + "logps/rejected": -60.90457534790039, + "loss": 0.2646, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5664565563201904, + "rewards/margins": 0.5798373222351074, + "rewards/rejected": 1.986619234085083, + "step": 3077 + }, + { + "epoch": 0.5, + "learning_rate": 8.785936926114847e-06, + "logits/chosen": -1.1526025533676147, + "logits/rejected": -1.1760320663452148, + "logps/chosen": -123.31924438476562, + "logps/rejected": -66.47616577148438, + "loss": 0.32, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4151595830917358, + "rewards/margins": 0.20187067985534668, + "rewards/rejected": 1.2132889032363892, + "step": 3078 + }, + { + "epoch": 0.5, + "learning_rate": 8.785078327649476e-06, + "logits/chosen": -0.8771184086799622, + "logits/rejected": -0.8566896915435791, + "logps/chosen": -25.870811462402344, + "logps/rejected": -81.14781951904297, + "loss": 0.3738, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9833984375, + "rewards/margins": -0.03481292724609375, + "rewards/rejected": 1.0182113647460938, + "step": 3079 + }, + { + "epoch": 0.5, + "learning_rate": 8.784219467671009e-06, + "logits/chosen": -1.023674726486206, + "logits/rejected": -1.2363098859786987, + "logps/chosen": -98.84642028808594, + "logps/rejected": -152.6827850341797, + "loss": 2.6206, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5549468994140625, + "rewards/margins": -5.233885765075684, + "rewards/rejected": 8.788832664489746, + "step": 3080 + }, + { + "epoch": 0.5, + "learning_rate": 8.783360346238783e-06, + "logits/chosen": -1.0266056060791016, + "logits/rejected": -1.0266056060791016, + "logps/chosen": -50.435455322265625, + "logps/rejected": -50.435455322265625, + "loss": 0.8797, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.0091142654418945, + "rewards/margins": 0.0, + "rewards/rejected": 4.0091142654418945, + "step": 3081 + }, + { + "epoch": 0.5, + "learning_rate": 8.782500963412156e-06, + "logits/chosen": -1.1764978170394897, + "logits/rejected": -1.0487282276153564, + "logps/chosen": -109.42230224609375, + "logps/rejected": -37.521759033203125, + "loss": 0.1891, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.720664978027344, + "rewards/margins": 1.2177062034606934, + "rewards/rejected": 3.5029587745666504, + "step": 3082 + }, + { + "epoch": 0.5, + "learning_rate": 8.781641319250502e-06, + "logits/chosen": -0.953725278377533, + "logits/rejected": -0.9874235987663269, + "logps/chosen": -24.749282836914062, + "logps/rejected": -44.02019119262695, + "loss": 1.8238, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1924972534179688, + "rewards/margins": -1.1953136920928955, + "rewards/rejected": 2.3878109455108643, + "step": 3083 + }, + { + "epoch": 0.5, + "learning_rate": 8.780781413813217e-06, + "logits/chosen": -1.2380880117416382, + "logits/rejected": -1.1369514465332031, + "logps/chosen": -58.697696685791016, + "logps/rejected": -37.85370635986328, + "loss": 2.1114, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6839427947998047, + "rewards/margins": 1.2251384258270264, + "rewards/rejected": 0.45880433917045593, + "step": 3084 + }, + { + "epoch": 0.5, + "learning_rate": 8.779921247159709e-06, + "logits/chosen": -0.9903768301010132, + "logits/rejected": -0.7772943377494812, + "logps/chosen": -66.7038345336914, + "logps/rejected": -21.24706268310547, + "loss": 0.3516, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2723464965820312, + "rewards/margins": 2.5166831016540527, + "rewards/rejected": -0.24433670938014984, + "step": 3085 + }, + { + "epoch": 0.5, + "learning_rate": 8.779060819349408e-06, + "logits/chosen": -1.0401298999786377, + "logits/rejected": -1.0401298999786377, + "logps/chosen": -0.5711169242858887, + "logps/rejected": -0.5711169242858887, + "loss": 0.6656, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3190458416938782, + "rewards/margins": 0.0, + "rewards/rejected": 0.3190458416938782, + "step": 3086 + }, + { + "epoch": 0.5, + "learning_rate": 8.778200130441761e-06, + "logits/chosen": -0.9207680821418762, + "logits/rejected": -0.5128045082092285, + "logps/chosen": -59.61284637451172, + "logps/rejected": -55.99083709716797, + "loss": 0.4002, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2957206964492798, + "rewards/margins": -0.08708643913269043, + "rewards/rejected": 1.3828071355819702, + "step": 3087 + }, + { + "epoch": 0.5, + "learning_rate": 8.777339180496238e-06, + "logits/chosen": -0.6686239838600159, + "logits/rejected": -0.6261398196220398, + "logps/chosen": -100.70264434814453, + "logps/rejected": -118.33748626708984, + "loss": 0.8946, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.816497802734375, + "rewards/margins": 0.2588157653808594, + "rewards/rejected": 1.5576820373535156, + "step": 3088 + }, + { + "epoch": 0.5, + "learning_rate": 8.776477969572316e-06, + "logits/chosen": -0.9945324659347534, + "logits/rejected": -0.793526291847229, + "logps/chosen": -40.82911682128906, + "logps/rejected": -20.807661056518555, + "loss": 0.0199, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.37507963180542, + "rewards/margins": 3.916886806488037, + "rewards/rejected": 0.4581928253173828, + "step": 3089 + }, + { + "epoch": 0.5, + "learning_rate": 8.775616497729502e-06, + "logits/chosen": -0.7524502873420715, + "logits/rejected": -0.6748273372650146, + "logps/chosen": -133.8623046875, + "logps/rejected": -42.268733978271484, + "loss": 0.5393, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4550232887268066, + "rewards/margins": 1.4325604438781738, + "rewards/rejected": 1.0224628448486328, + "step": 3090 + }, + { + "epoch": 0.5, + "learning_rate": 8.774754765027314e-06, + "logits/chosen": -0.8806155323982239, + "logits/rejected": -0.827594518661499, + "logps/chosen": -62.05290222167969, + "logps/rejected": -29.523021697998047, + "loss": 0.3066, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2856369018554688, + "rewards/margins": 0.5616253018379211, + "rewards/rejected": 0.7240116000175476, + "step": 3091 + }, + { + "epoch": 0.5, + "learning_rate": 8.773892771525285e-06, + "logits/chosen": -0.6981081366539001, + "logits/rejected": -0.730330765247345, + "logps/chosen": -45.0623893737793, + "logps/rejected": -82.69322204589844, + "loss": 0.1506, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2886340618133545, + "rewards/margins": 1.406170129776001, + "rewards/rejected": 0.8824638724327087, + "step": 3092 + }, + { + "epoch": 0.5, + "learning_rate": 8.773030517282978e-06, + "logits/chosen": -1.0721156597137451, + "logits/rejected": -1.298659086227417, + "logps/chosen": -78.05682373046875, + "logps/rejected": -49.34783172607422, + "loss": 0.8577, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0327775478363037, + "rewards/margins": -0.46289825439453125, + "rewards/rejected": 3.495675802230835, + "step": 3093 + }, + { + "epoch": 0.5, + "learning_rate": 8.772168002359962e-06, + "logits/chosen": -0.8966013193130493, + "logits/rejected": -0.857401430606842, + "logps/chosen": -68.71771240234375, + "logps/rejected": -38.69983673095703, + "loss": 0.5307, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3072983026504517, + "rewards/margins": -0.17468714714050293, + "rewards/rejected": 1.4819854497909546, + "step": 3094 + }, + { + "epoch": 0.5, + "learning_rate": 8.77130522681583e-06, + "logits/chosen": -1.0759798288345337, + "logits/rejected": -0.9352686405181885, + "logps/chosen": -131.25985717773438, + "logps/rejected": -15.11185073852539, + "loss": 0.128, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.942004442214966, + "rewards/margins": 3.588061571121216, + "rewards/rejected": 0.35394287109375, + "step": 3095 + }, + { + "epoch": 0.5, + "learning_rate": 8.77044219071019e-06, + "logits/chosen": -0.5038803219795227, + "logits/rejected": -0.4987097382545471, + "logps/chosen": -4.255629062652588, + "logps/rejected": -27.591718673706055, + "loss": 0.9873, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.24926243722438812, + "rewards/margins": -0.17676524817943573, + "rewards/rejected": 0.42602768540382385, + "step": 3096 + }, + { + "epoch": 0.5, + "learning_rate": 8.769578894102671e-06, + "logits/chosen": -1.2962441444396973, + "logits/rejected": -1.133104681968689, + "logps/chosen": -82.67000579833984, + "logps/rejected": -57.33946228027344, + "loss": 0.0489, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.1548075675964355, + "rewards/margins": 2.389277935028076, + "rewards/rejected": 2.7655296325683594, + "step": 3097 + }, + { + "epoch": 0.5, + "learning_rate": 8.768715337052918e-06, + "logits/chosen": -0.6690967679023743, + "logits/rejected": -0.564389705657959, + "logps/chosen": -52.68424987792969, + "logps/rejected": -34.19313430786133, + "loss": 1.218, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2188819646835327, + "rewards/margins": 0.6104968786239624, + "rewards/rejected": 0.6083850860595703, + "step": 3098 + }, + { + "epoch": 0.5, + "learning_rate": 8.767851519620597e-06, + "logits/chosen": -0.9724507331848145, + "logits/rejected": -0.8654985427856445, + "logps/chosen": -72.37449645996094, + "logps/rejected": -18.953372955322266, + "loss": 0.5993, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3857085704803467, + "rewards/margins": 2.2353341579437256, + "rewards/rejected": 0.1503744125366211, + "step": 3099 + }, + { + "epoch": 0.5, + "learning_rate": 8.766987441865386e-06, + "logits/chosen": -0.7217859029769897, + "logits/rejected": -0.729870080947876, + "logps/chosen": -47.32630920410156, + "logps/rejected": -94.75528717041016, + "loss": 0.5466, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0974960327148438, + "rewards/margins": 0.2815284729003906, + "rewards/rejected": 1.8159675598144531, + "step": 3100 + }, + { + "epoch": 0.5, + "learning_rate": 8.766123103846987e-06, + "logits/chosen": -0.6651518940925598, + "logits/rejected": -0.6651518940925598, + "logps/chosen": -42.715599060058594, + "logps/rejected": -42.715599060058594, + "loss": 0.763, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2585655450820923, + "rewards/margins": 0.0, + "rewards/rejected": 1.2585655450820923, + "step": 3101 + }, + { + "epoch": 0.5, + "learning_rate": 8.765258505625117e-06, + "logits/chosen": -0.8963487148284912, + "logits/rejected": -0.8081939220428467, + "logps/chosen": -60.254005432128906, + "logps/rejected": -44.10472869873047, + "loss": 0.3725, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.028188467025757, + "rewards/margins": -0.0991363525390625, + "rewards/rejected": 2.1273248195648193, + "step": 3102 + }, + { + "epoch": 0.5, + "learning_rate": 8.764393647259511e-06, + "logits/chosen": -0.8513326048851013, + "logits/rejected": -0.7822369933128357, + "logps/chosen": -56.97962188720703, + "logps/rejected": -74.54522705078125, + "loss": 0.2296, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.499396562576294, + "rewards/margins": 0.706601619720459, + "rewards/rejected": 2.792794942855835, + "step": 3103 + }, + { + "epoch": 0.5, + "learning_rate": 8.763528528809923e-06, + "logits/chosen": -1.333539605140686, + "logits/rejected": -1.3524099588394165, + "logps/chosen": -96.95330810546875, + "logps/rejected": -109.5002212524414, + "loss": 0.2797, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.933172702789307, + "rewards/margins": 0.3754005432128906, + "rewards/rejected": 5.557772159576416, + "step": 3104 + }, + { + "epoch": 0.5, + "learning_rate": 8.762663150336126e-06, + "logits/chosen": -0.7245292067527771, + "logits/rejected": -0.7245292067527771, + "logps/chosen": -13.306404113769531, + "logps/rejected": -13.306404113769531, + "loss": 0.6902, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0623019933700562, + "rewards/margins": 0.0, + "rewards/rejected": 1.0623019933700562, + "step": 3105 + }, + { + "epoch": 0.5, + "learning_rate": 8.761797511897907e-06, + "logits/chosen": -0.8943729996681213, + "logits/rejected": -0.7878551483154297, + "logps/chosen": -27.835712432861328, + "logps/rejected": -17.491901397705078, + "loss": 0.1599, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.473941445350647, + "rewards/margins": 0.9955114126205444, + "rewards/rejected": 0.47843000292778015, + "step": 3106 + }, + { + "epoch": 0.5, + "learning_rate": 8.760931613555076e-06, + "logits/chosen": -1.2022165060043335, + "logits/rejected": -1.183095097541809, + "logps/chosen": -69.44306182861328, + "logps/rejected": -74.11092376708984, + "loss": 1.1659, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9724884033203125, + "rewards/margins": -2.056553602218628, + "rewards/rejected": 3.0290420055389404, + "step": 3107 + }, + { + "epoch": 0.5, + "learning_rate": 8.760065455367454e-06, + "logits/chosen": -0.9375208616256714, + "logits/rejected": -0.8805354833602905, + "logps/chosen": -118.36470031738281, + "logps/rejected": -57.077842712402344, + "loss": 0.44, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.456019639968872, + "rewards/margins": -0.11521685123443604, + "rewards/rejected": 1.571236491203308, + "step": 3108 + }, + { + "epoch": 0.5, + "learning_rate": 8.759199037394888e-06, + "logits/chosen": -1.353838324546814, + "logits/rejected": -1.448853850364685, + "logps/chosen": -205.76422119140625, + "logps/rejected": -135.28228759765625, + "loss": 1.116, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.8585205078125, + "rewards/margins": -2.037707805633545, + "rewards/rejected": 6.896228313446045, + "step": 3109 + }, + { + "epoch": 0.5, + "learning_rate": 8.758332359697238e-06, + "logits/chosen": -1.0942339897155762, + "logits/rejected": -1.0757088661193848, + "logps/chosen": -88.15692138671875, + "logps/rejected": -81.13810729980469, + "loss": 0.3193, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4540786743164062, + "rewards/margins": 1.1255691051483154, + "rewards/rejected": 1.3285095691680908, + "step": 3110 + }, + { + "epoch": 0.5, + "learning_rate": 8.757465422334385e-06, + "logits/chosen": -1.3553638458251953, + "logits/rejected": -1.5234521627426147, + "logps/chosen": -300.5459899902344, + "logps/rejected": -105.98341369628906, + "loss": 0.5634, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.183889865875244, + "rewards/margins": -0.7345595359802246, + "rewards/rejected": 6.918449401855469, + "step": 3111 + }, + { + "epoch": 0.51, + "learning_rate": 8.756598225366224e-06, + "logits/chosen": -0.9726642370223999, + "logits/rejected": -1.02047860622406, + "logps/chosen": -79.07263946533203, + "logps/rejected": -93.79750061035156, + "loss": 2.6028, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4600563049316406, + "rewards/margins": -1.8116919994354248, + "rewards/rejected": 3.2717483043670654, + "step": 3112 + }, + { + "epoch": 0.51, + "learning_rate": 8.75573076885267e-06, + "logits/chosen": -0.7473878264427185, + "logits/rejected": -0.7473878264427185, + "logps/chosen": -77.7769546508789, + "logps/rejected": -77.7769546508789, + "loss": 0.3724, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2936897277832031, + "rewards/margins": 0.0, + "rewards/rejected": 1.2936897277832031, + "step": 3113 + }, + { + "epoch": 0.51, + "learning_rate": 8.754863052853658e-06, + "logits/chosen": -0.8249735236167908, + "logits/rejected": -0.7481058835983276, + "logps/chosen": -29.281312942504883, + "logps/rejected": -6.463963985443115, + "loss": 1.165, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8344327807426453, + "rewards/margins": 0.5320563316345215, + "rewards/rejected": 0.3023764193058014, + "step": 3114 + }, + { + "epoch": 0.51, + "learning_rate": 8.753995077429138e-06, + "logits/chosen": -0.9079311490058899, + "logits/rejected": -0.9068560004234314, + "logps/chosen": -26.529033660888672, + "logps/rejected": -17.439647674560547, + "loss": 0.7296, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7375995516777039, + "rewards/margins": 0.08981055021286011, + "rewards/rejected": 0.6477890014648438, + "step": 3115 + }, + { + "epoch": 0.51, + "learning_rate": 8.75312684263908e-06, + "logits/chosen": -0.9360632300376892, + "logits/rejected": -0.9356096386909485, + "logps/chosen": -69.91348266601562, + "logps/rejected": -47.43474578857422, + "loss": 0.453, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.336406707763672, + "rewards/margins": -0.11705327033996582, + "rewards/rejected": 2.4534599781036377, + "step": 3116 + }, + { + "epoch": 0.51, + "learning_rate": 8.752258348543466e-06, + "logits/chosen": -0.7772596478462219, + "logits/rejected": -0.8029407262802124, + "logps/chosen": -0.8902326822280884, + "logps/rejected": -27.968994140625, + "loss": 0.4998, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5027296543121338, + "rewards/margins": 0.18353599309921265, + "rewards/rejected": 0.31919366121292114, + "step": 3117 + }, + { + "epoch": 0.51, + "learning_rate": 8.751389595202307e-06, + "logits/chosen": -1.538415551185608, + "logits/rejected": -1.5306371450424194, + "logps/chosen": -204.67930603027344, + "logps/rejected": -112.18238830566406, + "loss": 1.8814, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5900590419769287, + "rewards/margins": -3.6567838191986084, + "rewards/rejected": 7.246842861175537, + "step": 3118 + }, + { + "epoch": 0.51, + "learning_rate": 8.750520582675621e-06, + "logits/chosen": -1.2095966339111328, + "logits/rejected": -1.2746766805648804, + "logps/chosen": -117.31069946289062, + "logps/rejected": -199.33697509765625, + "loss": 2.7778, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.114892482757568, + "rewards/margins": -4.278244495391846, + "rewards/rejected": 8.393136978149414, + "step": 3119 + }, + { + "epoch": 0.51, + "learning_rate": 8.749651311023452e-06, + "logits/chosen": -0.7251395583152771, + "logits/rejected": -0.7243677377700806, + "logps/chosen": -65.4250259399414, + "logps/rejected": -52.04551315307617, + "loss": 1.0349, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5773353576660156, + "rewards/margins": -0.6591136455535889, + "rewards/rejected": 2.2364490032196045, + "step": 3120 + }, + { + "epoch": 0.51, + "learning_rate": 8.748781780305858e-06, + "logits/chosen": -0.9186376333236694, + "logits/rejected": -0.8418815732002258, + "logps/chosen": -69.27616882324219, + "logps/rejected": -63.050209045410156, + "loss": 0.2938, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3863022327423096, + "rewards/margins": 0.28635096549987793, + "rewards/rejected": 2.0999512672424316, + "step": 3121 + }, + { + "epoch": 0.51, + "learning_rate": 8.747911990582912e-06, + "logits/chosen": -0.6845328211784363, + "logits/rejected": -0.6845328211784363, + "logps/chosen": -49.66758728027344, + "logps/rejected": -49.66758728027344, + "loss": 1.9642, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9618350863456726, + "rewards/margins": 0.0, + "rewards/rejected": 0.9618350863456726, + "step": 3122 + }, + { + "epoch": 0.51, + "learning_rate": 8.747041941914712e-06, + "logits/chosen": -0.3923344314098358, + "logits/rejected": -0.3916769325733185, + "logps/chosen": -3.12147855758667, + "logps/rejected": -1.4810830354690552, + "loss": 0.5484, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.21502871811389923, + "rewards/margins": -0.09714438021183014, + "rewards/rejected": 0.31217309832572937, + "step": 3123 + }, + { + "epoch": 0.51, + "learning_rate": 8.746171634361368e-06, + "logits/chosen": -0.9448392391204834, + "logits/rejected": -1.0352498292922974, + "logps/chosen": -101.94462585449219, + "logps/rejected": -127.00853729248047, + "loss": 3.0949, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.321120500564575, + "rewards/margins": -4.046903610229492, + "rewards/rejected": 6.3680243492126465, + "step": 3124 + }, + { + "epoch": 0.51, + "learning_rate": 8.745301067983012e-06, + "logits/chosen": -0.7716917991638184, + "logits/rejected": -0.7701359987258911, + "logps/chosen": -1.818604826927185, + "logps/rejected": -3.7738170623779297, + "loss": 0.6272, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.38258227705955505, + "rewards/margins": -0.11967453360557556, + "rewards/rejected": 0.5022568106651306, + "step": 3125 + }, + { + "epoch": 0.51, + "learning_rate": 8.744430242839788e-06, + "logits/chosen": -0.6041924953460693, + "logits/rejected": -0.7516666054725647, + "logps/chosen": -80.87993621826172, + "logps/rejected": -93.06455993652344, + "loss": 1.287, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1545990705490112, + "rewards/margins": -2.4768080711364746, + "rewards/rejected": 3.6314072608947754, + "step": 3126 + }, + { + "epoch": 0.51, + "learning_rate": 8.743559158991867e-06, + "logits/chosen": -1.2246837615966797, + "logits/rejected": -1.2242954969406128, + "logps/chosen": -213.83786010742188, + "logps/rejected": -62.40750503540039, + "loss": 0.3987, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.2903900146484375, + "rewards/margins": 2.634063243865967, + "rewards/rejected": 3.6563267707824707, + "step": 3127 + }, + { + "epoch": 0.51, + "learning_rate": 8.742687816499428e-06, + "logits/chosen": -1.327725887298584, + "logits/rejected": -1.185140609741211, + "logps/chosen": -98.58808898925781, + "logps/rejected": -60.77745056152344, + "loss": 0.1415, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.0638322830200195, + "rewards/margins": 4.717689514160156, + "rewards/rejected": 1.3461426496505737, + "step": 3128 + }, + { + "epoch": 0.51, + "learning_rate": 8.741816215422676e-06, + "logits/chosen": -1.463709831237793, + "logits/rejected": -1.406901478767395, + "logps/chosen": -103.48199462890625, + "logps/rejected": -85.68392944335938, + "loss": 0.4123, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1679961681365967, + "rewards/margins": 0.09370183944702148, + "rewards/rejected": 3.074294328689575, + "step": 3129 + }, + { + "epoch": 0.51, + "learning_rate": 8.740944355821827e-06, + "logits/chosen": -0.7568255662918091, + "logits/rejected": -0.5106989741325378, + "logps/chosen": -73.54588317871094, + "logps/rejected": -11.279117584228516, + "loss": 0.1355, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9354400634765625, + "rewards/margins": 2.1965513229370117, + "rewards/rejected": 0.7388887405395508, + "step": 3130 + }, + { + "epoch": 0.51, + "learning_rate": 8.740072237757122e-06, + "logits/chosen": -1.0405759811401367, + "logits/rejected": -1.0405759811401367, + "logps/chosen": -71.80073547363281, + "logps/rejected": -71.80073547363281, + "loss": 0.6025, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.469866991043091, + "rewards/margins": 0.0, + "rewards/rejected": 3.469866991043091, + "step": 3131 + }, + { + "epoch": 0.51, + "learning_rate": 8.739199861288815e-06, + "logits/chosen": -1.233776569366455, + "logits/rejected": -1.2012145519256592, + "logps/chosen": -85.29695129394531, + "logps/rejected": -83.87442016601562, + "loss": 0.3557, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.955120861530304, + "rewards/margins": 0.1933654546737671, + "rewards/rejected": 0.7617554068565369, + "step": 3132 + }, + { + "epoch": 0.51, + "learning_rate": 8.738327226477177e-06, + "logits/chosen": -1.095947504043579, + "logits/rejected": -0.849612832069397, + "logps/chosen": -138.95542907714844, + "logps/rejected": -62.25068283081055, + "loss": 0.5002, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.945919990539551, + "rewards/margins": 4.062473773956299, + "rewards/rejected": 2.883446216583252, + "step": 3133 + }, + { + "epoch": 0.51, + "learning_rate": 8.7374543333825e-06, + "logits/chosen": -0.9965699911117554, + "logits/rejected": -0.9965699911117554, + "logps/chosen": -35.364139556884766, + "logps/rejected": -35.364139556884766, + "loss": 1.6421, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8461445569992065, + "rewards/margins": 0.0, + "rewards/rejected": 1.8461445569992065, + "step": 3134 + }, + { + "epoch": 0.51, + "learning_rate": 8.736581182065092e-06, + "logits/chosen": -1.2726249694824219, + "logits/rejected": -1.242876410484314, + "logps/chosen": -102.10442352294922, + "logps/rejected": -94.40218353271484, + "loss": 1.3747, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1376793384552002, + "rewards/margins": -2.598738193511963, + "rewards/rejected": 3.736417531967163, + "step": 3135 + }, + { + "epoch": 0.51, + "learning_rate": 8.73570777258528e-06, + "logits/chosen": -0.8368522524833679, + "logits/rejected": -0.6973925828933716, + "logps/chosen": -51.6776008605957, + "logps/rejected": -12.577520370483398, + "loss": 0.6031, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8624088764190674, + "rewards/margins": 1.3475515842437744, + "rewards/rejected": 0.514857292175293, + "step": 3136 + }, + { + "epoch": 0.51, + "learning_rate": 8.73483410500341e-06, + "logits/chosen": -0.6822999715805054, + "logits/rejected": -0.5703274607658386, + "logps/chosen": -62.38957214355469, + "logps/rejected": -57.069541931152344, + "loss": 0.5529, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.240107774734497, + "rewards/margins": 0.6715973615646362, + "rewards/rejected": 1.5685104131698608, + "step": 3137 + }, + { + "epoch": 0.51, + "learning_rate": 8.733960179379842e-06, + "logits/chosen": -1.2237919569015503, + "logits/rejected": -1.1946678161621094, + "logps/chosen": -90.13794708251953, + "logps/rejected": -75.67644500732422, + "loss": 0.377, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6792335510253906, + "rewards/margins": -0.09668433666229248, + "rewards/rejected": 1.775917887687683, + "step": 3138 + }, + { + "epoch": 0.51, + "learning_rate": 8.733085995774957e-06, + "logits/chosen": -0.36072707176208496, + "logits/rejected": -0.35792288184165955, + "logps/chosen": -3.2594594955444336, + "logps/rejected": -7.56066370010376, + "loss": 0.4387, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32028666138648987, + "rewards/margins": 0.018151432275772095, + "rewards/rejected": 0.3021352291107178, + "step": 3139 + }, + { + "epoch": 0.51, + "learning_rate": 8.73221155424915e-06, + "logits/chosen": -1.0216878652572632, + "logits/rejected": -0.8472537994384766, + "logps/chosen": -57.65519714355469, + "logps/rejected": -35.9649658203125, + "loss": 0.0508, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1837143898010254, + "rewards/margins": 3.1480515003204346, + "rewards/rejected": 0.03566284105181694, + "step": 3140 + }, + { + "epoch": 0.51, + "learning_rate": 8.731336854862843e-06, + "logits/chosen": -0.6588640809059143, + "logits/rejected": -0.6588640809059143, + "logps/chosen": -1.644377589225769, + "logps/rejected": -1.644377589225769, + "loss": 0.9365, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2764299809932709, + "rewards/margins": 0.0, + "rewards/rejected": 0.2764299809932709, + "step": 3141 + }, + { + "epoch": 0.51, + "learning_rate": 8.730461897676463e-06, + "logits/chosen": -0.5208459496498108, + "logits/rejected": -0.516261637210846, + "logps/chosen": -1.8471379280090332, + "logps/rejected": -1.9750630855560303, + "loss": 0.9731, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.18315580487251282, + "rewards/margins": -0.14441204071044922, + "rewards/rejected": 0.32756784558296204, + "step": 3142 + }, + { + "epoch": 0.51, + "learning_rate": 8.729586682750465e-06, + "logits/chosen": -0.9471021294593811, + "logits/rejected": -0.9418603181838989, + "logps/chosen": -118.41397094726562, + "logps/rejected": -99.80938720703125, + "loss": 0.4414, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.693782329559326, + "rewards/margins": -0.34494447708129883, + "rewards/rejected": 5.038726806640625, + "step": 3143 + }, + { + "epoch": 0.51, + "learning_rate": 8.728711210145317e-06, + "logits/chosen": -0.876204788684845, + "logits/rejected": -1.064070463180542, + "logps/chosen": -74.51243591308594, + "logps/rejected": -112.57331848144531, + "loss": 1.4598, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.7867934703826904, + "rewards/margins": -2.842083692550659, + "rewards/rejected": 6.62887716293335, + "step": 3144 + }, + { + "epoch": 0.51, + "learning_rate": 8.727835479921504e-06, + "logits/chosen": -1.3200474977493286, + "logits/rejected": -1.1863757371902466, + "logps/chosen": -109.57072448730469, + "logps/rejected": -65.7491683959961, + "loss": 0.0712, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.153018474578857, + "rewards/margins": 1.9253907203674316, + "rewards/rejected": 4.227627754211426, + "step": 3145 + }, + { + "epoch": 0.51, + "learning_rate": 8.726959492139535e-06, + "logits/chosen": -0.9625959396362305, + "logits/rejected": -1.033353567123413, + "logps/chosen": -116.80284118652344, + "logps/rejected": -129.43495178222656, + "loss": 2.4173, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.846856713294983, + "rewards/margins": -4.814741611480713, + "rewards/rejected": 6.661598205566406, + "step": 3146 + }, + { + "epoch": 0.51, + "learning_rate": 8.726083246859929e-06, + "logits/chosen": -1.0242319107055664, + "logits/rejected": -1.008414626121521, + "logps/chosen": -66.48712921142578, + "logps/rejected": -11.858097076416016, + "loss": 0.4758, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.199676513671875, + "rewards/margins": 0.5543580651283264, + "rewards/rejected": 0.6453184485435486, + "step": 3147 + }, + { + "epoch": 0.51, + "learning_rate": 8.725206744143227e-06, + "logits/chosen": -1.0253692865371704, + "logits/rejected": -1.0051828622817993, + "logps/chosen": -45.26993179321289, + "logps/rejected": -40.99296188354492, + "loss": 0.3869, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.055707931518555, + "rewards/margins": 0.4523506164550781, + "rewards/rejected": 3.6033573150634766, + "step": 3148 + }, + { + "epoch": 0.51, + "learning_rate": 8.724329984049986e-06, + "logits/chosen": -1.7904988527297974, + "logits/rejected": -1.7301537990570068, + "logps/chosen": -112.3355484008789, + "logps/rejected": -96.99365234375, + "loss": 0.3165, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.253829479217529, + "rewards/margins": 2.747126340866089, + "rewards/rejected": 2.5067031383514404, + "step": 3149 + }, + { + "epoch": 0.51, + "learning_rate": 8.723452966640785e-06, + "logits/chosen": -1.3574581146240234, + "logits/rejected": -1.4684797525405884, + "logps/chosen": -129.056640625, + "logps/rejected": -154.68971252441406, + "loss": 3.8981, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.030693054199219, + "rewards/margins": -7.7947540283203125, + "rewards/rejected": 11.825447082519531, + "step": 3150 + }, + { + "epoch": 0.51, + "learning_rate": 8.722575691976214e-06, + "logits/chosen": -1.126686930656433, + "logits/rejected": -1.1923836469650269, + "logps/chosen": -55.091346740722656, + "logps/rejected": -129.62730407714844, + "loss": 2.3841, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.072601318359375, + "rewards/margins": -3.074540615081787, + "rewards/rejected": 4.147141933441162, + "step": 3151 + }, + { + "epoch": 0.51, + "learning_rate": 8.721698160116885e-06, + "logits/chosen": -1.3195924758911133, + "logits/rejected": -1.2885313034057617, + "logps/chosen": -89.4277572631836, + "logps/rejected": -68.90167236328125, + "loss": 0.9753, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.184349775314331, + "rewards/margins": -1.115389347076416, + "rewards/rejected": 3.299739122390747, + "step": 3152 + }, + { + "epoch": 0.51, + "learning_rate": 8.720820371123431e-06, + "logits/chosen": -1.154029130935669, + "logits/rejected": -1.16990327835083, + "logps/chosen": -114.33636474609375, + "logps/rejected": -102.32691192626953, + "loss": 0.5948, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2716081142425537, + "rewards/margins": -0.8060767650604248, + "rewards/rejected": 4.0776848793029785, + "step": 3153 + }, + { + "epoch": 0.51, + "learning_rate": 8.719942325056496e-06, + "logits/chosen": -0.6855369210243225, + "logits/rejected": -0.6137038469314575, + "logps/chosen": -69.71321868896484, + "logps/rejected": -52.46582794189453, + "loss": 0.3348, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9242759943008423, + "rewards/margins": 0.09485852718353271, + "rewards/rejected": 1.8294174671173096, + "step": 3154 + }, + { + "epoch": 0.51, + "learning_rate": 8.719064021976742e-06, + "logits/chosen": -0.7127725481987, + "logits/rejected": -0.7143298387527466, + "logps/chosen": -6.421411514282227, + "logps/rejected": -3.35857892036438, + "loss": 0.8236, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3773740828037262, + "rewards/margins": -0.1191815435886383, + "rewards/rejected": 0.4965556263923645, + "step": 3155 + }, + { + "epoch": 0.51, + "learning_rate": 8.718185461944857e-06, + "logits/chosen": -1.1317907571792603, + "logits/rejected": -0.9757561087608337, + "logps/chosen": -163.91493225097656, + "logps/rejected": -42.21548843383789, + "loss": 0.062, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.668727397918701, + "rewards/margins": 4.724267959594727, + "rewards/rejected": 1.9444591999053955, + "step": 3156 + }, + { + "epoch": 0.51, + "learning_rate": 8.717306645021537e-06, + "logits/chosen": -0.774386465549469, + "logits/rejected": -0.7675744295120239, + "logps/chosen": -47.09313201904297, + "logps/rejected": -62.44558334350586, + "loss": 1.6367, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5894947052001953, + "rewards/margins": 0.3588157594203949, + "rewards/rejected": 0.23067894577980042, + "step": 3157 + }, + { + "epoch": 0.51, + "learning_rate": 8.716427571267503e-06, + "logits/chosen": -0.9852405786514282, + "logits/rejected": -1.0516107082366943, + "logps/chosen": -94.1994400024414, + "logps/rejected": -67.39537048339844, + "loss": 1.7785, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9281113147735596, + "rewards/margins": -0.10334157943725586, + "rewards/rejected": 2.0314528942108154, + "step": 3158 + }, + { + "epoch": 0.51, + "learning_rate": 8.715548240743487e-06, + "logits/chosen": -1.0682138204574585, + "logits/rejected": -1.1476722955703735, + "logps/chosen": -66.16049194335938, + "logps/rejected": -135.35252380371094, + "loss": 2.8804, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.364827036857605, + "rewards/margins": -5.097735404968262, + "rewards/rejected": 6.462562561035156, + "step": 3159 + }, + { + "epoch": 0.51, + "learning_rate": 8.714668653510246e-06, + "logits/chosen": -1.131020426750183, + "logits/rejected": -1.1136348247528076, + "logps/chosen": -85.52660369873047, + "logps/rejected": -159.45196533203125, + "loss": 0.4719, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.536444902420044, + "rewards/margins": -0.3957420587539673, + "rewards/rejected": 1.9321869611740112, + "step": 3160 + }, + { + "epoch": 0.51, + "learning_rate": 8.713788809628547e-06, + "logits/chosen": -1.179983139038086, + "logits/rejected": -1.1255112886428833, + "logps/chosen": -113.2125244140625, + "logps/rejected": -72.71775817871094, + "loss": 0.9976, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7058013677597046, + "rewards/margins": -1.8305572271347046, + "rewards/rejected": 3.536358594894409, + "step": 3161 + }, + { + "epoch": 0.51, + "learning_rate": 8.712908709159183e-06, + "logits/chosen": -0.9497867822647095, + "logits/rejected": -0.9542422294616699, + "logps/chosen": -6.950551986694336, + "logps/rejected": -18.876819610595703, + "loss": 0.2908, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14571629464626312, + "rewards/margins": 0.27655813097953796, + "rewards/rejected": -0.13084183633327484, + "step": 3162 + }, + { + "epoch": 0.51, + "learning_rate": 8.712028352162959e-06, + "logits/chosen": -0.831078827381134, + "logits/rejected": -0.8624100685119629, + "logps/chosen": -68.43449401855469, + "logps/rejected": -75.11146545410156, + "loss": 1.345, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.478755235671997, + "rewards/margins": -2.616323709487915, + "rewards/rejected": 5.095078945159912, + "step": 3163 + }, + { + "epoch": 0.51, + "learning_rate": 8.7111477387007e-06, + "logits/chosen": -1.0923758745193481, + "logits/rejected": -1.215389370918274, + "logps/chosen": -216.99920654296875, + "logps/rejected": -82.68203735351562, + "loss": 0.3354, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.443939208984375, + "rewards/margins": 0.09465312957763672, + "rewards/rejected": 5.349286079406738, + "step": 3164 + }, + { + "epoch": 0.51, + "learning_rate": 8.710266868833247e-06, + "logits/chosen": -0.9000536799430847, + "logits/rejected": -1.0634113550186157, + "logps/chosen": -87.67677307128906, + "logps/rejected": -165.16226196289062, + "loss": 1.2415, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.72130286693573, + "rewards/margins": -1.7656935453414917, + "rewards/rejected": 3.4869964122772217, + "step": 3165 + }, + { + "epoch": 0.51, + "learning_rate": 8.70938574262146e-06, + "logits/chosen": -1.1164153814315796, + "logits/rejected": -0.981991171836853, + "logps/chosen": -114.46365356445312, + "logps/rejected": -47.113319396972656, + "loss": 0.1234, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.343405246734619, + "rewards/margins": 1.8994011878967285, + "rewards/rejected": 2.4440040588378906, + "step": 3166 + }, + { + "epoch": 0.51, + "learning_rate": 8.708504360126216e-06, + "logits/chosen": -0.6950599551200867, + "logits/rejected": -0.687717616558075, + "logps/chosen": -21.845619201660156, + "logps/rejected": -20.92281723022461, + "loss": 0.6968, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.03990898281335831, + "rewards/margins": -0.5529924035072327, + "rewards/rejected": 0.5929014086723328, + "step": 3167 + }, + { + "epoch": 0.51, + "learning_rate": 8.707622721408413e-06, + "logits/chosen": -0.8968759179115295, + "logits/rejected": -0.8686927556991577, + "logps/chosen": -74.84896850585938, + "logps/rejected": -64.26947021484375, + "loss": 0.2218, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7402284145355225, + "rewards/margins": 0.8344857692718506, + "rewards/rejected": 1.9057426452636719, + "step": 3168 + }, + { + "epoch": 0.51, + "learning_rate": 8.706740826528962e-06, + "logits/chosen": -1.1722800731658936, + "logits/rejected": -1.0533199310302734, + "logps/chosen": -47.77374267578125, + "logps/rejected": -23.00345802307129, + "loss": 0.2017, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1297158002853394, + "rewards/margins": 1.016002893447876, + "rewards/rejected": 0.11371288448572159, + "step": 3169 + }, + { + "epoch": 0.51, + "learning_rate": 8.70585867554879e-06, + "logits/chosen": -1.042201042175293, + "logits/rejected": -1.0105667114257812, + "logps/chosen": -95.84860229492188, + "logps/rejected": -68.8385238647461, + "loss": 0.6146, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.907850742340088, + "rewards/margins": -0.7306807041168213, + "rewards/rejected": 3.638531446456909, + "step": 3170 + }, + { + "epoch": 0.51, + "learning_rate": 8.704976268528851e-06, + "logits/chosen": -1.2523304224014282, + "logits/rejected": -1.0922831296920776, + "logps/chosen": -49.10523223876953, + "logps/rejected": -67.01654052734375, + "loss": 1.4231, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5195305347442627, + "rewards/margins": -2.382366418838501, + "rewards/rejected": 4.901896953582764, + "step": 3171 + }, + { + "epoch": 0.51, + "learning_rate": 8.704093605530108e-06, + "logits/chosen": -0.8557071089744568, + "logits/rejected": -0.8638603687286377, + "logps/chosen": -9.144514083862305, + "logps/rejected": -3.3941900730133057, + "loss": 0.3831, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4724641740322113, + "rewards/margins": 0.07449579238891602, + "rewards/rejected": 0.3979683816432953, + "step": 3172 + }, + { + "epoch": 0.52, + "learning_rate": 8.703210686613546e-06, + "logits/chosen": -1.1168806552886963, + "logits/rejected": -1.118762493133545, + "logps/chosen": -94.66757202148438, + "logps/rejected": -57.264617919921875, + "loss": 1.4401, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3571929931640625, + "rewards/margins": -0.19942474365234375, + "rewards/rejected": 1.5566177368164062, + "step": 3173 + }, + { + "epoch": 0.52, + "learning_rate": 8.702327511840165e-06, + "logits/chosen": -0.8581221103668213, + "logits/rejected": -0.9505004286766052, + "logps/chosen": -63.7335319519043, + "logps/rejected": -99.16011047363281, + "loss": 2.5242, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.141711950302124, + "rewards/margins": -5.039484977722168, + "rewards/rejected": 7.181196689605713, + "step": 3174 + }, + { + "epoch": 0.52, + "learning_rate": 8.701444081270985e-06, + "logits/chosen": -0.8657733798027039, + "logits/rejected": -0.807539165019989, + "logps/chosen": -41.36150360107422, + "logps/rejected": -41.242225646972656, + "loss": 0.6626, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4308922290802, + "rewards/margins": -0.09817957878112793, + "rewards/rejected": 3.529071807861328, + "step": 3175 + }, + { + "epoch": 0.52, + "learning_rate": 8.700560394967043e-06, + "logits/chosen": -0.4543362557888031, + "logits/rejected": -0.45325517654418945, + "logps/chosen": -2.3556623458862305, + "logps/rejected": -14.838147163391113, + "loss": 0.6864, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.30944767594337463, + "rewards/margins": -0.13487330079078674, + "rewards/rejected": 0.4443209767341614, + "step": 3176 + }, + { + "epoch": 0.52, + "learning_rate": 8.699676452989392e-06, + "logits/chosen": -1.4500341415405273, + "logits/rejected": -1.3143397569656372, + "logps/chosen": -81.00945281982422, + "logps/rejected": -82.61225891113281, + "loss": 0.2915, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.962900638580322, + "rewards/margins": 2.1138038635253906, + "rewards/rejected": 3.8490967750549316, + "step": 3177 + }, + { + "epoch": 0.52, + "learning_rate": 8.698792255399104e-06, + "logits/chosen": -1.330145001411438, + "logits/rejected": -1.1421763896942139, + "logps/chosen": -115.28227996826172, + "logps/rejected": -15.003547668457031, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.512056827545166, + "rewards/margins": 4.365505218505859, + "rewards/rejected": 1.1465517282485962, + "step": 3178 + }, + { + "epoch": 0.52, + "learning_rate": 8.69790780225727e-06, + "logits/chosen": -1.2877371311187744, + "logits/rejected": -1.2837239503860474, + "logps/chosen": -42.56169891357422, + "logps/rejected": -82.38998413085938, + "loss": 0.544, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0515105724334717, + "rewards/margins": -0.6746749877929688, + "rewards/rejected": 2.7261855602264404, + "step": 3179 + }, + { + "epoch": 0.52, + "learning_rate": 8.697023093624999e-06, + "logits/chosen": -0.9504076242446899, + "logits/rejected": -0.953813910484314, + "logps/chosen": -94.452392578125, + "logps/rejected": -57.50051498413086, + "loss": 0.6704, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.186944603919983, + "rewards/margins": -0.20484662055969238, + "rewards/rejected": 1.3917912244796753, + "step": 3180 + }, + { + "epoch": 0.52, + "learning_rate": 8.696138129563412e-06, + "logits/chosen": -1.3714661598205566, + "logits/rejected": -1.2383900880813599, + "logps/chosen": -111.14605712890625, + "logps/rejected": -60.17350387573242, + "loss": 0.5391, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.253881931304932, + "rewards/margins": 4.065863609313965, + "rewards/rejected": 2.188018560409546, + "step": 3181 + }, + { + "epoch": 0.52, + "learning_rate": 8.695252910133653e-06, + "logits/chosen": -1.0743799209594727, + "logits/rejected": -1.1041537523269653, + "logps/chosen": -217.15869140625, + "logps/rejected": -91.69480895996094, + "loss": 0.8169, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.333004951477051, + "rewards/margins": -1.3427734375, + "rewards/rejected": 6.675778388977051, + "step": 3182 + }, + { + "epoch": 0.52, + "learning_rate": 8.694367435396882e-06, + "logits/chosen": -1.2093675136566162, + "logits/rejected": -1.2186005115509033, + "logps/chosen": -34.62775802612305, + "logps/rejected": -107.58073425292969, + "loss": 0.1788, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4666225910186768, + "rewards/margins": 0.8570644855499268, + "rewards/rejected": 2.60955810546875, + "step": 3183 + }, + { + "epoch": 0.52, + "learning_rate": 8.69348170541428e-06, + "logits/chosen": -1.0863046646118164, + "logits/rejected": -0.9432185292243958, + "logps/chosen": -132.25485229492188, + "logps/rejected": -115.89189147949219, + "loss": 1.1309, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.0766801834106445, + "rewards/margins": -1.7579617500305176, + "rewards/rejected": 6.834641933441162, + "step": 3184 + }, + { + "epoch": 0.52, + "learning_rate": 8.69259572024704e-06, + "logits/chosen": -0.6526725888252258, + "logits/rejected": -0.6526725888252258, + "logps/chosen": -1.9386701583862305, + "logps/rejected": -1.9386701583862305, + "loss": 0.6433, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5135096907615662, + "rewards/margins": 0.0, + "rewards/rejected": 0.5135096907615662, + "step": 3185 + }, + { + "epoch": 0.52, + "learning_rate": 8.691709479956373e-06, + "logits/chosen": -0.9904046058654785, + "logits/rejected": -1.032102108001709, + "logps/chosen": -50.83546447753906, + "logps/rejected": -58.24166488647461, + "loss": 0.4103, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5252349376678467, + "rewards/margins": 0.5977351665496826, + "rewards/rejected": 1.927499771118164, + "step": 3186 + }, + { + "epoch": 0.52, + "learning_rate": 8.690822984603514e-06, + "logits/chosen": -0.8442906737327576, + "logits/rejected": -0.8415609002113342, + "logps/chosen": -57.3540153503418, + "logps/rejected": -94.43331909179688, + "loss": 0.8584, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6827625036239624, + "rewards/margins": -1.0774518251419067, + "rewards/rejected": 2.760214328765869, + "step": 3187 + }, + { + "epoch": 0.52, + "learning_rate": 8.689936234249709e-06, + "logits/chosen": -1.169517993927002, + "logits/rejected": -1.0610971450805664, + "logps/chosen": -63.019447326660156, + "logps/rejected": -18.510705947875977, + "loss": 0.3291, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1905479431152344, + "rewards/margins": 1.990064263343811, + "rewards/rejected": 0.20048370957374573, + "step": 3188 + }, + { + "epoch": 0.52, + "learning_rate": 8.689049228956224e-06, + "logits/chosen": -1.0473586320877075, + "logits/rejected": -0.9785053730010986, + "logps/chosen": -79.72855377197266, + "logps/rejected": -47.38597869873047, + "loss": 0.0608, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.359484910964966, + "rewards/margins": 2.473052978515625, + "rewards/rejected": 0.886431872844696, + "step": 3189 + }, + { + "epoch": 0.52, + "learning_rate": 8.688161968784346e-06, + "logits/chosen": -0.795356273651123, + "logits/rejected": -0.7820473909378052, + "logps/chosen": -43.88263702392578, + "logps/rejected": -87.89936828613281, + "loss": 0.6896, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0851571559906006, + "rewards/margins": -0.4259796142578125, + "rewards/rejected": 2.511136770248413, + "step": 3190 + }, + { + "epoch": 0.52, + "learning_rate": 8.687274453795372e-06, + "logits/chosen": -0.9938024282455444, + "logits/rejected": -1.0031497478485107, + "logps/chosen": -60.35858154296875, + "logps/rejected": -71.1849136352539, + "loss": 0.5433, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.734614610671997, + "rewards/margins": -0.6363022327423096, + "rewards/rejected": 2.3709168434143066, + "step": 3191 + }, + { + "epoch": 0.52, + "learning_rate": 8.68638668405062e-06, + "logits/chosen": -1.0944006443023682, + "logits/rejected": -0.9997096061706543, + "logps/chosen": -89.87403869628906, + "logps/rejected": -65.76342010498047, + "loss": 1.345, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.323713779449463, + "rewards/margins": 2.1868393421173096, + "rewards/rejected": 2.1368744373321533, + "step": 3192 + }, + { + "epoch": 0.52, + "learning_rate": 8.685498659611434e-06, + "logits/chosen": -0.9299030303955078, + "logits/rejected": -1.14878249168396, + "logps/chosen": -29.784400939941406, + "logps/rejected": -33.313941955566406, + "loss": 1.8371, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9121894836425781, + "rewards/margins": -1.2133514881134033, + "rewards/rejected": 3.1255409717559814, + "step": 3193 + }, + { + "epoch": 0.52, + "learning_rate": 8.68461038053916e-06, + "logits/chosen": -1.4246857166290283, + "logits/rejected": -1.277435064315796, + "logps/chosen": -115.93253326416016, + "logps/rejected": -89.58619689941406, + "loss": 0.3341, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.699344635009766, + "rewards/margins": 4.974009037017822, + "rewards/rejected": 4.725335597991943, + "step": 3194 + }, + { + "epoch": 0.52, + "learning_rate": 8.683721846895173e-06, + "logits/chosen": -1.0041580200195312, + "logits/rejected": -0.8670055270195007, + "logps/chosen": -80.93363952636719, + "logps/rejected": -84.01721954345703, + "loss": 0.0627, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.4136199951171875, + "rewards/margins": 2.5576744079589844, + "rewards/rejected": 4.855945587158203, + "step": 3195 + }, + { + "epoch": 0.52, + "learning_rate": 8.682833058740862e-06, + "logits/chosen": -0.9635661244392395, + "logits/rejected": -0.9691869020462036, + "logps/chosen": -75.41029357910156, + "logps/rejected": -120.85248565673828, + "loss": 0.714, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.487147569656372, + "rewards/margins": 1.059241533279419, + "rewards/rejected": 0.4279060363769531, + "step": 3196 + }, + { + "epoch": 0.52, + "learning_rate": 8.681944016137635e-06, + "logits/chosen": -1.0327048301696777, + "logits/rejected": -0.8261178731918335, + "logps/chosen": -89.81976318359375, + "logps/rejected": -33.55957794189453, + "loss": 0.0181, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.514036655426025, + "rewards/margins": 3.608517646789551, + "rewards/rejected": 0.9055191278457642, + "step": 3197 + }, + { + "epoch": 0.52, + "learning_rate": 8.681054719146915e-06, + "logits/chosen": -0.8887273669242859, + "logits/rejected": -0.9100773334503174, + "logps/chosen": -36.20624542236328, + "logps/rejected": -49.287925720214844, + "loss": 0.8277, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7788101434707642, + "rewards/margins": 0.5933517217636108, + "rewards/rejected": 1.1854584217071533, + "step": 3198 + }, + { + "epoch": 0.52, + "learning_rate": 8.680165167830144e-06, + "logits/chosen": -0.48823490738868713, + "logits/rejected": -0.4897744059562683, + "logps/chosen": -3.601094961166382, + "logps/rejected": -1.1950753927230835, + "loss": 0.4111, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.016274763271212578, + "rewards/margins": -0.17319732904434204, + "rewards/rejected": 0.15692256391048431, + "step": 3199 + }, + { + "epoch": 0.52, + "learning_rate": 8.679275362248783e-06, + "logits/chosen": -1.4475905895233154, + "logits/rejected": -1.3479152917861938, + "logps/chosen": -69.63166809082031, + "logps/rejected": -36.54401779174805, + "loss": 1.4371, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.901484727859497, + "rewards/margins": 1.4001431465148926, + "rewards/rejected": 0.5013416409492493, + "step": 3200 + }, + { + "epoch": 0.52, + "learning_rate": 8.678385302464307e-06, + "logits/chosen": -0.6993629336357117, + "logits/rejected": -0.6364297270774841, + "logps/chosen": -50.801334381103516, + "logps/rejected": -18.226852416992188, + "loss": 0.2597, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0052769184112549, + "rewards/margins": 0.38577407598495483, + "rewards/rejected": 0.6195028424263, + "step": 3201 + }, + { + "epoch": 0.52, + "learning_rate": 8.67749498853821e-06, + "logits/chosen": -1.2818312644958496, + "logits/rejected": -1.2990976572036743, + "logps/chosen": -75.26142120361328, + "logps/rejected": -102.58173370361328, + "loss": 0.9843, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8063499331474304, + "rewards/margins": -1.2325592041015625, + "rewards/rejected": 2.0389091968536377, + "step": 3202 + }, + { + "epoch": 0.52, + "learning_rate": 8.676604420532009e-06, + "logits/chosen": -1.1493910551071167, + "logits/rejected": -1.1396503448486328, + "logps/chosen": -61.7958984375, + "logps/rejected": -46.547149658203125, + "loss": 1.0083, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.950671374797821, + "rewards/margins": -0.9711975455284119, + "rewards/rejected": 1.921868920326233, + "step": 3203 + }, + { + "epoch": 0.52, + "learning_rate": 8.675713598507232e-06, + "logits/chosen": -1.3791987895965576, + "logits/rejected": -1.391210675239563, + "logps/chosen": -154.61996459960938, + "logps/rejected": -45.09669494628906, + "loss": 0.6587, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1481231451034546, + "rewards/margins": 0.14297521114349365, + "rewards/rejected": 1.005147933959961, + "step": 3204 + }, + { + "epoch": 0.52, + "learning_rate": 8.674822522525422e-06, + "logits/chosen": -1.1457306146621704, + "logits/rejected": -1.091546893119812, + "logps/chosen": -41.066497802734375, + "logps/rejected": -81.28402709960938, + "loss": 1.0377, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5500800609588623, + "rewards/margins": -1.0933313369750977, + "rewards/rejected": 3.64341139793396, + "step": 3205 + }, + { + "epoch": 0.52, + "learning_rate": 8.673931192648148e-06, + "logits/chosen": -0.7192034721374512, + "logits/rejected": -0.7722673416137695, + "logps/chosen": -50.008872985839844, + "logps/rejected": -113.58722686767578, + "loss": 0.2513, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.199362277984619, + "rewards/margins": 1.5171173810958862, + "rewards/rejected": 0.6822448968887329, + "step": 3206 + }, + { + "epoch": 0.52, + "learning_rate": 8.673039608936993e-06, + "logits/chosen": -1.0621415376663208, + "logits/rejected": -1.0242066383361816, + "logps/chosen": -60.79335403442383, + "logps/rejected": -131.72909545898438, + "loss": 1.2297, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8481578826904297, + "rewards/margins": 0.4557567834854126, + "rewards/rejected": 1.392401099205017, + "step": 3207 + }, + { + "epoch": 0.52, + "learning_rate": 8.672147771453554e-06, + "logits/chosen": -0.8874786496162415, + "logits/rejected": -0.9130334854125977, + "logps/chosen": -96.28751373291016, + "logps/rejected": -87.39645385742188, + "loss": 0.7848, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.51452112197876, + "rewards/margins": 1.3757209777832031, + "rewards/rejected": 3.1388001441955566, + "step": 3208 + }, + { + "epoch": 0.52, + "learning_rate": 8.671255680259451e-06, + "logits/chosen": -0.858992338180542, + "logits/rejected": -0.744263231754303, + "logps/chosen": -31.587488174438477, + "logps/rejected": -40.147850036621094, + "loss": 0.2573, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.538813591003418, + "rewards/margins": 0.4206991195678711, + "rewards/rejected": 1.1181144714355469, + "step": 3209 + }, + { + "epoch": 0.52, + "learning_rate": 8.670363335416319e-06, + "logits/chosen": -1.1304291486740112, + "logits/rejected": -1.0112264156341553, + "logps/chosen": -75.21443176269531, + "logps/rejected": -35.122039794921875, + "loss": 0.1616, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.595240831375122, + "rewards/margins": 1.704681396484375, + "rewards/rejected": 0.8905593752861023, + "step": 3210 + }, + { + "epoch": 0.52, + "learning_rate": 8.669470736985809e-06, + "logits/chosen": -1.2559643983840942, + "logits/rejected": -1.278618335723877, + "logps/chosen": -100.92681884765625, + "logps/rejected": -69.3691635131836, + "loss": 0.4106, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.463294267654419, + "rewards/margins": 0.7218300104141235, + "rewards/rejected": 1.7414642572402954, + "step": 3211 + }, + { + "epoch": 0.52, + "learning_rate": 8.668577885029592e-06, + "logits/chosen": -0.8497194647789001, + "logits/rejected": -0.7159144282341003, + "logps/chosen": -74.3167495727539, + "logps/rejected": -34.589534759521484, + "loss": 0.076, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.836249589920044, + "rewards/margins": 2.010768175125122, + "rewards/rejected": 1.8254814147949219, + "step": 3212 + }, + { + "epoch": 0.52, + "learning_rate": 8.667684779609356e-06, + "logits/chosen": -1.0239460468292236, + "logits/rejected": -1.0076948404312134, + "logps/chosen": -170.39096069335938, + "logps/rejected": -41.40229415893555, + "loss": 1.6128, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.2243499755859375, + "rewards/margins": 2.7356770038604736, + "rewards/rejected": 2.488672971725464, + "step": 3213 + }, + { + "epoch": 0.52, + "learning_rate": 8.666791420786805e-06, + "logits/chosen": -1.1810736656188965, + "logits/rejected": -1.1706715822219849, + "logps/chosen": -152.76168823242188, + "logps/rejected": -58.18183898925781, + "loss": 0.3469, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9241960048675537, + "rewards/margins": 0.48450183868408203, + "rewards/rejected": 3.4396941661834717, + "step": 3214 + }, + { + "epoch": 0.52, + "learning_rate": 8.665897808623662e-06, + "logits/chosen": -0.8647972345352173, + "logits/rejected": -0.7734511494636536, + "logps/chosen": -79.0694580078125, + "logps/rejected": -31.1265869140625, + "loss": 0.2339, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.538231611251831, + "rewards/margins": 1.4508178234100342, + "rewards/rejected": 1.0874137878417969, + "step": 3215 + }, + { + "epoch": 0.52, + "learning_rate": 8.665003943181669e-06, + "logits/chosen": -0.7954455018043518, + "logits/rejected": -0.6877449750900269, + "logps/chosen": -101.44322204589844, + "logps/rejected": -59.118770599365234, + "loss": 0.3181, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7858963012695312, + "rewards/margins": 0.7924037575721741, + "rewards/rejected": 0.9934925436973572, + "step": 3216 + }, + { + "epoch": 0.52, + "learning_rate": 8.664109824522581e-06, + "logits/chosen": -0.6655705571174622, + "logits/rejected": -0.6655705571174622, + "logps/chosen": -32.05577850341797, + "logps/rejected": -32.05577850341797, + "loss": 1.0337, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7079025506973267, + "rewards/margins": 0.0, + "rewards/rejected": 0.7079025506973267, + "step": 3217 + }, + { + "epoch": 0.52, + "learning_rate": 8.663215452708173e-06, + "logits/chosen": -0.9588837623596191, + "logits/rejected": -0.9196271300315857, + "logps/chosen": -107.30255126953125, + "logps/rejected": -73.5377426147461, + "loss": 0.4004, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.303849935531616, + "rewards/margins": -0.15303421020507812, + "rewards/rejected": 2.4568841457366943, + "step": 3218 + }, + { + "epoch": 0.52, + "learning_rate": 8.66232082780024e-06, + "logits/chosen": -1.2706184387207031, + "logits/rejected": -1.2534260749816895, + "logps/chosen": -65.01589965820312, + "logps/rejected": -58.82050323486328, + "loss": 0.2946, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0607857704162598, + "rewards/margins": 0.40612494945526123, + "rewards/rejected": 1.6546608209609985, + "step": 3219 + }, + { + "epoch": 0.52, + "learning_rate": 8.661425949860592e-06, + "logits/chosen": -0.906396210193634, + "logits/rejected": -0.9761794209480286, + "logps/chosen": -61.13207244873047, + "logps/rejected": -128.94369506835938, + "loss": 0.6493, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5151885747909546, + "rewards/margins": -0.17773902416229248, + "rewards/rejected": 1.692927598953247, + "step": 3220 + }, + { + "epoch": 0.52, + "learning_rate": 8.660530818951055e-06, + "logits/chosen": -0.46782976388931274, + "logits/rejected": -0.494839608669281, + "logps/chosen": -3.8426175117492676, + "logps/rejected": -72.12841033935547, + "loss": 0.7736, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5690073370933533, + "rewards/margins": -0.039969444274902344, + "rewards/rejected": 0.6089767813682556, + "step": 3221 + }, + { + "epoch": 0.52, + "learning_rate": 8.659635435133476e-06, + "logits/chosen": -0.9614307284355164, + "logits/rejected": -0.9926373958587646, + "logps/chosen": -16.663576126098633, + "logps/rejected": -53.300559997558594, + "loss": 0.5348, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9748380780220032, + "rewards/margins": -0.5163301825523376, + "rewards/rejected": 1.4911682605743408, + "step": 3222 + }, + { + "epoch": 0.52, + "learning_rate": 8.658739798469713e-06, + "logits/chosen": -1.063845157623291, + "logits/rejected": -1.0212757587432861, + "logps/chosen": -90.0905532836914, + "logps/rejected": -56.6981201171875, + "loss": 0.8734, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9202325344085693, + "rewards/margins": 1.6784554719924927, + "rewards/rejected": 1.2417770624160767, + "step": 3223 + }, + { + "epoch": 0.52, + "learning_rate": 8.657843909021652e-06, + "logits/chosen": -1.0998313426971436, + "logits/rejected": -1.0381066799163818, + "logps/chosen": -204.22927856445312, + "logps/rejected": -97.96617126464844, + "loss": 0.4478, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.927585124969482, + "rewards/margins": 2.3389804363250732, + "rewards/rejected": 2.588604688644409, + "step": 3224 + }, + { + "epoch": 0.52, + "learning_rate": 8.656947766851188e-06, + "logits/chosen": -1.0114091634750366, + "logits/rejected": -1.0286545753479004, + "logps/chosen": -83.26187133789062, + "logps/rejected": -108.8295669555664, + "loss": 1.0607, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8062690496444702, + "rewards/margins": 0.5142737627029419, + "rewards/rejected": 1.2919952869415283, + "step": 3225 + }, + { + "epoch": 0.52, + "learning_rate": 8.656051372020232e-06, + "logits/chosen": -0.6340943574905396, + "logits/rejected": -0.6615628004074097, + "logps/chosen": -17.095542907714844, + "logps/rejected": -33.20759582519531, + "loss": 0.7791, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.49615898728370667, + "rewards/margins": -0.26792070269584656, + "rewards/rejected": 0.7640796899795532, + "step": 3226 + }, + { + "epoch": 0.52, + "learning_rate": 8.655154724590724e-06, + "logits/chosen": -1.098319411277771, + "logits/rejected": -1.1147316694259644, + "logps/chosen": -91.68133544921875, + "logps/rejected": -61.227699279785156, + "loss": 0.4802, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4573326110839844, + "rewards/margins": -0.40645527839660645, + "rewards/rejected": 1.8637878894805908, + "step": 3227 + }, + { + "epoch": 0.52, + "learning_rate": 8.654257824624608e-06, + "logits/chosen": -1.1160181760787964, + "logits/rejected": -0.9917163252830505, + "logps/chosen": -99.78912353515625, + "logps/rejected": -35.02939224243164, + "loss": 0.0556, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4568703174591064, + "rewards/margins": 2.149651050567627, + "rewards/rejected": 0.30721932649612427, + "step": 3228 + }, + { + "epoch": 0.52, + "learning_rate": 8.653360672183852e-06, + "logits/chosen": -1.2078115940093994, + "logits/rejected": -1.119369387626648, + "logps/chosen": -102.14044952392578, + "logps/rejected": -37.20997619628906, + "loss": 0.4556, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3022866249084473, + "rewards/margins": 2.6767830848693848, + "rewards/rejected": 0.6255035400390625, + "step": 3229 + }, + { + "epoch": 0.52, + "learning_rate": 8.652463267330445e-06, + "logits/chosen": -0.8383998870849609, + "logits/rejected": -0.8136780858039856, + "logps/chosen": -29.66179656982422, + "logps/rejected": -27.22997283935547, + "loss": 0.7127, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2635253965854645, + "rewards/margins": 0.15607032179832458, + "rewards/rejected": 0.1074550673365593, + "step": 3230 + }, + { + "epoch": 0.52, + "learning_rate": 8.651565610126385e-06, + "logits/chosen": -1.2685343027114868, + "logits/rejected": -1.1880873441696167, + "logps/chosen": -206.8112335205078, + "logps/rejected": -73.58352661132812, + "loss": 0.0803, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.71969747543335, + "rewards/margins": 2.4747138023376465, + "rewards/rejected": 4.244983673095703, + "step": 3231 + }, + { + "epoch": 0.52, + "learning_rate": 8.650667700633692e-06, + "logits/chosen": -1.0825929641723633, + "logits/rejected": -0.9152448177337646, + "logps/chosen": -114.15836334228516, + "logps/rejected": -51.35772705078125, + "loss": 0.4457, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.101953886449337, + "rewards/margins": 0.07457809150218964, + "rewards/rejected": 0.02737579308450222, + "step": 3232 + }, + { + "epoch": 0.52, + "learning_rate": 8.649769538914406e-06, + "logits/chosen": -0.8814654350280762, + "logits/rejected": -0.8717344403266907, + "logps/chosen": -2.6074130535125732, + "logps/rejected": -9.635481834411621, + "loss": 0.3574, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.13094046711921692, + "rewards/margins": -0.012761563062667847, + "rewards/rejected": 0.14370203018188477, + "step": 3233 + }, + { + "epoch": 0.52, + "learning_rate": 8.648871125030576e-06, + "logits/chosen": -0.9629667401313782, + "logits/rejected": -0.9747430682182312, + "logps/chosen": -122.47755432128906, + "logps/rejected": -87.40222930908203, + "loss": 0.9295, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7265671491622925, + "rewards/margins": -0.6088272333145142, + "rewards/rejected": 2.3353943824768066, + "step": 3234 + }, + { + "epoch": 0.53, + "learning_rate": 8.647972459044279e-06, + "logits/chosen": -1.0264966487884521, + "logits/rejected": -0.9665829539299011, + "logps/chosen": -102.32400512695312, + "logps/rejected": -73.62605285644531, + "loss": 0.8376, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8738738894462585, + "rewards/margins": -1.4647583961486816, + "rewards/rejected": 2.338632345199585, + "step": 3235 + }, + { + "epoch": 0.53, + "learning_rate": 8.647073541017602e-06, + "logits/chosen": -0.8222516179084778, + "logits/rejected": -0.9361518025398254, + "logps/chosen": -42.9672966003418, + "logps/rejected": -57.30830383300781, + "loss": 1.3852, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.52661395072937, + "rewards/margins": -2.184824228286743, + "rewards/rejected": 4.711438179016113, + "step": 3236 + }, + { + "epoch": 0.53, + "learning_rate": 8.646174371012653e-06, + "logits/chosen": -1.3748772144317627, + "logits/rejected": -1.144333004951477, + "logps/chosen": -88.81265258789062, + "logps/rejected": -17.49755096435547, + "loss": 0.0146, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.397714138031006, + "rewards/margins": 5.75463342666626, + "rewards/rejected": 0.6430807113647461, + "step": 3237 + }, + { + "epoch": 0.53, + "learning_rate": 8.645274949091556e-06, + "logits/chosen": -0.9325742125511169, + "logits/rejected": -0.7015034556388855, + "logps/chosen": -95.80264282226562, + "logps/rejected": -56.58203887939453, + "loss": 0.0412, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.819305419921875, + "rewards/margins": 4.0050835609436035, + "rewards/rejected": 1.814221978187561, + "step": 3238 + }, + { + "epoch": 0.53, + "learning_rate": 8.64437527531645e-06, + "logits/chosen": -1.0125442743301392, + "logits/rejected": -0.9626800417900085, + "logps/chosen": -94.89226531982422, + "logps/rejected": -53.91122817993164, + "loss": 0.9053, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.3550543785095215, + "rewards/margins": 4.8886942863464355, + "rewards/rejected": 2.466360092163086, + "step": 3239 + }, + { + "epoch": 0.53, + "learning_rate": 8.643475349749497e-06, + "logits/chosen": -1.082528829574585, + "logits/rejected": -1.154421091079712, + "logps/chosen": -118.93225860595703, + "logps/rejected": -93.98554229736328, + "loss": 0.6915, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3530235290527344, + "rewards/margins": -0.854102373123169, + "rewards/rejected": 3.2071259021759033, + "step": 3240 + }, + { + "epoch": 0.53, + "learning_rate": 8.642575172452871e-06, + "logits/chosen": -0.7583034634590149, + "logits/rejected": -0.7982924580574036, + "logps/chosen": -55.26115417480469, + "logps/rejected": -74.20909118652344, + "loss": 0.5062, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4919068813323975, + "rewards/margins": 1.439682126045227, + "rewards/rejected": 1.0522247552871704, + "step": 3241 + }, + { + "epoch": 0.53, + "learning_rate": 8.64167474348877e-06, + "logits/chosen": -1.1806358098983765, + "logits/rejected": -1.0746277570724487, + "logps/chosen": -47.99725341796875, + "logps/rejected": -23.81061553955078, + "loss": 1.5065, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.971165418624878, + "rewards/margins": 2.1943700313568115, + "rewards/rejected": 0.7767953872680664, + "step": 3242 + }, + { + "epoch": 0.53, + "learning_rate": 8.640774062919399e-06, + "logits/chosen": -1.1961607933044434, + "logits/rejected": -1.0769479274749756, + "logps/chosen": -53.939178466796875, + "logps/rejected": -24.022174835205078, + "loss": 0.3837, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0115678310394287, + "rewards/margins": 3.0843231678009033, + "rewards/rejected": -0.07275523990392685, + "step": 3243 + }, + { + "epoch": 0.53, + "learning_rate": 8.639873130806991e-06, + "logits/chosen": -0.9809715747833252, + "logits/rejected": -0.8996875286102295, + "logps/chosen": -104.92040252685547, + "logps/rejected": -58.42873764038086, + "loss": 0.6146, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.920917510986328, + "rewards/margins": 0.41280484199523926, + "rewards/rejected": 2.508112668991089, + "step": 3244 + }, + { + "epoch": 0.53, + "learning_rate": 8.638971947213791e-06, + "logits/chosen": -1.253318428993225, + "logits/rejected": -1.081884741783142, + "logps/chosen": -161.2872772216797, + "logps/rejected": -85.52073669433594, + "loss": 0.0749, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.541255474090576, + "rewards/margins": 2.0886895656585693, + "rewards/rejected": 3.452565908432007, + "step": 3245 + }, + { + "epoch": 0.53, + "learning_rate": 8.638070512202059e-06, + "logits/chosen": -1.2080953121185303, + "logits/rejected": -1.1699312925338745, + "logps/chosen": -53.295406341552734, + "logps/rejected": -52.84062957763672, + "loss": 1.6895, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7945034503936768, + "rewards/margins": -0.6086819171905518, + "rewards/rejected": 2.4031853675842285, + "step": 3246 + }, + { + "epoch": 0.53, + "learning_rate": 8.637168825834081e-06, + "logits/chosen": -1.2300302982330322, + "logits/rejected": -1.1794342994689941, + "logps/chosen": -65.2564697265625, + "logps/rejected": -210.9666290283203, + "loss": 3.2818, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.8272156715393066, + "rewards/margins": -6.412062168121338, + "rewards/rejected": 10.239277839660645, + "step": 3247 + }, + { + "epoch": 0.53, + "learning_rate": 8.636266888172151e-06, + "logits/chosen": -0.9762563705444336, + "logits/rejected": -0.8216996788978577, + "logps/chosen": -72.69453430175781, + "logps/rejected": -29.780075073242188, + "loss": 0.0935, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8991378545761108, + "rewards/margins": 2.3617501258850098, + "rewards/rejected": -0.4626123607158661, + "step": 3248 + }, + { + "epoch": 0.53, + "learning_rate": 8.635364699278587e-06, + "logits/chosen": -1.735556721687317, + "logits/rejected": -1.7036620378494263, + "logps/chosen": -121.20816040039062, + "logps/rejected": -131.99253845214844, + "loss": 0.4021, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.392312526702881, + "rewards/margins": 1.1169309616088867, + "rewards/rejected": 5.275381565093994, + "step": 3249 + }, + { + "epoch": 0.53, + "learning_rate": 8.634462259215719e-06, + "logits/chosen": -0.9521010518074036, + "logits/rejected": -0.7322450876235962, + "logps/chosen": -143.2721710205078, + "logps/rejected": -16.075679779052734, + "loss": 0.8895, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6840744018554688, + "rewards/margins": 1.4649183750152588, + "rewards/rejected": 0.21915607154369354, + "step": 3250 + }, + { + "epoch": 0.53, + "learning_rate": 8.6335595680459e-06, + "logits/chosen": -0.8405278921127319, + "logits/rejected": -0.8392329812049866, + "logps/chosen": -95.42726135253906, + "logps/rejected": -100.25137329101562, + "loss": 0.1738, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7969703674316406, + "rewards/margins": 1.0726509094238281, + "rewards/rejected": 0.7243194580078125, + "step": 3251 + }, + { + "epoch": 0.53, + "learning_rate": 8.632656625831495e-06, + "logits/chosen": -1.1709823608398438, + "logits/rejected": -1.088392734527588, + "logps/chosen": -50.67562484741211, + "logps/rejected": -57.17911911010742, + "loss": 0.2375, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0162456035614014, + "rewards/margins": 0.6080634593963623, + "rewards/rejected": 2.408182144165039, + "step": 3252 + }, + { + "epoch": 0.53, + "learning_rate": 8.631753432634889e-06, + "logits/chosen": -0.9663090109825134, + "logits/rejected": -1.0518300533294678, + "logps/chosen": -75.64080047607422, + "logps/rejected": -94.23567199707031, + "loss": 2.4843, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7873246669769287, + "rewards/margins": -3.706164598464966, + "rewards/rejected": 6.4934892654418945, + "step": 3253 + }, + { + "epoch": 0.53, + "learning_rate": 8.630849988518486e-06, + "logits/chosen": -1.0294440984725952, + "logits/rejected": -1.023017168045044, + "logps/chosen": -101.9702377319336, + "logps/rejected": -152.5607452392578, + "loss": 0.2197, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6239982843399048, + "rewards/margins": 0.679822564125061, + "rewards/rejected": 0.9441757202148438, + "step": 3254 + }, + { + "epoch": 0.53, + "learning_rate": 8.629946293544703e-06, + "logits/chosen": -0.8878222107887268, + "logits/rejected": -0.783315122127533, + "logps/chosen": -52.87125015258789, + "logps/rejected": -54.68330383300781, + "loss": 0.2014, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1541210412979126, + "rewards/margins": 1.1540111303329468, + "rewards/rejected": 0.00010986328561557457, + "step": 3255 + }, + { + "epoch": 0.53, + "learning_rate": 8.62904234777598e-06, + "logits/chosen": -0.7627984881401062, + "logits/rejected": -0.7658034563064575, + "logps/chosen": -117.83289337158203, + "logps/rejected": -64.76326751708984, + "loss": 1.4131, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8363273739814758, + "rewards/margins": -2.65157413482666, + "rewards/rejected": 3.487901449203491, + "step": 3256 + }, + { + "epoch": 0.53, + "learning_rate": 8.628138151274767e-06, + "logits/chosen": -1.418626308441162, + "logits/rejected": -1.523270606994629, + "logps/chosen": -219.99923706054688, + "logps/rejected": -145.91744995117188, + "loss": 2.0406, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.444095134735107, + "rewards/margins": -4.063671588897705, + "rewards/rejected": 11.507766723632812, + "step": 3257 + }, + { + "epoch": 0.53, + "learning_rate": 8.627233704103538e-06, + "logits/chosen": -1.3070526123046875, + "logits/rejected": -1.2109335660934448, + "logps/chosen": -71.53594207763672, + "logps/rejected": -31.461088180541992, + "loss": 0.7847, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7012627124786377, + "rewards/margins": 1.5897024869918823, + "rewards/rejected": 0.11156024783849716, + "step": 3258 + }, + { + "epoch": 0.53, + "learning_rate": 8.626329006324782e-06, + "logits/chosen": -1.5111123323440552, + "logits/rejected": -1.613849401473999, + "logps/chosen": -214.14036560058594, + "logps/rejected": -24.733203887939453, + "loss": 0.3127, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6245224475860596, + "rewards/margins": 3.2800846099853516, + "rewards/rejected": 0.3444378077983856, + "step": 3259 + }, + { + "epoch": 0.53, + "learning_rate": 8.625424058001004e-06, + "logits/chosen": -1.3146421909332275, + "logits/rejected": -1.4020322561264038, + "logps/chosen": -118.37857055664062, + "logps/rejected": -68.31227111816406, + "loss": 1.1069, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.908346652984619, + "rewards/margins": -1.9920806884765625, + "rewards/rejected": 6.900427341461182, + "step": 3260 + }, + { + "epoch": 0.53, + "learning_rate": 8.624518859194727e-06, + "logits/chosen": -0.9231230020523071, + "logits/rejected": -0.9231230020523071, + "logps/chosen": -20.992111206054688, + "logps/rejected": -20.992111206054688, + "loss": 0.7859, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.280752658843994, + "rewards/margins": 0.0, + "rewards/rejected": 2.280752658843994, + "step": 3261 + }, + { + "epoch": 0.53, + "learning_rate": 8.623613409968492e-06, + "logits/chosen": -0.7423629760742188, + "logits/rejected": -0.7423629760742188, + "logps/chosen": -56.77678298950195, + "logps/rejected": -56.77678298950195, + "loss": 2.2293, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.116907835006714, + "rewards/margins": 0.0, + "rewards/rejected": 2.116907835006714, + "step": 3262 + }, + { + "epoch": 0.53, + "learning_rate": 8.62270771038486e-06, + "logits/chosen": -1.2997227907180786, + "logits/rejected": -1.4821611642837524, + "logps/chosen": -84.20559692382812, + "logps/rejected": -35.73859786987305, + "loss": 0.0434, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.923659563064575, + "rewards/margins": 2.6230008602142334, + "rewards/rejected": 0.30065879225730896, + "step": 3263 + }, + { + "epoch": 0.53, + "learning_rate": 8.621801760506401e-06, + "logits/chosen": -0.6762521862983704, + "logits/rejected": -0.6673162579536438, + "logps/chosen": -1.8858033418655396, + "logps/rejected": -9.635726928710938, + "loss": 0.3799, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33725205063819885, + "rewards/margins": 0.2332022786140442, + "rewards/rejected": 0.10404977947473526, + "step": 3264 + }, + { + "epoch": 0.53, + "learning_rate": 8.62089556039571e-06, + "logits/chosen": -1.196823239326477, + "logits/rejected": -1.2123337984085083, + "logps/chosen": -81.4337387084961, + "logps/rejected": -110.39418029785156, + "loss": 0.4867, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7520546317100525, + "rewards/margins": -0.4296554923057556, + "rewards/rejected": 1.181710124015808, + "step": 3265 + }, + { + "epoch": 0.53, + "learning_rate": 8.619989110115398e-06, + "logits/chosen": -0.8725991249084473, + "logits/rejected": -0.8626412749290466, + "logps/chosen": -86.45905303955078, + "logps/rejected": -57.56909942626953, + "loss": 0.3168, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4130637645721436, + "rewards/margins": 0.26444244384765625, + "rewards/rejected": 2.1486213207244873, + "step": 3266 + }, + { + "epoch": 0.53, + "learning_rate": 8.619082409728093e-06, + "logits/chosen": -0.8827927112579346, + "logits/rejected": -0.8697187900543213, + "logps/chosen": -7.689728260040283, + "logps/rejected": -4.586639404296875, + "loss": 0.3853, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.59843909740448, + "rewards/margins": 0.3709934949874878, + "rewards/rejected": 1.2274456024169922, + "step": 3267 + }, + { + "epoch": 0.53, + "learning_rate": 8.618175459296434e-06, + "logits/chosen": -0.8736600875854492, + "logits/rejected": -1.1196503639221191, + "logps/chosen": -95.02084350585938, + "logps/rejected": -102.33798217773438, + "loss": 3.3964, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6062278747558594, + "rewards/margins": -5.383408546447754, + "rewards/rejected": 7.989636421203613, + "step": 3268 + }, + { + "epoch": 0.53, + "learning_rate": 8.61726825888309e-06, + "logits/chosen": -1.0953073501586914, + "logits/rejected": -0.9996435642242432, + "logps/chosen": -85.56884002685547, + "logps/rejected": -72.25128173828125, + "loss": 0.3521, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.649543046951294, + "rewards/margins": 0.2569282054901123, + "rewards/rejected": 3.3926148414611816, + "step": 3269 + }, + { + "epoch": 0.53, + "learning_rate": 8.616360808550733e-06, + "logits/chosen": -0.8784956336021423, + "logits/rejected": -0.9416801929473877, + "logps/chosen": -74.32858276367188, + "logps/rejected": -90.23405456542969, + "loss": 0.3373, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9147689938545227, + "rewards/margins": 0.11379319429397583, + "rewards/rejected": 0.8009757995605469, + "step": 3270 + }, + { + "epoch": 0.53, + "learning_rate": 8.615453108362064e-06, + "logits/chosen": -1.1292461156845093, + "logits/rejected": -1.1145057678222656, + "logps/chosen": -49.88331604003906, + "logps/rejected": -58.123313903808594, + "loss": 0.4494, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.84332275390625, + "rewards/margins": -0.2816718816757202, + "rewards/rejected": 1.1249946355819702, + "step": 3271 + }, + { + "epoch": 0.53, + "learning_rate": 8.614545158379793e-06, + "logits/chosen": -0.8287418484687805, + "logits/rejected": -0.7453885674476624, + "logps/chosen": -42.67110824584961, + "logps/rejected": -62.51523208618164, + "loss": 0.1792, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2340779304504395, + "rewards/margins": 0.8452796936035156, + "rewards/rejected": 2.388798236846924, + "step": 3272 + }, + { + "epoch": 0.53, + "learning_rate": 8.613636958666655e-06, + "logits/chosen": -0.9205492734909058, + "logits/rejected": -0.91603022813797, + "logps/chosen": -95.71026611328125, + "logps/rejected": -85.43890380859375, + "loss": 0.4135, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3876450061798096, + "rewards/margins": 0.4477463364601135, + "rewards/rejected": 0.939898669719696, + "step": 3273 + }, + { + "epoch": 0.53, + "learning_rate": 8.612728509285395e-06, + "logits/chosen": -0.863488495349884, + "logits/rejected": -0.6540666222572327, + "logps/chosen": -84.58900451660156, + "logps/rejected": -47.417213439941406, + "loss": 0.4514, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2104156017303467, + "rewards/margins": 0.16912460327148438, + "rewards/rejected": 2.0412909984588623, + "step": 3274 + }, + { + "epoch": 0.53, + "learning_rate": 8.611819810298778e-06, + "logits/chosen": -1.1589199304580688, + "logits/rejected": -0.6978470683097839, + "logps/chosen": -73.09709930419922, + "logps/rejected": -212.19064331054688, + "loss": 0.6554, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0012688636779785, + "rewards/margins": 1.2134286165237427, + "rewards/rejected": 1.7878402471542358, + "step": 3275 + }, + { + "epoch": 0.53, + "learning_rate": 8.610910861769589e-06, + "logits/chosen": -1.0118865966796875, + "logits/rejected": -1.0507268905639648, + "logps/chosen": -61.7310791015625, + "logps/rejected": -91.14885711669922, + "loss": 0.6108, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.920217990875244, + "rewards/margins": 0.6190078258514404, + "rewards/rejected": 2.3012101650238037, + "step": 3276 + }, + { + "epoch": 0.53, + "learning_rate": 8.610001663760625e-06, + "logits/chosen": -1.0410540103912354, + "logits/rejected": -0.954230785369873, + "logps/chosen": -157.37213134765625, + "logps/rejected": -88.36822509765625, + "loss": 0.2392, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.211691379547119, + "rewards/margins": 0.7513978481292725, + "rewards/rejected": 3.4602935314178467, + "step": 3277 + }, + { + "epoch": 0.53, + "learning_rate": 8.609092216334705e-06, + "logits/chosen": -0.5917785167694092, + "logits/rejected": -0.5905286073684692, + "logps/chosen": -7.806990146636963, + "logps/rejected": -0.8528746962547302, + "loss": 0.4025, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.061499547213315964, + "rewards/margins": -0.1300279200077057, + "rewards/rejected": 0.19152747094631195, + "step": 3278 + }, + { + "epoch": 0.53, + "learning_rate": 8.608182519554662e-06, + "logits/chosen": -1.2215180397033691, + "logits/rejected": -1.1996026039123535, + "logps/chosen": -88.79539489746094, + "logps/rejected": -62.00810623168945, + "loss": 0.2914, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6578240394592285, + "rewards/margins": 0.345292329788208, + "rewards/rejected": 2.3125317096710205, + "step": 3279 + }, + { + "epoch": 0.53, + "learning_rate": 8.607272573483348e-06, + "logits/chosen": -1.2288497686386108, + "logits/rejected": -1.2677010297775269, + "logps/chosen": -105.10441589355469, + "logps/rejected": -133.29269409179688, + "loss": 1.0022, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.074995517730713, + "rewards/margins": -1.7999067306518555, + "rewards/rejected": 7.874902248382568, + "step": 3280 + }, + { + "epoch": 0.53, + "learning_rate": 8.606362378183633e-06, + "logits/chosen": -1.3328595161437988, + "logits/rejected": -1.42190420627594, + "logps/chosen": -72.86524963378906, + "logps/rejected": -89.30319213867188, + "loss": 2.0124, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5664138793945312, + "rewards/margins": -3.958817958831787, + "rewards/rejected": 6.525231838226318, + "step": 3281 + }, + { + "epoch": 0.53, + "learning_rate": 8.6054519337184e-06, + "logits/chosen": -0.8480396866798401, + "logits/rejected": -0.6542419791221619, + "logps/chosen": -38.43938064575195, + "logps/rejected": -16.64350700378418, + "loss": 0.3363, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8568477630615234, + "rewards/margins": 0.04842740297317505, + "rewards/rejected": 0.8084203600883484, + "step": 3282 + }, + { + "epoch": 0.53, + "learning_rate": 8.604541240150552e-06, + "logits/chosen": -1.5186535120010376, + "logits/rejected": -1.415705680847168, + "logps/chosen": -35.83262634277344, + "logps/rejected": -22.760295867919922, + "loss": 0.6619, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8476150035858154, + "rewards/margins": 3.377880334854126, + "rewards/rejected": 0.4697345793247223, + "step": 3283 + }, + { + "epoch": 0.53, + "learning_rate": 8.603630297543014e-06, + "logits/chosen": -1.2576298713684082, + "logits/rejected": -1.3331098556518555, + "logps/chosen": -64.15709686279297, + "logps/rejected": -107.73127746582031, + "loss": 3.2023, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.342519521713257, + "rewards/margins": -2.0046684741973877, + "rewards/rejected": 4.3471879959106445, + "step": 3284 + }, + { + "epoch": 0.53, + "learning_rate": 8.602719105958716e-06, + "logits/chosen": -1.0643521547317505, + "logits/rejected": -0.9784411191940308, + "logps/chosen": -56.97407913208008, + "logps/rejected": -33.571136474609375, + "loss": 1.1698, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.063722610473633, + "rewards/margins": 0.6480827331542969, + "rewards/rejected": 2.415639877319336, + "step": 3285 + }, + { + "epoch": 0.53, + "learning_rate": 8.60180766546062e-06, + "logits/chosen": -0.640572190284729, + "logits/rejected": -0.667148232460022, + "logps/chosen": -87.77574157714844, + "logps/rejected": -51.02049255371094, + "loss": 0.5219, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9516128897666931, + "rewards/margins": -0.6036667227745056, + "rewards/rejected": 1.5552796125411987, + "step": 3286 + }, + { + "epoch": 0.53, + "learning_rate": 8.600895976111696e-06, + "logits/chosen": -0.9250308275222778, + "logits/rejected": -0.8404595851898193, + "logps/chosen": -55.19343948364258, + "logps/rejected": -53.461097717285156, + "loss": 0.7331, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.18559992313385, + "rewards/margins": -0.6995677947998047, + "rewards/rejected": 1.8851677179336548, + "step": 3287 + }, + { + "epoch": 0.53, + "learning_rate": 8.599984037974928e-06, + "logits/chosen": -1.004501223564148, + "logits/rejected": -0.9771875739097595, + "logps/chosen": -51.11834716796875, + "logps/rejected": -65.55500030517578, + "loss": 0.29, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.231931447982788, + "rewards/margins": 0.43493974208831787, + "rewards/rejected": 1.7969917058944702, + "step": 3288 + }, + { + "epoch": 0.53, + "learning_rate": 8.59907185111333e-06, + "logits/chosen": -0.6775504946708679, + "logits/rejected": -0.6604315042495728, + "logps/chosen": -78.00425720214844, + "logps/rejected": -62.87452697753906, + "loss": 0.4393, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1635864973068237, + "rewards/margins": -0.26049184799194336, + "rewards/rejected": 1.424078345298767, + "step": 3289 + }, + { + "epoch": 0.53, + "learning_rate": 8.59815941558992e-06, + "logits/chosen": -0.8799943923950195, + "logits/rejected": -0.8877853751182556, + "logps/chosen": -90.23165893554688, + "logps/rejected": -77.09521484375, + "loss": 0.155, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2985870838165283, + "rewards/margins": 1.0669493675231934, + "rewards/rejected": 2.231637716293335, + "step": 3290 + }, + { + "epoch": 0.53, + "learning_rate": 8.597246731467742e-06, + "logits/chosen": -1.1432281732559204, + "logits/rejected": -1.159098744392395, + "logps/chosen": -40.11910629272461, + "logps/rejected": -69.113525390625, + "loss": 0.3577, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8870937824249268, + "rewards/margins": 0.24972808361053467, + "rewards/rejected": 1.637365698814392, + "step": 3291 + }, + { + "epoch": 0.53, + "learning_rate": 8.596333798809852e-06, + "logits/chosen": -1.4337272644042969, + "logits/rejected": -1.0819861888885498, + "logps/chosen": -171.5050048828125, + "logps/rejected": -61.5727424621582, + "loss": 0.0281, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.450135707855225, + "rewards/margins": 2.9138078689575195, + "rewards/rejected": 3.536327838897705, + "step": 3292 + }, + { + "epoch": 0.53, + "learning_rate": 8.595420617679324e-06, + "logits/chosen": -1.4088352918624878, + "logits/rejected": -1.3796881437301636, + "logps/chosen": -54.98432922363281, + "logps/rejected": -58.109474182128906, + "loss": 2.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6121246814727783, + "rewards/margins": 0.8077034950256348, + "rewards/rejected": 2.8044211864471436, + "step": 3293 + }, + { + "epoch": 0.53, + "learning_rate": 8.594507188139251e-06, + "logits/chosen": -0.4931833744049072, + "logits/rejected": -0.6070471405982971, + "logps/chosen": -95.924072265625, + "logps/rejected": -101.9320068359375, + "loss": 1.8186, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5688530206680298, + "rewards/margins": -3.6024727821350098, + "rewards/rejected": 5.17132568359375, + "step": 3294 + }, + { + "epoch": 0.53, + "learning_rate": 8.593593510252745e-06, + "logits/chosen": -0.8444546461105347, + "logits/rejected": -0.8613472580909729, + "logps/chosen": -60.98267364501953, + "logps/rejected": -106.2298583984375, + "loss": 0.4096, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.427565813064575, + "rewards/margins": 0.07790899276733398, + "rewards/rejected": 2.349656820297241, + "step": 3295 + }, + { + "epoch": 0.53, + "learning_rate": 8.59267958408293e-06, + "logits/chosen": -1.2620429992675781, + "logits/rejected": -1.3268262147903442, + "logps/chosen": -49.72235870361328, + "logps/rejected": -67.79373168945312, + "loss": 4.1331, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9064576625823975, + "rewards/margins": -0.983673095703125, + "rewards/rejected": 3.8901307582855225, + "step": 3296 + }, + { + "epoch": 0.54, + "learning_rate": 8.591765409692949e-06, + "logits/chosen": -1.4192885160446167, + "logits/rejected": -1.0705469846725464, + "logps/chosen": -89.35722351074219, + "logps/rejected": -17.97332000732422, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.424238681793213, + "rewards/margins": 5.694474220275879, + "rewards/rejected": 0.7297644019126892, + "step": 3297 + }, + { + "epoch": 0.54, + "learning_rate": 8.590850987145964e-06, + "logits/chosen": -0.914488673210144, + "logits/rejected": -0.9277645349502563, + "logps/chosen": -56.91742706298828, + "logps/rejected": -33.23609161376953, + "loss": 0.6976, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.476886034011841, + "rewards/margins": 0.384702205657959, + "rewards/rejected": 2.092183828353882, + "step": 3298 + }, + { + "epoch": 0.54, + "learning_rate": 8.589936316505154e-06, + "logits/chosen": -0.853057861328125, + "logits/rejected": -0.8683133125305176, + "logps/chosen": -70.01631927490234, + "logps/rejected": -82.74542999267578, + "loss": 0.3082, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9928398132324219, + "rewards/margins": 0.22368156909942627, + "rewards/rejected": 1.7691582441329956, + "step": 3299 + }, + { + "epoch": 0.54, + "learning_rate": 8.589021397833712e-06, + "logits/chosen": -0.753656268119812, + "logits/rejected": -0.7009319067001343, + "logps/chosen": -62.965614318847656, + "logps/rejected": -106.76280212402344, + "loss": 0.1868, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.465798258781433, + "rewards/margins": 1.431227207183838, + "rewards/rejected": 0.034571077674627304, + "step": 3300 + }, + { + "epoch": 0.54, + "learning_rate": 8.588106231194851e-06, + "logits/chosen": -0.7794460654258728, + "logits/rejected": -0.6701031923294067, + "logps/chosen": -65.98298645019531, + "logps/rejected": -68.26097106933594, + "loss": 0.1024, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.728651523590088, + "rewards/margins": 1.6479774713516235, + "rewards/rejected": 1.0806740522384644, + "step": 3301 + }, + { + "epoch": 0.54, + "learning_rate": 8.5871908166518e-06, + "logits/chosen": -1.0675244331359863, + "logits/rejected": -1.0230010747909546, + "logps/chosen": -88.44552612304688, + "logps/rejected": -34.72526168823242, + "loss": 0.8464, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.711133599281311, + "rewards/margins": 0.8094581961631775, + "rewards/rejected": 0.9016754031181335, + "step": 3302 + }, + { + "epoch": 0.54, + "learning_rate": 8.586275154267806e-06, + "logits/chosen": -0.9151536226272583, + "logits/rejected": -0.9121227860450745, + "logps/chosen": -61.7591667175293, + "logps/rejected": -40.348751068115234, + "loss": 0.7938, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.567630410194397, + "rewards/margins": -0.36864471435546875, + "rewards/rejected": 1.9362751245498657, + "step": 3303 + }, + { + "epoch": 0.54, + "learning_rate": 8.585359244106132e-06, + "logits/chosen": -0.8959190845489502, + "logits/rejected": -0.7689272165298462, + "logps/chosen": -52.03409194946289, + "logps/rejected": -16.639404296875, + "loss": 0.2962, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2585324048995972, + "rewards/margins": 0.979149341583252, + "rewards/rejected": 0.2793830931186676, + "step": 3304 + }, + { + "epoch": 0.54, + "learning_rate": 8.58444308623006e-06, + "logits/chosen": -1.0961464643478394, + "logits/rejected": -1.0259474515914917, + "logps/chosen": -125.218994140625, + "logps/rejected": -92.02511596679688, + "loss": 0.5017, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.236721992492676, + "rewards/margins": -0.35275888442993164, + "rewards/rejected": 4.589480876922607, + "step": 3305 + }, + { + "epoch": 0.54, + "learning_rate": 8.583526680702888e-06, + "logits/chosen": -1.406192660331726, + "logits/rejected": -1.4175063371658325, + "logps/chosen": -45.28224182128906, + "logps/rejected": -96.11775207519531, + "loss": 0.4046, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6747276782989502, + "rewards/margins": 0.5196632146835327, + "rewards/rejected": 1.1550644636154175, + "step": 3306 + }, + { + "epoch": 0.54, + "learning_rate": 8.582610027587928e-06, + "logits/chosen": -1.1762967109680176, + "logits/rejected": -1.3274736404418945, + "logps/chosen": -65.91390991210938, + "logps/rejected": -93.30330657958984, + "loss": 2.4579, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2875587940216064, + "rewards/margins": -4.191409111022949, + "rewards/rejected": 6.478968143463135, + "step": 3307 + }, + { + "epoch": 0.54, + "learning_rate": 8.581693126948514e-06, + "logits/chosen": -0.9104203581809998, + "logits/rejected": -0.8518548607826233, + "logps/chosen": -60.026004791259766, + "logps/rejected": -64.05406951904297, + "loss": 0.248, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.429105043411255, + "rewards/margins": 0.6062237024307251, + "rewards/rejected": 1.8228813409805298, + "step": 3308 + }, + { + "epoch": 0.54, + "learning_rate": 8.580775978847997e-06, + "logits/chosen": -1.027130365371704, + "logits/rejected": -1.0333960056304932, + "logps/chosen": -53.36833953857422, + "logps/rejected": -45.85559844970703, + "loss": 1.5402, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5385797023773193, + "rewards/margins": -0.5403366088867188, + "rewards/rejected": 3.078916311264038, + "step": 3309 + }, + { + "epoch": 0.54, + "learning_rate": 8.57985858334974e-06, + "logits/chosen": -0.7414705753326416, + "logits/rejected": -0.7684093117713928, + "logps/chosen": -80.99552917480469, + "logps/rejected": -51.55470275878906, + "loss": 1.4796, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7357720136642456, + "rewards/margins": 0.6053463220596313, + "rewards/rejected": 1.1304256916046143, + "step": 3310 + }, + { + "epoch": 0.54, + "learning_rate": 8.578940940517129e-06, + "logits/chosen": -1.080896258354187, + "logits/rejected": -1.0117013454437256, + "logps/chosen": -50.48564147949219, + "logps/rejected": -82.05697631835938, + "loss": 0.3734, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.836845397949219, + "rewards/margins": 0.5531082153320312, + "rewards/rejected": 4.2837371826171875, + "step": 3311 + }, + { + "epoch": 0.54, + "learning_rate": 8.578023050413562e-06, + "logits/chosen": -0.615009069442749, + "logits/rejected": -0.6148008108139038, + "logps/chosen": -2.6803507804870605, + "logps/rejected": -0.7400466799736023, + "loss": 0.6572, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.12271099537611008, + "rewards/margins": -0.09250714629888535, + "rewards/rejected": 0.21521814167499542, + "step": 3312 + }, + { + "epoch": 0.54, + "learning_rate": 8.577104913102458e-06, + "logits/chosen": -1.0601091384887695, + "logits/rejected": -1.007171869277954, + "logps/chosen": -37.39273452758789, + "logps/rejected": -31.844776153564453, + "loss": 0.2584, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.142329454421997, + "rewards/margins": 0.47578704357147217, + "rewards/rejected": 1.666542410850525, + "step": 3313 + }, + { + "epoch": 0.54, + "learning_rate": 8.576186528647253e-06, + "logits/chosen": -0.893899142742157, + "logits/rejected": -0.9043979644775391, + "logps/chosen": -20.475011825561523, + "logps/rejected": -57.30037307739258, + "loss": 0.4645, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1362098753452301, + "rewards/margins": -0.3090839385986328, + "rewards/rejected": 0.4452938139438629, + "step": 3314 + }, + { + "epoch": 0.54, + "learning_rate": 8.575267897111397e-06, + "logits/chosen": -0.7734777331352234, + "logits/rejected": -0.7734777331352234, + "logps/chosen": -1.698748230934143, + "logps/rejected": -1.698748230934143, + "loss": 0.3605, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2455243170261383, + "rewards/margins": 0.0, + "rewards/rejected": 0.2455243170261383, + "step": 3315 + }, + { + "epoch": 0.54, + "learning_rate": 8.574349018558357e-06, + "logits/chosen": -0.7798172235488892, + "logits/rejected": -0.7086809277534485, + "logps/chosen": -30.222536087036133, + "logps/rejected": -10.999486923217773, + "loss": 3.1611, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7596429586410522, + "rewards/margins": 0.48416900634765625, + "rewards/rejected": 1.275473952293396, + "step": 3316 + }, + { + "epoch": 0.54, + "learning_rate": 8.573429893051621e-06, + "logits/chosen": -1.0712181329727173, + "logits/rejected": -1.0411368608474731, + "logps/chosen": -89.20120239257812, + "logps/rejected": -96.76255798339844, + "loss": 0.2242, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.381192922592163, + "rewards/margins": 1.421586036682129, + "rewards/rejected": 0.959606945514679, + "step": 3317 + }, + { + "epoch": 0.54, + "learning_rate": 8.572510520654692e-06, + "logits/chosen": -0.8159565329551697, + "logits/rejected": -0.8783283829689026, + "logps/chosen": -47.63791275024414, + "logps/rejected": -74.9410400390625, + "loss": 1.1496, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.017252802848816, + "rewards/margins": -1.2982975244522095, + "rewards/rejected": 2.3155503273010254, + "step": 3318 + }, + { + "epoch": 0.54, + "learning_rate": 8.57159090143109e-06, + "logits/chosen": -1.5978641510009766, + "logits/rejected": -1.4923800230026245, + "logps/chosen": -112.01013946533203, + "logps/rejected": -72.36117553710938, + "loss": 0.0431, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.622802257537842, + "rewards/margins": 3.342595100402832, + "rewards/rejected": 4.28020715713501, + "step": 3319 + }, + { + "epoch": 0.54, + "learning_rate": 8.570671035444351e-06, + "logits/chosen": -1.2979190349578857, + "logits/rejected": -1.2889623641967773, + "logps/chosen": -68.7465591430664, + "logps/rejected": -85.29634094238281, + "loss": 0.7601, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.738787055015564, + "rewards/margins": 1.6220703125, + "rewards/rejected": 0.11671676486730576, + "step": 3320 + }, + { + "epoch": 0.54, + "learning_rate": 8.569750922758029e-06, + "logits/chosen": -1.0287381410598755, + "logits/rejected": -0.8229853510856628, + "logps/chosen": -67.194580078125, + "logps/rejected": -37.78681945800781, + "loss": 0.6859, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0772141218185425, + "rewards/margins": 1.2244232892990112, + "rewards/rejected": -0.14720916748046875, + "step": 3321 + }, + { + "epoch": 0.54, + "learning_rate": 8.568830563435695e-06, + "logits/chosen": -1.1968801021575928, + "logits/rejected": -0.9822098016738892, + "logps/chosen": -105.0009994506836, + "logps/rejected": -46.415687561035156, + "loss": 0.0202, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.953006744384766, + "rewards/margins": 3.305180311203003, + "rewards/rejected": 2.6478264331817627, + "step": 3322 + }, + { + "epoch": 0.54, + "learning_rate": 8.567909957540939e-06, + "logits/chosen": -1.2070565223693848, + "logits/rejected": -1.2432286739349365, + "logps/chosen": -76.68084716796875, + "logps/rejected": -106.76606750488281, + "loss": 0.3488, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5829391479492188, + "rewards/margins": 0.5199309587478638, + "rewards/rejected": 1.063008189201355, + "step": 3323 + }, + { + "epoch": 0.54, + "learning_rate": 8.566989105137364e-06, + "logits/chosen": -0.7588050961494446, + "logits/rejected": -0.7633838653564453, + "logps/chosen": -7.927440643310547, + "logps/rejected": -3.3710410594940186, + "loss": 0.9053, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35430994629859924, + "rewards/margins": 0.06198504567146301, + "rewards/rejected": 0.29232490062713623, + "step": 3324 + }, + { + "epoch": 0.54, + "learning_rate": 8.566068006288593e-06, + "logits/chosen": -0.8210468292236328, + "logits/rejected": -0.8210468292236328, + "logps/chosen": -49.450172424316406, + "logps/rejected": -49.450172424316406, + "loss": 1.4004, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.089152216911316, + "rewards/margins": 0.0, + "rewards/rejected": 1.089152216911316, + "step": 3325 + }, + { + "epoch": 0.54, + "learning_rate": 8.565146661058266e-06, + "logits/chosen": -0.8781479597091675, + "logits/rejected": -0.831430196762085, + "logps/chosen": -74.52273559570312, + "logps/rejected": -41.622344970703125, + "loss": 0.6564, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.003796339035034, + "rewards/margins": 0.08199143409729004, + "rewards/rejected": 2.921804904937744, + "step": 3326 + }, + { + "epoch": 0.54, + "learning_rate": 8.564225069510038e-06, + "logits/chosen": -0.9893931746482849, + "logits/rejected": -0.952822208404541, + "logps/chosen": -83.78422546386719, + "logps/rejected": -174.1700897216797, + "loss": 2.8623, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.700415015220642, + "rewards/margins": -3.437887668609619, + "rewards/rejected": 5.138302803039551, + "step": 3327 + }, + { + "epoch": 0.54, + "learning_rate": 8.563303231707582e-06, + "logits/chosen": -0.6935350894927979, + "logits/rejected": -0.6855975389480591, + "logps/chosen": -4.4124064445495605, + "logps/rejected": -2.616173505783081, + "loss": 0.5466, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2517774999141693, + "rewards/margins": 0.09221111238002777, + "rewards/rejected": 0.15956638753414154, + "step": 3328 + }, + { + "epoch": 0.54, + "learning_rate": 8.56238114771459e-06, + "logits/chosen": -1.0350594520568848, + "logits/rejected": -0.9926037788391113, + "logps/chosen": -87.23353576660156, + "logps/rejected": -53.66737365722656, + "loss": 1.6856, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8335281610488892, + "rewards/margins": 0.59033203125, + "rewards/rejected": 1.2431961297988892, + "step": 3329 + }, + { + "epoch": 0.54, + "learning_rate": 8.561458817594767e-06, + "logits/chosen": -0.7703756093978882, + "logits/rejected": -0.7320684194564819, + "logps/chosen": -85.6480712890625, + "logps/rejected": -165.918701171875, + "loss": 0.3815, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7195510864257812, + "rewards/margins": 1.1222929954528809, + "rewards/rejected": 0.5972580313682556, + "step": 3330 + }, + { + "epoch": 0.54, + "learning_rate": 8.56053624141184e-06, + "logits/chosen": -1.1936218738555908, + "logits/rejected": -1.1834732294082642, + "logps/chosen": -68.82077026367188, + "logps/rejected": -71.89412689208984, + "loss": 0.4083, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3960556983947754, + "rewards/margins": -0.21396183967590332, + "rewards/rejected": 2.6100175380706787, + "step": 3331 + }, + { + "epoch": 0.54, + "learning_rate": 8.559613419229549e-06, + "logits/chosen": -1.769099473953247, + "logits/rejected": -1.3322089910507202, + "logps/chosen": -98.81385040283203, + "logps/rejected": -54.33699035644531, + "loss": 1.1847, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.03901743888855, + "rewards/margins": -0.24906253814697266, + "rewards/rejected": 2.2880799770355225, + "step": 3332 + }, + { + "epoch": 0.54, + "learning_rate": 8.558690351111651e-06, + "logits/chosen": -1.2607405185699463, + "logits/rejected": -1.2720712423324585, + "logps/chosen": -66.81153106689453, + "logps/rejected": -66.73953247070312, + "loss": 0.5369, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0976113080978394, + "rewards/margins": 0.18707126379013062, + "rewards/rejected": 0.9105400443077087, + "step": 3333 + }, + { + "epoch": 0.54, + "learning_rate": 8.557767037121923e-06, + "logits/chosen": -1.0527697801589966, + "logits/rejected": -0.8256465792655945, + "logps/chosen": -84.57109069824219, + "logps/rejected": -14.295022010803223, + "loss": 0.1209, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7254714965820312, + "rewards/margins": 1.9345672130584717, + "rewards/rejected": 0.7909043431282043, + "step": 3334 + }, + { + "epoch": 0.54, + "learning_rate": 8.556843477324155e-06, + "logits/chosen": -0.853549599647522, + "logits/rejected": -0.9703808426856995, + "logps/chosen": -78.487548828125, + "logps/rejected": -78.28717803955078, + "loss": 1.6503, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3439652919769287, + "rewards/margins": -2.6519219875335693, + "rewards/rejected": 4.995887279510498, + "step": 3335 + }, + { + "epoch": 0.54, + "learning_rate": 8.55591967178216e-06, + "logits/chosen": -0.7091636061668396, + "logits/rejected": -0.7129708528518677, + "logps/chosen": -4.072240829467773, + "logps/rejected": -9.717824935913086, + "loss": 0.5671, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3441275656223297, + "rewards/margins": -0.11200237274169922, + "rewards/rejected": 0.45612993836402893, + "step": 3336 + }, + { + "epoch": 0.54, + "learning_rate": 8.554995620559761e-06, + "logits/chosen": -1.3766459226608276, + "logits/rejected": -1.4165565967559814, + "logps/chosen": -131.33114624023438, + "logps/rejected": -174.41151428222656, + "loss": 0.9494, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.735536098480225, + "rewards/margins": -1.7350220680236816, + "rewards/rejected": 8.470558166503906, + "step": 3337 + }, + { + "epoch": 0.54, + "learning_rate": 8.554071323720802e-06, + "logits/chosen": -1.1273932456970215, + "logits/rejected": -1.0662709474563599, + "logps/chosen": -44.53889465332031, + "logps/rejected": -69.81342315673828, + "loss": 0.3513, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5609756708145142, + "rewards/margins": -0.00879669189453125, + "rewards/rejected": 1.5697723627090454, + "step": 3338 + }, + { + "epoch": 0.54, + "learning_rate": 8.553146781329144e-06, + "logits/chosen": -0.3108806014060974, + "logits/rejected": -0.31314027309417725, + "logps/chosen": -9.936992645263672, + "logps/rejected": -6.684468746185303, + "loss": 0.4296, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6129276156425476, + "rewards/margins": 0.2246171236038208, + "rewards/rejected": 0.3883104920387268, + "step": 3339 + }, + { + "epoch": 0.54, + "learning_rate": 8.552221993448664e-06, + "logits/chosen": -1.1348240375518799, + "logits/rejected": -0.988688588142395, + "logps/chosen": -121.67619323730469, + "logps/rejected": -89.68498229980469, + "loss": 0.5249, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3266372680664062, + "rewards/margins": -0.5988814830780029, + "rewards/rejected": 3.925518751144409, + "step": 3340 + }, + { + "epoch": 0.54, + "learning_rate": 8.551296960143257e-06, + "logits/chosen": -0.9428461194038391, + "logits/rejected": -0.8192808032035828, + "logps/chosen": -102.33354187011719, + "logps/rejected": -55.69244384765625, + "loss": 0.052, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7318007946014404, + "rewards/margins": 2.2473220825195312, + "rewards/rejected": 1.4844788312911987, + "step": 3341 + }, + { + "epoch": 0.54, + "learning_rate": 8.55037168147683e-06, + "logits/chosen": -1.0012848377227783, + "logits/rejected": -1.093658685684204, + "logps/chosen": -46.89757537841797, + "logps/rejected": -66.54953002929688, + "loss": 0.7137, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.252190351486206, + "rewards/margins": -1.147721290588379, + "rewards/rejected": 3.399911642074585, + "step": 3342 + }, + { + "epoch": 0.54, + "learning_rate": 8.549446157513314e-06, + "logits/chosen": -0.8469308614730835, + "logits/rejected": -0.8306767344474792, + "logps/chosen": -56.62394332885742, + "logps/rejected": -82.95471954345703, + "loss": 0.1416, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.153986930847168, + "rewards/margins": 1.3181188106536865, + "rewards/rejected": 2.8358681201934814, + "step": 3343 + }, + { + "epoch": 0.54, + "learning_rate": 8.548520388316655e-06, + "logits/chosen": -0.871579110622406, + "logits/rejected": -0.9430810809135437, + "logps/chosen": -71.24414825439453, + "logps/rejected": -89.159423828125, + "loss": 0.5431, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3095864057540894, + "rewards/margins": -0.6739493608474731, + "rewards/rejected": 1.9835357666015625, + "step": 3344 + }, + { + "epoch": 0.54, + "learning_rate": 8.547594373950814e-06, + "logits/chosen": -1.141366958618164, + "logits/rejected": -1.0585623979568481, + "logps/chosen": -70.22853088378906, + "logps/rejected": -61.72450637817383, + "loss": 1.3484, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0197556018829346, + "rewards/margins": -1.411933422088623, + "rewards/rejected": 3.4316890239715576, + "step": 3345 + }, + { + "epoch": 0.54, + "learning_rate": 8.546668114479769e-06, + "logits/chosen": -0.8378391265869141, + "logits/rejected": -0.864905059337616, + "logps/chosen": -74.79295349121094, + "logps/rejected": -96.54190826416016, + "loss": 0.5839, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.747418999671936, + "rewards/margins": 0.35191190242767334, + "rewards/rejected": 1.3955070972442627, + "step": 3346 + }, + { + "epoch": 0.54, + "learning_rate": 8.545741609967515e-06, + "logits/chosen": -0.4763079881668091, + "logits/rejected": -0.49760517477989197, + "logps/chosen": -3.8507204055786133, + "logps/rejected": -23.528535842895508, + "loss": 0.373, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1826462745666504, + "rewards/margins": -0.004964545369148254, + "rewards/rejected": 0.18761081993579865, + "step": 3347 + }, + { + "epoch": 0.54, + "learning_rate": 8.544814860478065e-06, + "logits/chosen": -1.026160717010498, + "logits/rejected": -1.004209041595459, + "logps/chosen": -37.938663482666016, + "logps/rejected": -43.89044189453125, + "loss": 0.4156, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.333153247833252, + "rewards/margins": -0.20440936088562012, + "rewards/rejected": 3.537562608718872, + "step": 3348 + }, + { + "epoch": 0.54, + "learning_rate": 8.543887866075451e-06, + "logits/chosen": -1.089408278465271, + "logits/rejected": -1.1864547729492188, + "logps/chosen": -38.579715728759766, + "logps/rejected": -40.02357864379883, + "loss": 0.9481, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1964008808135986, + "rewards/margins": 0.8846163749694824, + "rewards/rejected": 2.311784505844116, + "step": 3349 + }, + { + "epoch": 0.54, + "learning_rate": 8.54296062682372e-06, + "logits/chosen": -0.972489595413208, + "logits/rejected": -0.7026064395904541, + "logps/chosen": -108.80602264404297, + "logps/rejected": -44.080665588378906, + "loss": 0.0279, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.715564727783203, + "rewards/margins": 3.393357753753662, + "rewards/rejected": 2.322206974029541, + "step": 3350 + }, + { + "epoch": 0.54, + "learning_rate": 8.542033142786932e-06, + "logits/chosen": -0.9994036555290222, + "logits/rejected": -0.9832361340522766, + "logps/chosen": -40.68027877807617, + "logps/rejected": -15.804023742675781, + "loss": 0.7532, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8666263818740845, + "rewards/margins": 0.16951030492782593, + "rewards/rejected": 0.6971160769462585, + "step": 3351 + }, + { + "epoch": 0.54, + "learning_rate": 8.541105414029167e-06, + "logits/chosen": -1.2118293046951294, + "logits/rejected": -1.206025242805481, + "logps/chosen": -96.94316101074219, + "logps/rejected": -142.71484375, + "loss": 1.9295, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.302578926086426, + "rewards/margins": -3.693455696105957, + "rewards/rejected": 8.996034622192383, + "step": 3352 + }, + { + "epoch": 0.54, + "learning_rate": 8.540177440614525e-06, + "logits/chosen": -0.6095247268676758, + "logits/rejected": -0.6818711161613464, + "logps/chosen": -68.34014892578125, + "logps/rejected": -57.87074279785156, + "loss": 0.3972, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7701095342636108, + "rewards/margins": -0.10527348518371582, + "rewards/rejected": 1.8753830194473267, + "step": 3353 + }, + { + "epoch": 0.54, + "learning_rate": 8.53924922260712e-06, + "logits/chosen": -1.0039201974868774, + "logits/rejected": -1.0231319665908813, + "logps/chosen": -70.06098175048828, + "logps/rejected": -66.95668029785156, + "loss": 1.0372, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.266218662261963, + "rewards/margins": -1.8698363304138184, + "rewards/rejected": 4.136054992675781, + "step": 3354 + }, + { + "epoch": 0.54, + "learning_rate": 8.538320760071082e-06, + "logits/chosen": -0.5372804403305054, + "logits/rejected": -0.5372804403305054, + "logps/chosen": -17.1475887298584, + "logps/rejected": -17.1475887298584, + "loss": 0.3985, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.33635959029197693, + "rewards/margins": 0.0, + "rewards/rejected": 0.33635959029197693, + "step": 3355 + }, + { + "epoch": 0.54, + "learning_rate": 8.537392053070562e-06, + "logits/chosen": -1.4508352279663086, + "logits/rejected": -1.5256816148757935, + "logps/chosen": -118.00167846679688, + "logps/rejected": -63.561187744140625, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.649847507476807, + "rewards/margins": 6.297292232513428, + "rewards/rejected": 0.3525550961494446, + "step": 3356 + }, + { + "epoch": 0.54, + "learning_rate": 8.536463101669718e-06, + "logits/chosen": -1.176954746246338, + "logits/rejected": -1.0142625570297241, + "logps/chosen": -160.11891174316406, + "logps/rejected": -167.58444213867188, + "loss": 2.2814, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.822404384613037, + "rewards/margins": 2.5390334129333496, + "rewards/rejected": 4.2833709716796875, + "step": 3357 + }, + { + "epoch": 0.55, + "learning_rate": 8.535533905932739e-06, + "logits/chosen": -1.1882882118225098, + "logits/rejected": -0.9037259817123413, + "logps/chosen": -118.57441711425781, + "logps/rejected": -54.00364685058594, + "loss": 0.7831, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8920944929122925, + "rewards/margins": -0.44580376148223877, + "rewards/rejected": 2.3378982543945312, + "step": 3358 + }, + { + "epoch": 0.55, + "learning_rate": 8.534604465923819e-06, + "logits/chosen": -0.9762523770332336, + "logits/rejected": -0.7558935284614563, + "logps/chosen": -66.7606430053711, + "logps/rejected": -28.878942489624023, + "loss": 0.0538, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8656508922576904, + "rewards/margins": 3.362515926361084, + "rewards/rejected": -0.49686509370803833, + "step": 3359 + }, + { + "epoch": 0.55, + "learning_rate": 8.533674781707176e-06, + "logits/chosen": -0.936427116394043, + "logits/rejected": -0.9301104545593262, + "logps/chosen": -1.1125646829605103, + "logps/rejected": -6.25372838973999, + "loss": 0.8278, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.35688623785972595, + "rewards/margins": -0.06409704685211182, + "rewards/rejected": 0.42098328471183777, + "step": 3360 + }, + { + "epoch": 0.55, + "learning_rate": 8.532744853347042e-06, + "logits/chosen": -1.1347637176513672, + "logits/rejected": -1.1005444526672363, + "logps/chosen": -70.03038787841797, + "logps/rejected": -50.41025924682617, + "loss": 0.6299, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.104448676109314, + "rewards/margins": 0.271315336227417, + "rewards/rejected": 0.833133339881897, + "step": 3361 + }, + { + "epoch": 0.55, + "learning_rate": 8.531814680907664e-06, + "logits/chosen": -1.0946621894836426, + "logits/rejected": -1.1015266180038452, + "logps/chosen": -74.20555877685547, + "logps/rejected": -60.19330596923828, + "loss": 0.6767, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6894416809082031, + "rewards/margins": -0.3135772943496704, + "rewards/rejected": 1.0030189752578735, + "step": 3362 + }, + { + "epoch": 0.55, + "learning_rate": 8.530884264453311e-06, + "logits/chosen": -1.0256537199020386, + "logits/rejected": -1.0163544416427612, + "logps/chosen": -78.07588195800781, + "logps/rejected": -59.57212448120117, + "loss": 3.7169, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5072357654571533, + "rewards/margins": -1.3456447124481201, + "rewards/rejected": 2.8528804779052734, + "step": 3363 + }, + { + "epoch": 0.55, + "learning_rate": 8.529953604048264e-06, + "logits/chosen": -0.7407521605491638, + "logits/rejected": -0.7888574004173279, + "logps/chosen": -32.301109313964844, + "logps/rejected": -69.63922119140625, + "loss": 0.493, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6295087337493896, + "rewards/margins": 1.0784802436828613, + "rewards/rejected": 1.5510284900665283, + "step": 3364 + }, + { + "epoch": 0.55, + "learning_rate": 8.529022699756826e-06, + "logits/chosen": -1.0434080362319946, + "logits/rejected": -0.8846772909164429, + "logps/chosen": -151.78265380859375, + "logps/rejected": -77.07363891601562, + "loss": 0.7552, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.576298475265503, + "rewards/margins": -1.1869134902954102, + "rewards/rejected": 3.763211965560913, + "step": 3365 + }, + { + "epoch": 0.55, + "learning_rate": 8.52809155164331e-06, + "logits/chosen": -1.2295371294021606, + "logits/rejected": -0.8533576726913452, + "logps/chosen": -83.74433135986328, + "logps/rejected": -208.40902709960938, + "loss": 3.1481, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1946464776992798, + "rewards/margins": -5.9542460441589355, + "rewards/rejected": 7.148892402648926, + "step": 3366 + }, + { + "epoch": 0.55, + "learning_rate": 8.52716015977205e-06, + "logits/chosen": -1.2942086458206177, + "logits/rejected": -1.2860454320907593, + "logps/chosen": -61.979469299316406, + "logps/rejected": -46.02730178833008, + "loss": 0.8204, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.660144090652466, + "rewards/margins": -0.543898344039917, + "rewards/rejected": 3.204042434692383, + "step": 3367 + }, + { + "epoch": 0.55, + "learning_rate": 8.526228524207398e-06, + "logits/chosen": -0.6899625062942505, + "logits/rejected": -0.7084176540374756, + "logps/chosen": -45.83392333984375, + "logps/rejected": -70.69985961914062, + "loss": 1.4405, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1798454374074936, + "rewards/margins": -0.5731163024902344, + "rewards/rejected": 0.7529617547988892, + "step": 3368 + }, + { + "epoch": 0.55, + "learning_rate": 8.52529664501372e-06, + "logits/chosen": -1.1435457468032837, + "logits/rejected": -1.1287943124771118, + "logps/chosen": -177.93545532226562, + "logps/rejected": -49.94504165649414, + "loss": 0.0428, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.389855861663818, + "rewards/margins": 2.9481401443481445, + "rewards/rejected": 2.441715717315674, + "step": 3369 + }, + { + "epoch": 0.55, + "learning_rate": 8.5243645222554e-06, + "logits/chosen": -0.9582415819168091, + "logits/rejected": -1.4402951002120972, + "logps/chosen": -90.6998062133789, + "logps/rejected": -58.13203048706055, + "loss": 0.3074, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.19976806640625, + "rewards/margins": 1.240993857383728, + "rewards/rejected": 0.958774209022522, + "step": 3370 + }, + { + "epoch": 0.55, + "learning_rate": 8.52343215599684e-06, + "logits/chosen": -0.7562300562858582, + "logits/rejected": -0.7719454169273376, + "logps/chosen": -101.32958984375, + "logps/rejected": -84.26828002929688, + "loss": 0.1633, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1012496948242188, + "rewards/margins": 2.0246894359588623, + "rewards/rejected": 1.0765602588653564, + "step": 3371 + }, + { + "epoch": 0.55, + "learning_rate": 8.522499546302459e-06, + "logits/chosen": -1.0539886951446533, + "logits/rejected": -0.9170181751251221, + "logps/chosen": -42.69917678833008, + "logps/rejected": -4.175522327423096, + "loss": 0.47, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8783382177352905, + "rewards/margins": 0.9253160357475281, + "rewards/rejected": 0.9530221819877625, + "step": 3372 + }, + { + "epoch": 0.55, + "learning_rate": 8.521566693236687e-06, + "logits/chosen": -1.2029774188995361, + "logits/rejected": -1.2233810424804688, + "logps/chosen": -101.10594177246094, + "logps/rejected": -65.99531555175781, + "loss": 2.3139, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1874656677246094, + "rewards/margins": -2.1279640197753906, + "rewards/rejected": 4.3154296875, + "step": 3373 + }, + { + "epoch": 0.55, + "learning_rate": 8.520633596863978e-06, + "logits/chosen": -1.291666030883789, + "logits/rejected": -1.3627901077270508, + "logps/chosen": -51.55307388305664, + "logps/rejected": -89.240966796875, + "loss": 2.077, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3750882148742676, + "rewards/margins": -4.113354682922363, + "rewards/rejected": 7.488442897796631, + "step": 3374 + }, + { + "epoch": 0.55, + "learning_rate": 8.519700257248803e-06, + "logits/chosen": -0.9154106974601746, + "logits/rejected": -0.9154106974601746, + "logps/chosen": -66.97956085205078, + "logps/rejected": -66.97956085205078, + "loss": 0.7208, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5954872369766235, + "rewards/margins": 0.0, + "rewards/rejected": 1.5954872369766235, + "step": 3375 + }, + { + "epoch": 0.55, + "learning_rate": 8.518766674455642e-06, + "logits/chosen": -1.0634901523590088, + "logits/rejected": -1.0606647729873657, + "logps/chosen": -92.92601013183594, + "logps/rejected": -54.59300231933594, + "loss": 1.2902, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.393310546875, + "rewards/margins": -0.6262085437774658, + "rewards/rejected": 2.019519090652466, + "step": 3376 + }, + { + "epoch": 0.55, + "learning_rate": 8.517832848548997e-06, + "logits/chosen": -1.130236268043518, + "logits/rejected": -1.080540418624878, + "logps/chosen": -198.3535614013672, + "logps/rejected": -142.0504150390625, + "loss": 0.0722, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4594972133636475, + "rewards/margins": 2.104336738586426, + "rewards/rejected": 1.3551605939865112, + "step": 3377 + }, + { + "epoch": 0.55, + "learning_rate": 8.51689877959339e-06, + "logits/chosen": -0.9089531898498535, + "logits/rejected": -0.8702637553215027, + "logps/chosen": -59.483604431152344, + "logps/rejected": -63.674747467041016, + "loss": 0.8201, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0961220264434814, + "rewards/margins": -1.3256847858428955, + "rewards/rejected": 2.421806812286377, + "step": 3378 + }, + { + "epoch": 0.55, + "learning_rate": 8.515964467653354e-06, + "logits/chosen": -1.1760530471801758, + "logits/rejected": -1.1512740850448608, + "logps/chosen": -37.60731506347656, + "logps/rejected": -49.628173828125, + "loss": 1.0058, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5329757928848267, + "rewards/margins": -1.0213967561721802, + "rewards/rejected": 2.554372549057007, + "step": 3379 + }, + { + "epoch": 0.55, + "learning_rate": 8.515029912793443e-06, + "logits/chosen": -1.0215409994125366, + "logits/rejected": -0.8542560935020447, + "logps/chosen": -49.93413162231445, + "logps/rejected": -14.299370765686035, + "loss": 0.1283, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.921268105506897, + "rewards/margins": 1.30250883102417, + "rewards/rejected": 0.618759274482727, + "step": 3380 + }, + { + "epoch": 0.55, + "learning_rate": 8.514095115078223e-06, + "logits/chosen": -0.86236172914505, + "logits/rejected": -0.8059133887290955, + "logps/chosen": -88.79644012451172, + "logps/rejected": -39.38606262207031, + "loss": 0.8356, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1652618646621704, + "rewards/margins": -0.48708343505859375, + "rewards/rejected": 1.6523452997207642, + "step": 3381 + }, + { + "epoch": 0.55, + "learning_rate": 8.51316007457228e-06, + "logits/chosen": -1.118597388267517, + "logits/rejected": -1.1131445169448853, + "logps/chosen": -117.9200668334961, + "logps/rejected": -71.2283935546875, + "loss": 0.8675, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.272020697593689, + "rewards/margins": 1.1915031671524048, + "rewards/rejected": 0.08051758259534836, + "step": 3382 + }, + { + "epoch": 0.55, + "learning_rate": 8.51222479134022e-06, + "logits/chosen": -0.4994238615036011, + "logits/rejected": -0.4994238615036011, + "logps/chosen": -1.323556661605835, + "logps/rejected": -1.323556661605835, + "loss": 0.8194, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.35588473081588745, + "rewards/margins": 0.0, + "rewards/rejected": 0.35588473081588745, + "step": 3383 + }, + { + "epoch": 0.55, + "learning_rate": 8.511289265446659e-06, + "logits/chosen": -0.8852449059486389, + "logits/rejected": -0.7532964944839478, + "logps/chosen": -68.56163024902344, + "logps/rejected": -48.924072265625, + "loss": 0.6397, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.228327989578247, + "rewards/margins": 0.35427170991897583, + "rewards/rejected": 0.8740562796592712, + "step": 3384 + }, + { + "epoch": 0.55, + "learning_rate": 8.510353496956235e-06, + "logits/chosen": -1.1736704111099243, + "logits/rejected": -1.1405760049819946, + "logps/chosen": -209.2593994140625, + "logps/rejected": -97.93876647949219, + "loss": 0.0838, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.800610542297363, + "rewards/margins": 5.0442352294921875, + "rewards/rejected": -0.24362488090991974, + "step": 3385 + }, + { + "epoch": 0.55, + "learning_rate": 8.509417485933598e-06, + "logits/chosen": -1.2847739458084106, + "logits/rejected": -1.16960608959198, + "logps/chosen": -74.3395767211914, + "logps/rejected": -32.70804214477539, + "loss": 0.267, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0320305824279785, + "rewards/margins": 1.5937790870666504, + "rewards/rejected": 0.4382514953613281, + "step": 3386 + }, + { + "epoch": 0.55, + "learning_rate": 8.50848123244342e-06, + "logits/chosen": -1.0851030349731445, + "logits/rejected": -1.0440564155578613, + "logps/chosen": -84.98033905029297, + "logps/rejected": -54.76570510864258, + "loss": 0.4519, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.254763126373291, + "rewards/margins": -0.3675723075866699, + "rewards/rejected": 2.622335433959961, + "step": 3387 + }, + { + "epoch": 0.55, + "learning_rate": 8.507544736550386e-06, + "logits/chosen": -0.7245989441871643, + "logits/rejected": -0.6921525597572327, + "logps/chosen": -36.075828552246094, + "logps/rejected": -45.62816619873047, + "loss": 0.3017, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0752967596054077, + "rewards/margins": 0.5457290410995483, + "rewards/rejected": 0.5295677185058594, + "step": 3388 + }, + { + "epoch": 0.55, + "learning_rate": 8.506607998319198e-06, + "logits/chosen": -1.034519910812378, + "logits/rejected": -1.0071947574615479, + "logps/chosen": -49.58473205566406, + "logps/rejected": -53.02789306640625, + "loss": 0.2314, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.274876356124878, + "rewards/margins": 0.953384280204773, + "rewards/rejected": 1.321492075920105, + "step": 3389 + }, + { + "epoch": 0.55, + "learning_rate": 8.505671017814581e-06, + "logits/chosen": -0.4073285758495331, + "logits/rejected": -0.3889988958835602, + "logps/chosen": -1.074677586555481, + "logps/rejected": -23.320329666137695, + "loss": 0.315, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3785342872142792, + "rewards/margins": 0.5700793862342834, + "rewards/rejected": -0.19154511392116547, + "step": 3390 + }, + { + "epoch": 0.55, + "learning_rate": 8.504733795101264e-06, + "logits/chosen": -1.1196653842926025, + "logits/rejected": -1.1855190992355347, + "logps/chosen": -109.79202270507812, + "logps/rejected": -129.66546630859375, + "loss": 0.5328, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.126254558563232, + "rewards/margins": -0.5841217041015625, + "rewards/rejected": 6.710376262664795, + "step": 3391 + }, + { + "epoch": 0.55, + "learning_rate": 8.503796330244005e-06, + "logits/chosen": -1.1097218990325928, + "logits/rejected": -1.0138932466506958, + "logps/chosen": -107.6807861328125, + "logps/rejected": -100.29823303222656, + "loss": 0.0461, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.478421211242676, + "rewards/margins": 2.5681793689727783, + "rewards/rejected": 2.9102418422698975, + "step": 3392 + }, + { + "epoch": 0.55, + "learning_rate": 8.502858623307574e-06, + "logits/chosen": -1.158860206604004, + "logits/rejected": -0.91416335105896, + "logps/chosen": -162.05670166015625, + "logps/rejected": -82.02833557128906, + "loss": 0.3046, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.930691719055176, + "rewards/margins": 0.9175338745117188, + "rewards/rejected": 5.013157844543457, + "step": 3393 + }, + { + "epoch": 0.55, + "learning_rate": 8.501920674356755e-06, + "logits/chosen": -1.3699992895126343, + "logits/rejected": -1.377185583114624, + "logps/chosen": -68.54522705078125, + "logps/rejected": -79.20985412597656, + "loss": 1.0885, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.059617757797241, + "rewards/margins": -0.7231314182281494, + "rewards/rejected": 2.7827491760253906, + "step": 3394 + }, + { + "epoch": 0.55, + "learning_rate": 8.500982483456353e-06, + "logits/chosen": -0.8044196367263794, + "logits/rejected": -0.8044196367263794, + "logps/chosen": -77.80891418457031, + "logps/rejected": -77.80891418457031, + "loss": 0.3485, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.302137851715088, + "rewards/margins": 0.0, + "rewards/rejected": 3.302137851715088, + "step": 3395 + }, + { + "epoch": 0.55, + "learning_rate": 8.500044050671188e-06, + "logits/chosen": -0.6601247191429138, + "logits/rejected": -0.681618332862854, + "logps/chosen": -60.63536071777344, + "logps/rejected": -65.34632873535156, + "loss": 0.9665, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.130744218826294, + "rewards/margins": -0.911959171295166, + "rewards/rejected": 3.04270339012146, + "step": 3396 + }, + { + "epoch": 0.55, + "learning_rate": 8.499105376066098e-06, + "logits/chosen": -1.133378505706787, + "logits/rejected": -1.13200044631958, + "logps/chosen": -51.403045654296875, + "logps/rejected": -79.95722961425781, + "loss": 0.4806, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5810348987579346, + "rewards/margins": 0.35730743408203125, + "rewards/rejected": 3.2237274646759033, + "step": 3397 + }, + { + "epoch": 0.55, + "learning_rate": 8.498166459705934e-06, + "logits/chosen": -0.8286464214324951, + "logits/rejected": -0.691713809967041, + "logps/chosen": -47.67245864868164, + "logps/rejected": -75.99317169189453, + "loss": 0.289, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.432016372680664, + "rewards/margins": 0.5088908672332764, + "rewards/rejected": 1.9231255054473877, + "step": 3398 + }, + { + "epoch": 0.55, + "learning_rate": 8.49722730165557e-06, + "logits/chosen": -1.5001906156539917, + "logits/rejected": -1.1729011535644531, + "logps/chosen": -115.81616973876953, + "logps/rejected": -88.1526870727539, + "loss": 0.2183, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.839527130126953, + "rewards/margins": 0.6253035068511963, + "rewards/rejected": 2.214223623275757, + "step": 3399 + }, + { + "epoch": 0.55, + "learning_rate": 8.496287901979888e-06, + "logits/chosen": -1.186943531036377, + "logits/rejected": -1.1710602045059204, + "logps/chosen": -114.24466705322266, + "logps/rejected": -137.75245666503906, + "loss": 0.4636, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.554889678955078, + "rewards/margins": -0.3664283752441406, + "rewards/rejected": 2.9213180541992188, + "step": 3400 + }, + { + "epoch": 0.55, + "learning_rate": 8.495348260743795e-06, + "logits/chosen": -1.0570399761199951, + "logits/rejected": -1.0570399761199951, + "logps/chosen": -130.23385620117188, + "logps/rejected": -130.23385620117188, + "loss": 0.5073, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2576751708984375, + "rewards/margins": 0.0, + "rewards/rejected": 3.2576751708984375, + "step": 3401 + }, + { + "epoch": 0.55, + "learning_rate": 8.494408378012208e-06, + "logits/chosen": -0.4942915737628937, + "logits/rejected": -0.49051353335380554, + "logps/chosen": -2.0735929012298584, + "logps/rejected": -1.318987488746643, + "loss": 0.6971, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.17982375621795654, + "rewards/margins": -0.13321435451507568, + "rewards/rejected": 0.3130381107330322, + "step": 3402 + }, + { + "epoch": 0.55, + "learning_rate": 8.49346825385007e-06, + "logits/chosen": -1.033174753189087, + "logits/rejected": -1.1140629053115845, + "logps/chosen": -44.40880584716797, + "logps/rejected": -46.3514518737793, + "loss": 0.5199, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1892173290252686, + "rewards/margins": -0.19508862495422363, + "rewards/rejected": 2.384305953979492, + "step": 3403 + }, + { + "epoch": 0.55, + "learning_rate": 8.49252788832233e-06, + "logits/chosen": -1.478269100189209, + "logits/rejected": -1.5051803588867188, + "logps/chosen": -98.54869079589844, + "logps/rejected": -107.58201599121094, + "loss": 1.5229, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4978164434432983, + "rewards/margins": -1.9189180135726929, + "rewards/rejected": 3.416734457015991, + "step": 3404 + }, + { + "epoch": 0.55, + "learning_rate": 8.491587281493961e-06, + "logits/chosen": -1.0225058794021606, + "logits/rejected": -0.944965124130249, + "logps/chosen": -69.92073059082031, + "logps/rejected": -46.35470962524414, + "loss": 1.4283, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.491253852844238, + "rewards/margins": 1.6792683601379395, + "rewards/rejected": 3.811985492706299, + "step": 3405 + }, + { + "epoch": 0.55, + "learning_rate": 8.490646433429946e-06, + "logits/chosen": -0.7819623947143555, + "logits/rejected": -0.7861465215682983, + "logps/chosen": -15.360469818115234, + "logps/rejected": -2.250774621963501, + "loss": 0.4273, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3736743927001953, + "rewards/margins": 0.18848450481891632, + "rewards/rejected": 0.185189887881279, + "step": 3406 + }, + { + "epoch": 0.55, + "learning_rate": 8.489705344195292e-06, + "logits/chosen": -1.1242222785949707, + "logits/rejected": -1.1965386867523193, + "logps/chosen": -91.6492691040039, + "logps/rejected": -156.67811584472656, + "loss": 1.2608, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.274004340171814, + "rewards/margins": -2.4080252647399902, + "rewards/rejected": 3.6820297241210938, + "step": 3407 + }, + { + "epoch": 0.55, + "learning_rate": 8.48876401385502e-06, + "logits/chosen": -0.8842774629592896, + "logits/rejected": -0.8211004734039307, + "logps/chosen": -53.788047790527344, + "logps/rejected": -72.76846313476562, + "loss": 0.7387, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9929039478302002, + "rewards/margins": 0.00039136409759521484, + "rewards/rejected": 1.992512583732605, + "step": 3408 + }, + { + "epoch": 0.55, + "learning_rate": 8.487822442474162e-06, + "logits/chosen": -1.1350871324539185, + "logits/rejected": -1.046434760093689, + "logps/chosen": -80.27676391601562, + "logps/rejected": -65.99227905273438, + "loss": 0.1125, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.38360071182251, + "rewards/margins": 2.959111213684082, + "rewards/rejected": 1.4244896173477173, + "step": 3409 + }, + { + "epoch": 0.55, + "learning_rate": 8.48688063011778e-06, + "logits/chosen": -0.7786433696746826, + "logits/rejected": -0.7253755331039429, + "logps/chosen": -68.21123504638672, + "logps/rejected": -44.53212356567383, + "loss": 0.6099, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7733787298202515, + "rewards/margins": -0.7483083009719849, + "rewards/rejected": 2.5216870307922363, + "step": 3410 + }, + { + "epoch": 0.55, + "learning_rate": 8.485938576850937e-06, + "logits/chosen": -1.1788837909698486, + "logits/rejected": -1.063975214958191, + "logps/chosen": -119.68220520019531, + "logps/rejected": -96.26673889160156, + "loss": 0.0851, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.14216947555542, + "rewards/margins": 1.6902101039886475, + "rewards/rejected": 3.4519593715667725, + "step": 3411 + }, + { + "epoch": 0.55, + "learning_rate": 8.484996282738722e-06, + "logits/chosen": -0.7850188612937927, + "logits/rejected": -0.8156745433807373, + "logps/chosen": -77.2062759399414, + "logps/rejected": -41.738433837890625, + "loss": 0.8498, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3329437971115112, + "rewards/margins": -0.5514533519744873, + "rewards/rejected": 1.8843971490859985, + "step": 3412 + }, + { + "epoch": 0.55, + "learning_rate": 8.484053747846242e-06, + "logits/chosen": -1.3905599117279053, + "logits/rejected": -1.3369417190551758, + "logps/chosen": -43.300960540771484, + "logps/rejected": -32.38899230957031, + "loss": 0.6886, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9403843879699707, + "rewards/margins": -0.5293467044830322, + "rewards/rejected": 3.469731092453003, + "step": 3413 + }, + { + "epoch": 0.55, + "learning_rate": 8.483110972238612e-06, + "logits/chosen": -1.3361693620681763, + "logits/rejected": -1.2138522863388062, + "logps/chosen": -93.25152587890625, + "logps/rejected": -17.613231658935547, + "loss": 0.1455, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2518088817596436, + "rewards/margins": 3.1428966522216797, + "rewards/rejected": 0.10891228169202805, + "step": 3414 + }, + { + "epoch": 0.55, + "learning_rate": 8.48216795598097e-06, + "logits/chosen": -1.4149004220962524, + "logits/rejected": -1.3969579935073853, + "logps/chosen": -126.74345397949219, + "logps/rejected": -139.63873291015625, + "loss": 0.7724, + "rewards/accuracies": 0.0, + "rewards/chosen": 8.032454490661621, + "rewards/margins": -0.2954444885253906, + "rewards/rejected": 8.327898979187012, + "step": 3415 + }, + { + "epoch": 0.55, + "learning_rate": 8.481224699138476e-06, + "logits/chosen": -0.6951255202293396, + "logits/rejected": -0.6951255202293396, + "logps/chosen": -50.50981903076172, + "logps/rejected": -50.50981903076172, + "loss": 0.355, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.886487603187561, + "rewards/margins": 0.0, + "rewards/rejected": 1.886487603187561, + "step": 3416 + }, + { + "epoch": 0.55, + "learning_rate": 8.480281201776291e-06, + "logits/chosen": -0.9610909223556519, + "logits/rejected": -1.0514805316925049, + "logps/chosen": -53.66318893432617, + "logps/rejected": -190.22909545898438, + "loss": 0.4458, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1408398151397705, + "rewards/margins": -0.35451698303222656, + "rewards/rejected": 1.495356798171997, + "step": 3417 + }, + { + "epoch": 0.55, + "learning_rate": 8.479337463959607e-06, + "logits/chosen": -0.7053212523460388, + "logits/rejected": -0.7147103548049927, + "logps/chosen": -9.7933349609375, + "logps/rejected": -9.09112548828125, + "loss": 0.3848, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03439140319824219, + "rewards/margins": -0.13813896477222443, + "rewards/rejected": 0.10374756157398224, + "step": 3418 + }, + { + "epoch": 0.55, + "learning_rate": 8.478393485753625e-06, + "logits/chosen": -1.512524962425232, + "logits/rejected": -1.519087314605713, + "logps/chosen": -136.06903076171875, + "logps/rejected": -137.63485717773438, + "loss": 0.9643, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.666177272796631, + "rewards/margins": -1.7100481986999512, + "rewards/rejected": 6.376225471496582, + "step": 3419 + }, + { + "epoch": 0.56, + "learning_rate": 8.477449267223565e-06, + "logits/chosen": -0.6332047581672668, + "logits/rejected": -0.6382142305374146, + "logps/chosen": -3.1668076515197754, + "logps/rejected": -2.00810170173645, + "loss": 2.6197, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5402215719223022, + "rewards/margins": -0.015709340572357178, + "rewards/rejected": 0.5559309124946594, + "step": 3420 + }, + { + "epoch": 0.56, + "learning_rate": 8.476504808434667e-06, + "logits/chosen": -0.9301678538322449, + "logits/rejected": -0.8097879886627197, + "logps/chosen": -66.80337524414062, + "logps/rejected": -9.301515579223633, + "loss": 0.191, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2647767066955566, + "rewards/margins": 1.0372920036315918, + "rewards/rejected": 1.2274847030639648, + "step": 3421 + }, + { + "epoch": 0.56, + "learning_rate": 8.47556010945218e-06, + "logits/chosen": -1.2553809881210327, + "logits/rejected": -1.149356722831726, + "logps/chosen": -92.93833923339844, + "logps/rejected": -18.62009048461914, + "loss": 0.11, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.449794054031372, + "rewards/margins": 1.437159776687622, + "rewards/rejected": 0.01263427734375, + "step": 3422 + }, + { + "epoch": 0.56, + "learning_rate": 8.474615170341378e-06, + "logits/chosen": -1.100602626800537, + "logits/rejected": -0.8994664549827576, + "logps/chosen": -238.7655487060547, + "logps/rejected": -132.1357879638672, + "loss": 1.177, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.739668369293213, + "rewards/margins": 0.6381454467773438, + "rewards/rejected": 5.101522922515869, + "step": 3423 + }, + { + "epoch": 0.56, + "learning_rate": 8.473669991167543e-06, + "logits/chosen": -0.8509697318077087, + "logits/rejected": -0.8211547136306763, + "logps/chosen": -119.99961853027344, + "logps/rejected": -77.11038208007812, + "loss": 0.5105, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.064723253250122, + "rewards/margins": -0.29856419563293457, + "rewards/rejected": 2.3632874488830566, + "step": 3424 + }, + { + "epoch": 0.56, + "learning_rate": 8.472724571995979e-06, + "logits/chosen": -1.0345215797424316, + "logits/rejected": -0.9319786429405212, + "logps/chosen": -66.11958312988281, + "logps/rejected": -66.18939971923828, + "loss": 0.2585, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.2783355712890625, + "rewards/margins": 0.6343543529510498, + "rewards/rejected": 3.6439812183380127, + "step": 3425 + }, + { + "epoch": 0.56, + "learning_rate": 8.471778912892008e-06, + "logits/chosen": -1.0561684370040894, + "logits/rejected": -0.7552542686462402, + "logps/chosen": -126.44355773925781, + "logps/rejected": -136.15516662597656, + "loss": 0.8517, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.814352512359619, + "rewards/margins": 0.7522704601287842, + "rewards/rejected": 3.062082052230835, + "step": 3426 + }, + { + "epoch": 0.56, + "learning_rate": 8.470833013920963e-06, + "logits/chosen": -0.578394889831543, + "logits/rejected": -0.578394889831543, + "logps/chosen": -67.1594009399414, + "logps/rejected": -67.1594009399414, + "loss": 0.3547, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.140911817550659, + "rewards/margins": 0.0, + "rewards/rejected": 2.140911817550659, + "step": 3427 + }, + { + "epoch": 0.56, + "learning_rate": 8.469886875148199e-06, + "logits/chosen": -1.460220217704773, + "logits/rejected": -1.34396493434906, + "logps/chosen": -94.80464172363281, + "logps/rejected": -57.949501037597656, + "loss": 0.4777, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.641064643859863, + "rewards/margins": 2.729355812072754, + "rewards/rejected": 1.9117088317871094, + "step": 3428 + }, + { + "epoch": 0.56, + "learning_rate": 8.468940496639084e-06, + "logits/chosen": -1.1134132146835327, + "logits/rejected": -0.9916444420814514, + "logps/chosen": -57.66124725341797, + "logps/rejected": -64.56578826904297, + "loss": 1.334, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5617852210998535, + "rewards/margins": -0.7598311901092529, + "rewards/rejected": 3.3216164112091064, + "step": 3429 + }, + { + "epoch": 0.56, + "learning_rate": 8.467993878459005e-06, + "logits/chosen": -0.9461562037467957, + "logits/rejected": -1.067704677581787, + "logps/chosen": -113.20481872558594, + "logps/rejected": -166.61199951171875, + "loss": 2.634, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.2838029861450195, + "rewards/margins": -3.9301834106445312, + "rewards/rejected": 9.21398639678955, + "step": 3430 + }, + { + "epoch": 0.56, + "learning_rate": 8.467047020673361e-06, + "logits/chosen": -0.8705344796180725, + "logits/rejected": -0.8007715344429016, + "logps/chosen": -59.001766204833984, + "logps/rejected": -54.4432258605957, + "loss": 0.7749, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6575253009796143, + "rewards/margins": -0.3526625633239746, + "rewards/rejected": 2.010187864303589, + "step": 3431 + }, + { + "epoch": 0.56, + "learning_rate": 8.466099923347576e-06, + "logits/chosen": -0.8894330263137817, + "logits/rejected": -0.9842557311058044, + "logps/chosen": -114.90815734863281, + "logps/rejected": -95.01652526855469, + "loss": 2.7027, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3460693359375, + "rewards/margins": -5.392439365386963, + "rewards/rejected": 6.738508701324463, + "step": 3432 + }, + { + "epoch": 0.56, + "learning_rate": 8.46515258654708e-06, + "logits/chosen": -1.4928417205810547, + "logits/rejected": -1.333469033241272, + "logps/chosen": -185.19754028320312, + "logps/rejected": -68.57144927978516, + "loss": 0.1509, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.081689357757568, + "rewards/margins": 4.06928014755249, + "rewards/rejected": 3.012409210205078, + "step": 3433 + }, + { + "epoch": 0.56, + "learning_rate": 8.46420501033733e-06, + "logits/chosen": -1.182502031326294, + "logits/rejected": -1.1254305839538574, + "logps/chosen": -62.28379821777344, + "logps/rejected": -58.156436920166016, + "loss": 0.6529, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.556536912918091, + "rewards/margins": 0.4524967670440674, + "rewards/rejected": 3.1040401458740234, + "step": 3434 + }, + { + "epoch": 0.56, + "learning_rate": 8.463257194783793e-06, + "logits/chosen": -1.131704330444336, + "logits/rejected": -1.0881853103637695, + "logps/chosen": -66.63267517089844, + "logps/rejected": -63.43412399291992, + "loss": 0.2409, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.529367208480835, + "rewards/margins": 0.8565397262573242, + "rewards/rejected": 2.6728274822235107, + "step": 3435 + }, + { + "epoch": 0.56, + "learning_rate": 8.462309139951952e-06, + "logits/chosen": -1.1148464679718018, + "logits/rejected": -1.1103867292404175, + "logps/chosen": -66.421875, + "logps/rejected": -76.68254089355469, + "loss": 1.7721, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.45896315574646, + "rewards/margins": -1.79298996925354, + "rewards/rejected": 5.251953125, + "step": 3436 + }, + { + "epoch": 0.56, + "learning_rate": 8.461360845907312e-06, + "logits/chosen": -1.2043206691741943, + "logits/rejected": -1.1954017877578735, + "logps/chosen": -57.194297790527344, + "logps/rejected": -89.99732971191406, + "loss": 0.1824, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1454856395721436, + "rewards/margins": 0.9015204906463623, + "rewards/rejected": 1.2439651489257812, + "step": 3437 + }, + { + "epoch": 0.56, + "learning_rate": 8.460412312715386e-06, + "logits/chosen": -1.3220683336257935, + "logits/rejected": -1.2473782300949097, + "logps/chosen": -124.03932189941406, + "logps/rejected": -71.25534057617188, + "loss": 0.5397, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.488401889801025, + "rewards/margins": -0.6384978294372559, + "rewards/rejected": 5.126899719238281, + "step": 3438 + }, + { + "epoch": 0.56, + "learning_rate": 8.459463540441716e-06, + "logits/chosen": -0.8409707546234131, + "logits/rejected": -0.884675920009613, + "logps/chosen": -61.942657470703125, + "logps/rejected": -144.8980712890625, + "loss": 0.2547, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5360885858535767, + "rewards/margins": 0.7252395749092102, + "rewards/rejected": 0.8108490109443665, + "step": 3439 + }, + { + "epoch": 0.56, + "learning_rate": 8.458514529151847e-06, + "logits/chosen": -1.2124873399734497, + "logits/rejected": -1.3779020309448242, + "logps/chosen": -107.63330078125, + "logps/rejected": -95.69558715820312, + "loss": 3.1814, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.942770481109619, + "rewards/margins": -6.3328328132629395, + "rewards/rejected": 9.275603294372559, + "step": 3440 + }, + { + "epoch": 0.56, + "learning_rate": 8.457565278911349e-06, + "logits/chosen": -0.5830385088920593, + "logits/rejected": -0.588954746723175, + "logps/chosen": -4.1200690269470215, + "logps/rejected": -5.696778774261475, + "loss": 0.7743, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33128586411476135, + "rewards/margins": 0.20424246788024902, + "rewards/rejected": 0.12704339623451233, + "step": 3441 + }, + { + "epoch": 0.56, + "learning_rate": 8.456615789785804e-06, + "logits/chosen": -1.0674421787261963, + "logits/rejected": -1.0674421787261963, + "logps/chosen": -52.548606872558594, + "logps/rejected": -52.548606872558594, + "loss": 0.4571, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3019462823867798, + "rewards/margins": 0.0, + "rewards/rejected": 1.3019462823867798, + "step": 3442 + }, + { + "epoch": 0.56, + "learning_rate": 8.455666061840816e-06, + "logits/chosen": -0.8189242482185364, + "logits/rejected": -0.7648062705993652, + "logps/chosen": -79.59741973876953, + "logps/rejected": -51.331817626953125, + "loss": 0.321, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9873313903808594, + "rewards/margins": 0.8376212120056152, + "rewards/rejected": 2.149710178375244, + "step": 3443 + }, + { + "epoch": 0.56, + "learning_rate": 8.454716095142001e-06, + "logits/chosen": -1.1652178764343262, + "logits/rejected": -1.1512564420700073, + "logps/chosen": -88.54935455322266, + "logps/rejected": -59.0173225402832, + "loss": 0.3434, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.896675944328308, + "rewards/margins": 0.017178058624267578, + "rewards/rejected": 1.8794978857040405, + "step": 3444 + }, + { + "epoch": 0.56, + "learning_rate": 8.453765889754995e-06, + "logits/chosen": -1.0232009887695312, + "logits/rejected": -1.0412120819091797, + "logps/chosen": -71.16270446777344, + "logps/rejected": -23.55021858215332, + "loss": 0.7858, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1236093044281006, + "rewards/margins": 1.1293402910232544, + "rewards/rejected": 0.9942690134048462, + "step": 3445 + }, + { + "epoch": 0.56, + "learning_rate": 8.452815445745443e-06, + "logits/chosen": -0.7288081049919128, + "logits/rejected": -0.6917932033538818, + "logps/chosen": -49.476402282714844, + "logps/rejected": -84.78884887695312, + "loss": 0.1386, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9008339643478394, + "rewards/margins": 1.3189964294433594, + "rewards/rejected": 0.5818374752998352, + "step": 3446 + }, + { + "epoch": 0.56, + "learning_rate": 8.451864763179016e-06, + "logits/chosen": -1.3593019247055054, + "logits/rejected": -1.2257999181747437, + "logps/chosen": -107.25050354003906, + "logps/rejected": -140.6503143310547, + "loss": 0.0298, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.166544914245605, + "rewards/margins": 2.8639674186706543, + "rewards/rejected": 5.302577495574951, + "step": 3447 + }, + { + "epoch": 0.56, + "learning_rate": 8.450913842121396e-06, + "logits/chosen": -1.1894547939300537, + "logits/rejected": -1.1606117486953735, + "logps/chosen": -85.05877685546875, + "logps/rejected": -78.07327270507812, + "loss": 0.3568, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9109437465667725, + "rewards/margins": 0.20266270637512207, + "rewards/rejected": 2.7082810401916504, + "step": 3448 + }, + { + "epoch": 0.56, + "learning_rate": 8.44996268263828e-06, + "logits/chosen": -1.0637938976287842, + "logits/rejected": -1.1749935150146484, + "logps/chosen": -63.36334991455078, + "logps/rejected": -88.27951049804688, + "loss": 2.2763, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4912407398223877, + "rewards/margins": -3.0386087894439697, + "rewards/rejected": 5.529849529266357, + "step": 3449 + }, + { + "epoch": 0.56, + "learning_rate": 8.449011284795389e-06, + "logits/chosen": -1.134789228439331, + "logits/rejected": -1.1986762285232544, + "logps/chosen": -80.83200073242188, + "logps/rejected": -98.56452178955078, + "loss": 0.9605, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2181077003479004, + "rewards/margins": 0.4576096534729004, + "rewards/rejected": 1.760498046875, + "step": 3450 + }, + { + "epoch": 0.56, + "learning_rate": 8.44805964865845e-06, + "logits/chosen": -1.415256381034851, + "logits/rejected": -1.2694494724273682, + "logps/chosen": -101.66616821289062, + "logps/rejected": -21.32833480834961, + "loss": 0.21, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.553723096847534, + "rewards/margins": 1.948920726776123, + "rewards/rejected": 0.6048023104667664, + "step": 3451 + }, + { + "epoch": 0.56, + "learning_rate": 8.447107774293219e-06, + "logits/chosen": -1.1342239379882812, + "logits/rejected": -1.1500321626663208, + "logps/chosen": -158.9480438232422, + "logps/rejected": -89.98086547851562, + "loss": 1.8185, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.4126296043396, + "rewards/margins": -1.9413275718688965, + "rewards/rejected": 8.353957176208496, + "step": 3452 + }, + { + "epoch": 0.56, + "learning_rate": 8.446155661765457e-06, + "logits/chosen": -0.6855287551879883, + "logits/rejected": -0.7083263397216797, + "logps/chosen": -8.057332992553711, + "logps/rejected": -18.372177124023438, + "loss": 0.6066, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.03589973598718643, + "rewards/margins": -0.6209905743598938, + "rewards/rejected": 0.656890332698822, + "step": 3453 + }, + { + "epoch": 0.56, + "learning_rate": 8.445203311140944e-06, + "logits/chosen": -1.0960338115692139, + "logits/rejected": -1.1630842685699463, + "logps/chosen": -106.49891662597656, + "logps/rejected": -116.8251953125, + "loss": 3.0588, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6911866664886475, + "rewards/margins": -4.200879096984863, + "rewards/rejected": 6.892065525054932, + "step": 3454 + }, + { + "epoch": 0.56, + "learning_rate": 8.444250722485483e-06, + "logits/chosen": -1.0915980339050293, + "logits/rejected": -1.0657225847244263, + "logps/chosen": -60.925376892089844, + "logps/rejected": -96.7242431640625, + "loss": 0.5785, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8690376281738281, + "rewards/margins": -0.594048261642456, + "rewards/rejected": 2.463085889816284, + "step": 3455 + }, + { + "epoch": 0.56, + "learning_rate": 8.443297895864886e-06, + "logits/chosen": -1.0251866579055786, + "logits/rejected": -1.0012096166610718, + "logps/chosen": -62.735877990722656, + "logps/rejected": -38.68630599975586, + "loss": 0.8597, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6746826171875, + "rewards/margins": 0.5559917688369751, + "rewards/rejected": 1.118690848350525, + "step": 3456 + }, + { + "epoch": 0.56, + "learning_rate": 8.442344831344985e-06, + "logits/chosen": -0.8927662968635559, + "logits/rejected": -0.8567318320274353, + "logps/chosen": -59.44844055175781, + "logps/rejected": -50.98896026611328, + "loss": 0.4291, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1447465419769287, + "rewards/margins": 1.1383850574493408, + "rewards/rejected": 2.006361484527588, + "step": 3457 + }, + { + "epoch": 0.56, + "learning_rate": 8.441391528991629e-06, + "logits/chosen": -1.1208382844924927, + "logits/rejected": -1.1742182970046997, + "logps/chosen": -72.66266632080078, + "logps/rejected": -125.62989807128906, + "loss": 0.3337, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.638770341873169, + "rewards/margins": 0.3051551580429077, + "rewards/rejected": 1.3336151838302612, + "step": 3458 + }, + { + "epoch": 0.56, + "learning_rate": 8.44043798887068e-06, + "logits/chosen": -1.2639305591583252, + "logits/rejected": -1.2139705419540405, + "logps/chosen": -180.48294067382812, + "logps/rejected": -153.44564819335938, + "loss": 0.5117, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.604333400726318, + "rewards/margins": 0.17722463607788086, + "rewards/rejected": 7.4271087646484375, + "step": 3459 + }, + { + "epoch": 0.56, + "learning_rate": 8.439484211048019e-06, + "logits/chosen": -1.2202116250991821, + "logits/rejected": -1.114227056503296, + "logps/chosen": -237.08724975585938, + "logps/rejected": -87.08578491210938, + "loss": 0.0633, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.520968914031982, + "rewards/margins": 4.42439603805542, + "rewards/rejected": 1.0965728759765625, + "step": 3460 + }, + { + "epoch": 0.56, + "learning_rate": 8.438530195589546e-06, + "logits/chosen": -1.3372644186019897, + "logits/rejected": -1.298789620399475, + "logps/chosen": -28.781394958496094, + "logps/rejected": -35.378536224365234, + "loss": 0.233, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.508352756500244, + "rewards/margins": 0.7268357276916504, + "rewards/rejected": 1.7815170288085938, + "step": 3461 + }, + { + "epoch": 0.56, + "learning_rate": 8.437575942561171e-06, + "logits/chosen": -1.2472867965698242, + "logits/rejected": -1.2135248184204102, + "logps/chosen": -111.65786743164062, + "logps/rejected": -76.62007141113281, + "loss": 0.3618, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.848059058189392, + "rewards/margins": 0.07803487777709961, + "rewards/rejected": 1.7700241804122925, + "step": 3462 + }, + { + "epoch": 0.56, + "learning_rate": 8.436621452028825e-06, + "logits/chosen": -1.160902976989746, + "logits/rejected": -0.8954107761383057, + "logps/chosen": -148.70101928710938, + "logps/rejected": -71.8154296875, + "loss": 0.303, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.801997661590576, + "rewards/margins": 3.4142520427703857, + "rewards/rejected": 2.3877456188201904, + "step": 3463 + }, + { + "epoch": 0.56, + "learning_rate": 8.435666724058454e-06, + "logits/chosen": -0.5072794556617737, + "logits/rejected": -0.5063509345054626, + "logps/chosen": -3.215085983276367, + "logps/rejected": -5.786087512969971, + "loss": 0.3802, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3404991328716278, + "rewards/margins": 0.11614786088466644, + "rewards/rejected": 0.22435127198696136, + "step": 3464 + }, + { + "epoch": 0.56, + "learning_rate": 8.434711758716021e-06, + "logits/chosen": -0.6480392217636108, + "logits/rejected": -0.6480392217636108, + "logps/chosen": -3.1409952640533447, + "logps/rejected": -3.1409952640533447, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1525191068649292, + "rewards/margins": 0.0, + "rewards/rejected": 0.1525191068649292, + "step": 3465 + }, + { + "epoch": 0.56, + "learning_rate": 8.433756556067506e-06, + "logits/chosen": -1.302295446395874, + "logits/rejected": -1.3340903520584106, + "logps/chosen": -209.50009155273438, + "logps/rejected": -152.4956817626953, + "loss": 0.195, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.411868572235107, + "rewards/margins": 1.0207414627075195, + "rewards/rejected": 5.391127109527588, + "step": 3466 + }, + { + "epoch": 0.56, + "learning_rate": 8.432801116178903e-06, + "logits/chosen": -1.3799481391906738, + "logits/rejected": -1.308580994606018, + "logps/chosen": -118.50587463378906, + "logps/rejected": -26.049121856689453, + "loss": 0.3751, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7143447399139404, + "rewards/margins": 2.833772659301758, + "rewards/rejected": -0.1194278746843338, + "step": 3467 + }, + { + "epoch": 0.56, + "learning_rate": 8.431845439116224e-06, + "logits/chosen": -1.525354027748108, + "logits/rejected": -1.521715521812439, + "logps/chosen": -77.52747344970703, + "logps/rejected": -127.76905822753906, + "loss": 0.3045, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2032113075256348, + "rewards/margins": 0.2328881025314331, + "rewards/rejected": 1.9703232049942017, + "step": 3468 + }, + { + "epoch": 0.56, + "learning_rate": 8.430889524945499e-06, + "logits/chosen": -0.9896279573440552, + "logits/rejected": -0.8537466526031494, + "logps/chosen": -121.97573852539062, + "logps/rejected": -94.37638854980469, + "loss": 0.1741, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8358582258224487, + "rewards/margins": 0.8945831656455994, + "rewards/rejected": 0.9412750601768494, + "step": 3469 + }, + { + "epoch": 0.56, + "learning_rate": 8.429933373732768e-06, + "logits/chosen": -1.0025148391723633, + "logits/rejected": -1.0046409368515015, + "logps/chosen": -41.83057403564453, + "logps/rejected": -22.793041229248047, + "loss": 0.4335, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6761135458946228, + "rewards/margins": 0.573293924331665, + "rewards/rejected": 0.10281963646411896, + "step": 3470 + }, + { + "epoch": 0.56, + "learning_rate": 8.428976985544097e-06, + "logits/chosen": -0.740451455116272, + "logits/rejected": -0.8169787526130676, + "logps/chosen": -17.863208770751953, + "logps/rejected": -64.98330688476562, + "loss": 2.0582, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.31217899918556213, + "rewards/margins": -4.097658157348633, + "rewards/rejected": 4.409837245941162, + "step": 3471 + }, + { + "epoch": 0.56, + "learning_rate": 8.428020360445563e-06, + "logits/chosen": -1.0180211067199707, + "logits/rejected": -0.9799374938011169, + "logps/chosen": -107.35466766357422, + "logps/rejected": -129.4786376953125, + "loss": 0.8841, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3635261058807373, + "rewards/margins": 1.1148627996444702, + "rewards/rejected": 1.248663306236267, + "step": 3472 + }, + { + "epoch": 0.56, + "learning_rate": 8.427063498503257e-06, + "logits/chosen": -1.3054721355438232, + "logits/rejected": -1.255484700202942, + "logps/chosen": -120.21586608886719, + "logps/rejected": -152.65411376953125, + "loss": 0.1247, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.5996294021606445, + "rewards/margins": 1.741682529449463, + "rewards/rejected": 5.857946872711182, + "step": 3473 + }, + { + "epoch": 0.56, + "learning_rate": 8.42610639978329e-06, + "logits/chosen": -1.0409775972366333, + "logits/rejected": -1.0656014680862427, + "logps/chosen": -54.51158142089844, + "logps/rejected": -60.5142936706543, + "loss": 0.7241, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3952468633651733, + "rewards/margins": -1.1474508047103882, + "rewards/rejected": 2.5426976680755615, + "step": 3474 + }, + { + "epoch": 0.56, + "learning_rate": 8.425149064351789e-06, + "logits/chosen": -0.7824538946151733, + "logits/rejected": -0.8611904382705688, + "logps/chosen": -25.886669158935547, + "logps/rejected": -69.88462829589844, + "loss": 0.9946, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.35854530334472656, + "rewards/margins": -1.836324691772461, + "rewards/rejected": 2.1948699951171875, + "step": 3475 + }, + { + "epoch": 0.56, + "learning_rate": 8.424191492274898e-06, + "logits/chosen": -1.1083961725234985, + "logits/rejected": -1.0103238821029663, + "logps/chosen": -65.23370361328125, + "logps/rejected": -59.23358154296875, + "loss": 0.4923, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7899506092071533, + "rewards/margins": 0.25072479248046875, + "rewards/rejected": 2.5392258167266846, + "step": 3476 + }, + { + "epoch": 0.56, + "learning_rate": 8.423233683618773e-06, + "logits/chosen": -0.8131034970283508, + "logits/rejected": -0.5278574228286743, + "logps/chosen": -69.56639862060547, + "logps/rejected": -56.80322265625, + "loss": 0.4985, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0454277992248535, + "rewards/margins": 2.6176443099975586, + "rewards/rejected": -0.5722164511680603, + "step": 3477 + }, + { + "epoch": 0.56, + "learning_rate": 8.422275638449592e-06, + "logits/chosen": -1.0956370830535889, + "logits/rejected": -1.2541173696517944, + "logps/chosen": -36.03068923950195, + "logps/rejected": -179.85963439941406, + "loss": 3.7905, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.021451234817505, + "rewards/margins": -6.560826301574707, + "rewards/rejected": 8.582277297973633, + "step": 3478 + }, + { + "epoch": 0.56, + "learning_rate": 8.421317356833547e-06, + "logits/chosen": -1.1634817123413086, + "logits/rejected": -1.1634817123413086, + "logps/chosen": -57.20505905151367, + "logps/rejected": -57.20505905151367, + "loss": 0.3518, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3470609188079834, + "rewards/margins": 0.0, + "rewards/rejected": 3.3470609188079834, + "step": 3479 + }, + { + "epoch": 0.56, + "learning_rate": 8.420358838836846e-06, + "logits/chosen": -1.1466162204742432, + "logits/rejected": -1.1466162204742432, + "logps/chosen": -18.720746994018555, + "logps/rejected": -18.720746994018555, + "loss": 0.3845, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9533720016479492, + "rewards/margins": 0.0, + "rewards/rejected": 1.9533720016479492, + "step": 3480 + }, + { + "epoch": 0.57, + "learning_rate": 8.419400084525712e-06, + "logits/chosen": -0.695404052734375, + "logits/rejected": -0.6906203031539917, + "logps/chosen": -6.674311637878418, + "logps/rejected": -2.4103896617889404, + "loss": 0.8121, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1616470366716385, + "rewards/margins": -0.327688992023468, + "rewards/rejected": 0.4893360137939453, + "step": 3481 + }, + { + "epoch": 0.57, + "learning_rate": 8.418441093966387e-06, + "logits/chosen": -0.6915566325187683, + "logits/rejected": -0.700400710105896, + "logps/chosen": -15.654474258422852, + "logps/rejected": -22.511503219604492, + "loss": 0.9143, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3333999812602997, + "rewards/margins": -0.2909923493862152, + "rewards/rejected": 0.6243923306465149, + "step": 3482 + }, + { + "epoch": 0.57, + "learning_rate": 8.417481867225129e-06, + "logits/chosen": -1.6884912252426147, + "logits/rejected": -1.6599678993225098, + "logps/chosen": -59.38798141479492, + "logps/rejected": -32.39363479614258, + "loss": 0.8558, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.812488317489624, + "rewards/margins": 2.104231834411621, + "rewards/rejected": 0.7082565426826477, + "step": 3483 + }, + { + "epoch": 0.57, + "learning_rate": 8.416522404368208e-06, + "logits/chosen": -1.064410924911499, + "logits/rejected": -1.0505790710449219, + "logps/chosen": -81.89884948730469, + "logps/rejected": -102.84442138671875, + "loss": 0.254, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5235214233398438, + "rewards/margins": 0.4510223865509033, + "rewards/rejected": 3.0724990367889404, + "step": 3484 + }, + { + "epoch": 0.57, + "learning_rate": 8.415562705461918e-06, + "logits/chosen": -0.9978698492050171, + "logits/rejected": -0.9976441860198975, + "logps/chosen": -74.3283462524414, + "logps/rejected": -82.76930236816406, + "loss": 0.5823, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7311943769454956, + "rewards/margins": -0.6523321866989136, + "rewards/rejected": 2.383526563644409, + "step": 3485 + }, + { + "epoch": 0.57, + "learning_rate": 8.414602770572562e-06, + "logits/chosen": -1.1124762296676636, + "logits/rejected": -1.2165991067886353, + "logps/chosen": -181.4599609375, + "logps/rejected": -108.01258850097656, + "loss": 0.1638, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.232705593109131, + "rewards/margins": 1.3826642036437988, + "rewards/rejected": 4.850041389465332, + "step": 3486 + }, + { + "epoch": 0.57, + "learning_rate": 8.413642599766464e-06, + "logits/chosen": -1.368149995803833, + "logits/rejected": -1.2319282293319702, + "logps/chosen": -88.13389587402344, + "logps/rejected": -46.65592956542969, + "loss": 0.025, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.4889678955078125, + "rewards/margins": 4.607688903808594, + "rewards/rejected": 2.8812789916992188, + "step": 3487 + }, + { + "epoch": 0.57, + "learning_rate": 8.412682193109965e-06, + "logits/chosen": -1.0937719345092773, + "logits/rejected": -1.091857671737671, + "logps/chosen": -139.68600463867188, + "logps/rejected": -94.0086898803711, + "loss": 1.4638, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.2472991943359375, + "rewards/margins": 1.9648001194000244, + "rewards/rejected": 2.282499074935913, + "step": 3488 + }, + { + "epoch": 0.57, + "learning_rate": 8.411721550669416e-06, + "logits/chosen": -1.2592400312423706, + "logits/rejected": -1.339752197265625, + "logps/chosen": -113.13945007324219, + "logps/rejected": -118.7552490234375, + "loss": 0.8999, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.945648193359375, + "rewards/margins": -1.6160187721252441, + "rewards/rejected": 4.561666965484619, + "step": 3489 + }, + { + "epoch": 0.57, + "learning_rate": 8.410760672511188e-06, + "logits/chosen": -1.293036937713623, + "logits/rejected": -1.3926141262054443, + "logps/chosen": -230.72613525390625, + "logps/rejected": -124.0470962524414, + "loss": 2.0312, + "rewards/accuracies": 0.0, + "rewards/chosen": 8.14019775390625, + "rewards/margins": -2.2056331634521484, + "rewards/rejected": 10.345830917358398, + "step": 3490 + }, + { + "epoch": 0.57, + "learning_rate": 8.409799558701674e-06, + "logits/chosen": -1.0907188653945923, + "logits/rejected": -1.0046749114990234, + "logps/chosen": -37.52880096435547, + "logps/rejected": -61.390689849853516, + "loss": 0.3383, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3288543224334717, + "rewards/margins": 0.7320140600204468, + "rewards/rejected": 1.596840262413025, + "step": 3491 + }, + { + "epoch": 0.57, + "learning_rate": 8.408838209307273e-06, + "logits/chosen": -1.2651615142822266, + "logits/rejected": -1.0973122119903564, + "logps/chosen": -140.15899658203125, + "logps/rejected": -65.47831726074219, + "loss": 0.0732, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.336248874664307, + "rewards/margins": 2.5316848754882812, + "rewards/rejected": 3.8045639991760254, + "step": 3492 + }, + { + "epoch": 0.57, + "learning_rate": 8.407876624394407e-06, + "logits/chosen": -1.1900800466537476, + "logits/rejected": -1.179697871208191, + "logps/chosen": -118.4236068725586, + "logps/rejected": -52.49474334716797, + "loss": 0.4289, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0648598670959473, + "rewards/margins": -0.23657441139221191, + "rewards/rejected": 2.301434278488159, + "step": 3493 + }, + { + "epoch": 0.57, + "learning_rate": 8.40691480402951e-06, + "logits/chosen": -1.0180872678756714, + "logits/rejected": -0.928482711315155, + "logps/chosen": -50.965850830078125, + "logps/rejected": -16.483922958374023, + "loss": 0.7373, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1789878606796265, + "rewards/margins": 0.5219358205795288, + "rewards/rejected": 0.6570520401000977, + "step": 3494 + }, + { + "epoch": 0.57, + "learning_rate": 8.405952748279039e-06, + "logits/chosen": -1.2840850353240967, + "logits/rejected": -1.2350506782531738, + "logps/chosen": -77.20735168457031, + "logps/rejected": -70.13949584960938, + "loss": 0.5192, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0908095836639404, + "rewards/margins": -0.3848228454589844, + "rewards/rejected": 3.475632429122925, + "step": 3495 + }, + { + "epoch": 0.57, + "learning_rate": 8.404990457209458e-06, + "logits/chosen": -1.0029727220535278, + "logits/rejected": -0.9394962191581726, + "logps/chosen": -71.29963684082031, + "logps/rejected": -45.421329498291016, + "loss": 0.2779, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.007956027984619, + "rewards/margins": 0.30609631538391113, + "rewards/rejected": 1.701859712600708, + "step": 3496 + }, + { + "epoch": 0.57, + "learning_rate": 8.404027930887256e-06, + "logits/chosen": -1.4137729406356812, + "logits/rejected": -0.8533468246459961, + "logps/chosen": -183.79241943359375, + "logps/rejected": -99.96075439453125, + "loss": 0.0278, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.725192546844482, + "rewards/margins": 2.8685672283172607, + "rewards/rejected": 2.8566253185272217, + "step": 3497 + }, + { + "epoch": 0.57, + "learning_rate": 8.403065169378932e-06, + "logits/chosen": -1.087866187095642, + "logits/rejected": -1.0849292278289795, + "logps/chosen": -51.15596008300781, + "logps/rejected": -94.97949981689453, + "loss": 0.291, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.441056966781616, + "rewards/margins": 0.26990222930908203, + "rewards/rejected": 3.171154737472534, + "step": 3498 + }, + { + "epoch": 0.57, + "learning_rate": 8.402102172751005e-06, + "logits/chosen": -1.1114915609359741, + "logits/rejected": -1.0278401374816895, + "logps/chosen": -77.19706726074219, + "logps/rejected": -131.10641479492188, + "loss": 0.5334, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.057330369949341, + "rewards/margins": 0.7916930913925171, + "rewards/rejected": 1.2656372785568237, + "step": 3499 + }, + { + "epoch": 0.57, + "learning_rate": 8.40113894107001e-06, + "logits/chosen": -1.0874378681182861, + "logits/rejected": -1.0123231410980225, + "logps/chosen": -64.9970703125, + "logps/rejected": -80.6809310913086, + "loss": 0.7994, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0085723400115967, + "rewards/margins": -1.3721184730529785, + "rewards/rejected": 3.380690813064575, + "step": 3500 + }, + { + "epoch": 0.57, + "learning_rate": 8.400175474402496e-06, + "logits/chosen": -1.1820284128189087, + "logits/rejected": -0.7673302888870239, + "logps/chosen": -104.24715423583984, + "logps/rejected": -50.775909423828125, + "loss": 1.609, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3601585626602173, + "rewards/margins": -3.1406846046447754, + "rewards/rejected": 4.500843048095703, + "step": 3501 + }, + { + "epoch": 0.57, + "learning_rate": 8.39921177281503e-06, + "logits/chosen": -1.4424784183502197, + "logits/rejected": -1.4306505918502808, + "logps/chosen": -78.61270904541016, + "logps/rejected": -99.44721984863281, + "loss": 0.1497, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.094257354736328, + "rewards/margins": 1.180060625076294, + "rewards/rejected": 0.914196789264679, + "step": 3502 + }, + { + "epoch": 0.57, + "learning_rate": 8.398247836374194e-06, + "logits/chosen": -0.9801880717277527, + "logits/rejected": -0.9801880717277527, + "logps/chosen": -74.55628967285156, + "logps/rejected": -74.55628967285156, + "loss": 0.7467, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0072433948516846, + "rewards/margins": 0.0, + "rewards/rejected": 2.0072433948516846, + "step": 3503 + }, + { + "epoch": 0.57, + "learning_rate": 8.397283665146585e-06, + "logits/chosen": -1.1546356678009033, + "logits/rejected": -1.1785025596618652, + "logps/chosen": -62.90696716308594, + "logps/rejected": -40.64116668701172, + "loss": 0.2712, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.796726942062378, + "rewards/margins": 0.6042594909667969, + "rewards/rejected": 3.192467451095581, + "step": 3504 + }, + { + "epoch": 0.57, + "learning_rate": 8.396319259198822e-06, + "logits/chosen": -0.8272956609725952, + "logits/rejected": -0.795080304145813, + "logps/chosen": -69.97061157226562, + "logps/rejected": -156.3142852783203, + "loss": 0.6979, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3535674810409546, + "rewards/margins": -1.1044663190841675, + "rewards/rejected": 2.458033800125122, + "step": 3505 + }, + { + "epoch": 0.57, + "learning_rate": 8.395354618597533e-06, + "logits/chosen": -1.2052971124649048, + "logits/rejected": -1.0347658395767212, + "logps/chosen": -107.45976257324219, + "logps/rejected": -90.6156005859375, + "loss": 0.0475, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.791302680969238, + "rewards/margins": 6.470318794250488, + "rewards/rejected": 2.32098388671875, + "step": 3506 + }, + { + "epoch": 0.57, + "learning_rate": 8.39438974340937e-06, + "logits/chosen": -0.7289877533912659, + "logits/rejected": -0.7273012399673462, + "logps/chosen": -55.458290100097656, + "logps/rejected": -18.33124542236328, + "loss": 0.3851, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2770133912563324, + "rewards/margins": -0.13607484102249146, + "rewards/rejected": 0.41308823227882385, + "step": 3507 + }, + { + "epoch": 0.57, + "learning_rate": 8.39342463370099e-06, + "logits/chosen": -0.6540461182594299, + "logits/rejected": -0.6087471842765808, + "logps/chosen": -91.9216079711914, + "logps/rejected": -46.88481140136719, + "loss": 0.7461, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0595420598983765, + "rewards/margins": 0.21458125114440918, + "rewards/rejected": 0.8449608087539673, + "step": 3508 + }, + { + "epoch": 0.57, + "learning_rate": 8.39245928953908e-06, + "logits/chosen": -0.8930953145027161, + "logits/rejected": -0.8930953145027161, + "logps/chosen": -17.182876586914062, + "logps/rejected": -17.182876586914062, + "loss": 0.458, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6579275131225586, + "rewards/margins": 0.0, + "rewards/rejected": 0.6579275131225586, + "step": 3509 + }, + { + "epoch": 0.57, + "learning_rate": 8.39149371099033e-06, + "logits/chosen": -1.0003294944763184, + "logits/rejected": -0.9102373123168945, + "logps/chosen": -58.290794372558594, + "logps/rejected": -25.592294692993164, + "loss": 0.0567, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.668524265289307, + "rewards/margins": 2.2465150356292725, + "rewards/rejected": 2.422009229660034, + "step": 3510 + }, + { + "epoch": 0.57, + "learning_rate": 8.390527898121456e-06, + "logits/chosen": -1.1731950044631958, + "logits/rejected": -1.1202691793441772, + "logps/chosen": -197.57533264160156, + "logps/rejected": -98.60606384277344, + "loss": 0.237, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.13821268081665, + "rewards/margins": 3.3498406410217285, + "rewards/rejected": 1.7883720397949219, + "step": 3511 + }, + { + "epoch": 0.57, + "learning_rate": 8.389561850999188e-06, + "logits/chosen": -0.7064866423606873, + "logits/rejected": -0.6969102621078491, + "logps/chosen": -2.0460197925567627, + "logps/rejected": -10.533886909484863, + "loss": 0.7382, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3533784747123718, + "rewards/margins": 0.02969902753829956, + "rewards/rejected": 0.32367944717407227, + "step": 3512 + }, + { + "epoch": 0.57, + "learning_rate": 8.388595569690265e-06, + "logits/chosen": -1.0958868265151978, + "logits/rejected": -1.1133884191513062, + "logps/chosen": -45.34035110473633, + "logps/rejected": -95.84913635253906, + "loss": 1.2806, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2175991535186768, + "rewards/margins": -0.9401736259460449, + "rewards/rejected": 2.1577727794647217, + "step": 3513 + }, + { + "epoch": 0.57, + "learning_rate": 8.387629054261454e-06, + "logits/chosen": -0.7573415040969849, + "logits/rejected": -0.7415181994438171, + "logps/chosen": -52.3975830078125, + "logps/rejected": -37.73177719116211, + "loss": 0.2773, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.724334716796875, + "rewards/margins": 0.6578556299209595, + "rewards/rejected": 1.0664790868759155, + "step": 3514 + }, + { + "epoch": 0.57, + "learning_rate": 8.386662304779529e-06, + "logits/chosen": -0.875349760055542, + "logits/rejected": -0.8467966318130493, + "logps/chosen": -58.505332946777344, + "logps/rejected": -161.79986572265625, + "loss": 0.0709, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.931771993637085, + "rewards/margins": 1.884445309638977, + "rewards/rejected": 1.047326683998108, + "step": 3515 + }, + { + "epoch": 0.57, + "learning_rate": 8.385695321311282e-06, + "logits/chosen": -1.0492844581604004, + "logits/rejected": -1.056187391281128, + "logps/chosen": -84.20831298828125, + "logps/rejected": -85.60951232910156, + "loss": 0.76, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6658661365509033, + "rewards/margins": -0.18948054313659668, + "rewards/rejected": 1.8553466796875, + "step": 3516 + }, + { + "epoch": 0.57, + "learning_rate": 8.384728103923525e-06, + "logits/chosen": -0.9202835559844971, + "logits/rejected": -0.8520441055297852, + "logps/chosen": -35.4825439453125, + "logps/rejected": -39.442543029785156, + "loss": 1.2009, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9336304068565369, + "rewards/margins": -2.1383414268493652, + "rewards/rejected": 3.071971893310547, + "step": 3517 + }, + { + "epoch": 0.57, + "learning_rate": 8.383760652683082e-06, + "logits/chosen": -1.1310577392578125, + "logits/rejected": -1.0773049592971802, + "logps/chosen": -82.15090942382812, + "logps/rejected": -122.38214111328125, + "loss": 0.1475, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.546525478363037, + "rewards/margins": 1.0853376388549805, + "rewards/rejected": 3.4611878395080566, + "step": 3518 + }, + { + "epoch": 0.57, + "learning_rate": 8.382792967656797e-06, + "logits/chosen": -1.1079198122024536, + "logits/rejected": -0.9574032425880432, + "logps/chosen": -129.9550323486328, + "logps/rejected": -49.939300537109375, + "loss": 0.8293, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.476742744445801, + "rewards/margins": 1.396866798400879, + "rewards/rejected": 3.079875946044922, + "step": 3519 + }, + { + "epoch": 0.57, + "learning_rate": 8.381825048911525e-06, + "logits/chosen": -1.7020952701568604, + "logits/rejected": -1.6668747663497925, + "logps/chosen": -197.3990936279297, + "logps/rejected": -162.1866455078125, + "loss": 0.6528, + "rewards/accuracies": 0.0, + "rewards/chosen": 8.175407409667969, + "rewards/margins": -0.9611310958862305, + "rewards/rejected": 9.1365385055542, + "step": 3520 + }, + { + "epoch": 0.57, + "learning_rate": 8.380856896514141e-06, + "logits/chosen": -1.1877456903457642, + "logits/rejected": -1.174912691116333, + "logps/chosen": -47.52720642089844, + "logps/rejected": -71.75363159179688, + "loss": 0.6757, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.568011522293091, + "rewards/margins": 0.6533173322677612, + "rewards/rejected": 1.9146941900253296, + "step": 3521 + }, + { + "epoch": 0.57, + "learning_rate": 8.379888510531536e-06, + "logits/chosen": -1.2270903587341309, + "logits/rejected": -0.9963226318359375, + "logps/chosen": -90.18775177001953, + "logps/rejected": -43.46971130371094, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.58425235748291, + "rewards/margins": 4.59304666519165, + "rewards/rejected": 2.9912056922912598, + "step": 3522 + }, + { + "epoch": 0.57, + "learning_rate": 8.378919891030614e-06, + "logits/chosen": -1.0530192852020264, + "logits/rejected": -1.1705121994018555, + "logps/chosen": -125.36885070800781, + "logps/rejected": -105.48663330078125, + "loss": 2.5053, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2782516479492188, + "rewards/margins": -5.003694534301758, + "rewards/rejected": 8.281946182250977, + "step": 3523 + }, + { + "epoch": 0.57, + "learning_rate": 8.377951038078303e-06, + "logits/chosen": -0.8587679266929626, + "logits/rejected": -0.8270875215530396, + "logps/chosen": -28.189620971679688, + "logps/rejected": -51.51347351074219, + "loss": 0.4978, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.111132860183716, + "rewards/margins": -0.5206634998321533, + "rewards/rejected": 2.631796360015869, + "step": 3524 + }, + { + "epoch": 0.57, + "learning_rate": 8.376981951741535e-06, + "logits/chosen": -1.1149208545684814, + "logits/rejected": -1.020656943321228, + "logps/chosen": -70.03302001953125, + "logps/rejected": -58.22932815551758, + "loss": 0.394, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.101599931716919, + "rewards/margins": -0.17172503471374512, + "rewards/rejected": 3.273324966430664, + "step": 3525 + }, + { + "epoch": 0.57, + "learning_rate": 8.376012632087266e-06, + "logits/chosen": -1.1388291120529175, + "logits/rejected": -1.0003540515899658, + "logps/chosen": -53.45115661621094, + "logps/rejected": -27.319787979125977, + "loss": 0.1273, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7983100414276123, + "rewards/margins": 2.5145504474639893, + "rewards/rejected": 0.2837596833705902, + "step": 3526 + }, + { + "epoch": 0.57, + "learning_rate": 8.37504307918247e-06, + "logits/chosen": -0.916290283203125, + "logits/rejected": -0.8618866205215454, + "logps/chosen": -73.3243637084961, + "logps/rejected": -50.92877960205078, + "loss": 1.0589, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8176735043525696, + "rewards/margins": -1.6952667236328125, + "rewards/rejected": 2.5129401683807373, + "step": 3527 + }, + { + "epoch": 0.57, + "learning_rate": 8.374073293094135e-06, + "logits/chosen": -0.6401549577713013, + "logits/rejected": -0.6487866044044495, + "logps/chosen": -18.842914581298828, + "logps/rejected": -4.413873672485352, + "loss": 0.8581, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0019481659401208162, + "rewards/margins": -0.39783957600593567, + "rewards/rejected": 0.395891398191452, + "step": 3528 + }, + { + "epoch": 0.57, + "learning_rate": 8.373103273889257e-06, + "logits/chosen": -1.265060544013977, + "logits/rejected": -1.1999927759170532, + "logps/chosen": -49.86712646484375, + "logps/rejected": -84.04548645019531, + "loss": 1.0182, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.623983860015869, + "rewards/margins": -0.01982259750366211, + "rewards/rejected": 2.6438064575195312, + "step": 3529 + }, + { + "epoch": 0.57, + "learning_rate": 8.37213302163486e-06, + "logits/chosen": -1.4152929782867432, + "logits/rejected": -1.3922744989395142, + "logps/chosen": -64.50935363769531, + "logps/rejected": -86.53619384765625, + "loss": 0.4542, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.363970160484314, + "rewards/margins": -0.00619661808013916, + "rewards/rejected": 1.3701667785644531, + "step": 3530 + }, + { + "epoch": 0.57, + "learning_rate": 8.371162536397981e-06, + "logits/chosen": -1.3536062240600586, + "logits/rejected": -1.3474522829055786, + "logps/chosen": -98.57079315185547, + "logps/rejected": -99.95057678222656, + "loss": 0.4325, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.827954888343811, + "rewards/margins": 0.9681388735771179, + "rewards/rejected": 0.8598160147666931, + "step": 3531 + }, + { + "epoch": 0.57, + "learning_rate": 8.370191818245667e-06, + "logits/chosen": -1.0371099710464478, + "logits/rejected": -1.0149867534637451, + "logps/chosen": -69.4618148803711, + "logps/rejected": -85.86160278320312, + "loss": 0.4919, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9460365772247314, + "rewards/margins": -0.5143730640411377, + "rewards/rejected": 3.460409641265869, + "step": 3532 + }, + { + "epoch": 0.57, + "learning_rate": 8.369220867244989e-06, + "logits/chosen": -1.2232282161712646, + "logits/rejected": -1.291336178779602, + "logps/chosen": -53.059165954589844, + "logps/rejected": -76.36189270019531, + "loss": 4.2153, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8454301357269287, + "rewards/margins": -4.714101791381836, + "rewards/rejected": 7.559532165527344, + "step": 3533 + }, + { + "epoch": 0.57, + "learning_rate": 8.368249683463028e-06, + "logits/chosen": -1.2334598302841187, + "logits/rejected": -1.208180546760559, + "logps/chosen": -80.76243591308594, + "logps/rejected": -83.35873413085938, + "loss": 0.9947, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.095117092132568, + "rewards/margins": -1.8323001861572266, + "rewards/rejected": 5.927417278289795, + "step": 3534 + }, + { + "epoch": 0.57, + "learning_rate": 8.367278266966882e-06, + "logits/chosen": -0.6613094806671143, + "logits/rejected": -0.8399642109870911, + "logps/chosen": -78.75751495361328, + "logps/rejected": -82.96409606933594, + "loss": 1.4217, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4762756824493408, + "rewards/margins": -2.7250559329986572, + "rewards/rejected": 4.201331615447998, + "step": 3535 + }, + { + "epoch": 0.57, + "learning_rate": 8.366306617823673e-06, + "logits/chosen": -0.8736233115196228, + "logits/rejected": -0.7836530208587646, + "logps/chosen": -74.03914642333984, + "logps/rejected": -31.967979431152344, + "loss": 0.8196, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.433327555656433, + "rewards/margins": 0.5944748520851135, + "rewards/rejected": 0.8388527035713196, + "step": 3536 + }, + { + "epoch": 0.57, + "learning_rate": 8.365334736100528e-06, + "logits/chosen": -1.1422585248947144, + "logits/rejected": -1.1669375896453857, + "logps/chosen": -54.35297393798828, + "logps/rejected": -47.617122650146484, + "loss": 0.6572, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.302945852279663, + "rewards/margins": 0.5426350831985474, + "rewards/rejected": 1.7603107690811157, + "step": 3537 + }, + { + "epoch": 0.57, + "learning_rate": 8.364362621864595e-06, + "logits/chosen": -1.0484676361083984, + "logits/rejected": -1.0520238876342773, + "logps/chosen": -74.53600311279297, + "logps/rejected": -56.297210693359375, + "loss": 0.8923, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7696495056152344, + "rewards/margins": -0.03195035457611084, + "rewards/rejected": 1.8015998601913452, + "step": 3538 + }, + { + "epoch": 0.57, + "learning_rate": 8.36339027518304e-06, + "logits/chosen": -1.0277986526489258, + "logits/rejected": -0.9598159790039062, + "logps/chosen": -69.098876953125, + "logps/rejected": -122.46279907226562, + "loss": 0.4093, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6499886512756348, + "rewards/margins": 0.21285176277160645, + "rewards/rejected": 2.4371368885040283, + "step": 3539 + }, + { + "epoch": 0.57, + "learning_rate": 8.36241769612304e-06, + "logits/chosen": -0.7120608687400818, + "logits/rejected": -0.5416086316108704, + "logps/chosen": -65.47221374511719, + "logps/rejected": -26.548574447631836, + "loss": 0.1904, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.3915252685546875, + "rewards/margins": 3.790097236633301, + "rewards/rejected": 1.6014280319213867, + "step": 3540 + }, + { + "epoch": 0.57, + "learning_rate": 8.361444884751793e-06, + "logits/chosen": -0.8601364493370056, + "logits/rejected": -0.9735944271087646, + "logps/chosen": -70.06676483154297, + "logps/rejected": -56.56679916381836, + "loss": 1.4425, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2527923583984375, + "rewards/margins": -2.5664753913879395, + "rewards/rejected": 4.819267749786377, + "step": 3541 + }, + { + "epoch": 0.57, + "learning_rate": 8.360471841136513e-06, + "logits/chosen": -1.0183759927749634, + "logits/rejected": -0.9447857737541199, + "logps/chosen": -74.34025573730469, + "logps/rejected": -53.725120544433594, + "loss": 0.4917, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4756317138671875, + "rewards/margins": -0.4117577075958252, + "rewards/rejected": 1.8873894214630127, + "step": 3542 + }, + { + "epoch": 0.58, + "learning_rate": 8.359498565344424e-06, + "logits/chosen": -1.0514358282089233, + "logits/rejected": -1.0570855140686035, + "logps/chosen": -93.89350128173828, + "logps/rejected": -122.77464294433594, + "loss": 0.4715, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5392951965332031, + "rewards/margins": 0.23245465755462646, + "rewards/rejected": 1.3068405389785767, + "step": 3543 + }, + { + "epoch": 0.58, + "learning_rate": 8.35852505744277e-06, + "logits/chosen": -0.9975317120552063, + "logits/rejected": -0.9509432911872864, + "logps/chosen": -42.23021697998047, + "logps/rejected": -52.23305130004883, + "loss": 0.7449, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.021909713745117, + "rewards/margins": -1.232908010482788, + "rewards/rejected": 3.2548177242279053, + "step": 3544 + }, + { + "epoch": 0.58, + "learning_rate": 8.357551317498818e-06, + "logits/chosen": -0.835101842880249, + "logits/rejected": -0.7638065218925476, + "logps/chosen": -68.00057220458984, + "logps/rejected": -40.053131103515625, + "loss": 0.6575, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0978516340255737, + "rewards/margins": -0.3439887762069702, + "rewards/rejected": 1.441840410232544, + "step": 3545 + }, + { + "epoch": 0.58, + "learning_rate": 8.356577345579836e-06, + "logits/chosen": -0.8801630139350891, + "logits/rejected": -0.9301292896270752, + "logps/chosen": -88.08467864990234, + "logps/rejected": -131.0153350830078, + "loss": 0.2613, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.051158905029297, + "rewards/margins": 0.39268720149993896, + "rewards/rejected": 1.658471703529358, + "step": 3546 + }, + { + "epoch": 0.58, + "learning_rate": 8.355603141753122e-06, + "logits/chosen": -0.9687728881835938, + "logits/rejected": -0.9687728881835938, + "logps/chosen": -62.363372802734375, + "logps/rejected": -62.363372802734375, + "loss": 0.357, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.586592197418213, + "rewards/margins": 0.0, + "rewards/rejected": 3.586592197418213, + "step": 3547 + }, + { + "epoch": 0.58, + "learning_rate": 8.354628706085978e-06, + "logits/chosen": -0.9979954361915588, + "logits/rejected": -0.8802986145019531, + "logps/chosen": -40.86606979370117, + "logps/rejected": -62.44403839111328, + "loss": 0.3689, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9996391534805298, + "rewards/margins": -0.08368909358978271, + "rewards/rejected": 2.0833282470703125, + "step": 3548 + }, + { + "epoch": 0.58, + "learning_rate": 8.353654038645736e-06, + "logits/chosen": -0.8222123980522156, + "logits/rejected": -0.8153120279312134, + "logps/chosen": -34.02928924560547, + "logps/rejected": -45.33151626586914, + "loss": 0.6377, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4252617359161377, + "rewards/margins": 0.4593113660812378, + "rewards/rejected": 1.9659503698349, + "step": 3549 + }, + { + "epoch": 0.58, + "learning_rate": 8.352679139499731e-06, + "logits/chosen": -1.136013388633728, + "logits/rejected": -1.1507596969604492, + "logps/chosen": -68.78791809082031, + "logps/rejected": -100.85916137695312, + "loss": 0.3007, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8690507411956787, + "rewards/margins": 1.9953300952911377, + "rewards/rejected": 0.8737205862998962, + "step": 3550 + }, + { + "epoch": 0.58, + "learning_rate": 8.35170400871532e-06, + "logits/chosen": -0.9252747297286987, + "logits/rejected": -0.9411820769309998, + "logps/chosen": -80.54187774658203, + "logps/rejected": -89.37803649902344, + "loss": 0.6534, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6635330319404602, + "rewards/margins": -0.986285388469696, + "rewards/rejected": 1.6498184204101562, + "step": 3551 + }, + { + "epoch": 0.58, + "learning_rate": 8.350728646359877e-06, + "logits/chosen": -1.2739852666854858, + "logits/rejected": -1.2718697786331177, + "logps/chosen": -106.2386474609375, + "logps/rejected": -68.43251037597656, + "loss": 0.9848, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.756793260574341, + "rewards/margins": 0.7599624395370483, + "rewards/rejected": 1.9968308210372925, + "step": 3552 + }, + { + "epoch": 0.58, + "learning_rate": 8.34975305250079e-06, + "logits/chosen": -0.7792224884033203, + "logits/rejected": -0.8690018653869629, + "logps/chosen": -71.09783172607422, + "logps/rejected": -89.01724243164062, + "loss": 2.8871, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3772530555725098, + "rewards/margins": -3.5795493125915527, + "rewards/rejected": 5.9568023681640625, + "step": 3553 + }, + { + "epoch": 0.58, + "learning_rate": 8.348777227205462e-06, + "logits/chosen": -1.0592697858810425, + "logits/rejected": -0.9780476093292236, + "logps/chosen": -132.428955078125, + "logps/rejected": -98.9525146484375, + "loss": 0.5077, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7608917951583862, + "rewards/margins": 0.1595292091369629, + "rewards/rejected": 1.6013625860214233, + "step": 3554 + }, + { + "epoch": 0.58, + "learning_rate": 8.347801170541315e-06, + "logits/chosen": -1.4486346244812012, + "logits/rejected": -1.4846196174621582, + "logps/chosen": -79.11589813232422, + "logps/rejected": -95.24040985107422, + "loss": 0.9461, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.640342712402344, + "rewards/margins": -1.5826988220214844, + "rewards/rejected": 6.223041534423828, + "step": 3555 + }, + { + "epoch": 0.58, + "learning_rate": 8.346824882575783e-06, + "logits/chosen": -0.7777573466300964, + "logits/rejected": -0.89215087890625, + "logps/chosen": -93.78317260742188, + "logps/rejected": -98.24402618408203, + "loss": 1.278, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.937164306640625, + "rewards/margins": -1.6330409049987793, + "rewards/rejected": 4.570205211639404, + "step": 3556 + }, + { + "epoch": 0.58, + "learning_rate": 8.345848363376318e-06, + "logits/chosen": -1.1647319793701172, + "logits/rejected": -1.0507818460464478, + "logps/chosen": -200.13418579101562, + "logps/rejected": -37.969871520996094, + "loss": 1.0968, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.512890815734863, + "rewards/margins": 6.772672653198242, + "rewards/rejected": 1.7402184009552002, + "step": 3557 + }, + { + "epoch": 0.58, + "learning_rate": 8.344871613010393e-06, + "logits/chosen": -1.212616205215454, + "logits/rejected": -1.1007198095321655, + "logps/chosen": -78.947509765625, + "logps/rejected": -35.044158935546875, + "loss": 1.5634, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.691265821456909, + "rewards/margins": 0.3813807964324951, + "rewards/rejected": 2.309885025024414, + "step": 3558 + }, + { + "epoch": 0.58, + "learning_rate": 8.343894631545489e-06, + "logits/chosen": -1.253270149230957, + "logits/rejected": -1.1671336889266968, + "logps/chosen": -96.22859954833984, + "logps/rejected": -29.97292709350586, + "loss": 0.7518, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.408071279525757, + "rewards/margins": 1.8947560787200928, + "rewards/rejected": 0.5133152008056641, + "step": 3559 + }, + { + "epoch": 0.58, + "learning_rate": 8.342917419049104e-06, + "logits/chosen": -1.2126182317733765, + "logits/rejected": -1.1845639944076538, + "logps/chosen": -160.4319610595703, + "logps/rejected": -120.12480163574219, + "loss": 0.2841, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.337190628051758, + "rewards/margins": 0.4213075637817383, + "rewards/rejected": 7.9158830642700195, + "step": 3560 + }, + { + "epoch": 0.58, + "learning_rate": 8.341939975588758e-06, + "logits/chosen": -1.1138068437576294, + "logits/rejected": -1.0650761127471924, + "logps/chosen": -78.82160186767578, + "logps/rejected": -168.28538513183594, + "loss": 0.1928, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9314162731170654, + "rewards/margins": 0.7749602794647217, + "rewards/rejected": 2.1564559936523438, + "step": 3561 + }, + { + "epoch": 0.58, + "learning_rate": 8.34096230123198e-06, + "logits/chosen": -0.8337793350219727, + "logits/rejected": -0.806303858757019, + "logps/chosen": -46.164710998535156, + "logps/rejected": -3.9096193313598633, + "loss": 1.7682, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.22906914353370667, + "rewards/margins": -0.19660606980323792, + "rewards/rejected": 0.4256752133369446, + "step": 3562 + }, + { + "epoch": 0.58, + "learning_rate": 8.33998439604632e-06, + "logits/chosen": -1.1114383935928345, + "logits/rejected": -1.0540717840194702, + "logps/chosen": -119.48512268066406, + "logps/rejected": -41.04243469238281, + "loss": 0.1166, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.73241126537323, + "rewards/margins": 1.3637580871582031, + "rewards/rejected": 0.3686531186103821, + "step": 3563 + }, + { + "epoch": 0.58, + "learning_rate": 8.339006260099343e-06, + "logits/chosen": -1.0569714307785034, + "logits/rejected": -0.8059337139129639, + "logps/chosen": -52.554954528808594, + "logps/rejected": -77.57896423339844, + "loss": 0.8388, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2680680751800537, + "rewards/margins": -1.4678122997283936, + "rewards/rejected": 3.7358803749084473, + "step": 3564 + }, + { + "epoch": 0.58, + "learning_rate": 8.338027893458624e-06, + "logits/chosen": -1.3228480815887451, + "logits/rejected": -1.3698383569717407, + "logps/chosen": -109.47728729248047, + "logps/rejected": -158.29208374023438, + "loss": 0.8485, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9278579950332642, + "rewards/margins": -0.6693741083145142, + "rewards/rejected": 1.5972321033477783, + "step": 3565 + }, + { + "epoch": 0.58, + "learning_rate": 8.337049296191766e-06, + "logits/chosen": -0.9113367199897766, + "logits/rejected": -0.9432976841926575, + "logps/chosen": -34.92372131347656, + "logps/rejected": -47.67559814453125, + "loss": 0.9569, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8263649344444275, + "rewards/margins": -1.1285088062286377, + "rewards/rejected": 1.9548736810684204, + "step": 3566 + }, + { + "epoch": 0.58, + "learning_rate": 8.336070468366374e-06, + "logits/chosen": -0.8330450057983398, + "logits/rejected": -0.8735519051551819, + "logps/chosen": -28.878299713134766, + "logps/rejected": -54.66375732421875, + "loss": 1.3743, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2508598566055298, + "rewards/margins": -2.6086549758911133, + "rewards/rejected": 3.8595147132873535, + "step": 3567 + }, + { + "epoch": 0.58, + "learning_rate": 8.33509141005008e-06, + "logits/chosen": -0.9903058409690857, + "logits/rejected": -0.9877771735191345, + "logps/chosen": -97.0749282836914, + "logps/rejected": -138.9075927734375, + "loss": 0.8108, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.704757213592529, + "rewards/margins": -1.3982319831848145, + "rewards/rejected": 6.102989196777344, + "step": 3568 + }, + { + "epoch": 0.58, + "learning_rate": 8.334112121310527e-06, + "logits/chosen": -1.2082648277282715, + "logits/rejected": -1.304650902748108, + "logps/chosen": -179.87022399902344, + "logps/rejected": -131.14764404296875, + "loss": 0.0546, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.613412380218506, + "rewards/margins": 2.917633056640625, + "rewards/rejected": 1.6957794427871704, + "step": 3569 + }, + { + "epoch": 0.58, + "learning_rate": 8.333132602215374e-06, + "logits/chosen": -0.9774518609046936, + "logits/rejected": -0.9793988466262817, + "logps/chosen": -57.39086151123047, + "logps/rejected": -77.97381591796875, + "loss": 0.5217, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.35343337059021, + "rewards/margins": 1.2181763648986816, + "rewards/rejected": 1.1352570056915283, + "step": 3570 + }, + { + "epoch": 0.58, + "learning_rate": 8.332152852832297e-06, + "logits/chosen": -1.156826376914978, + "logits/rejected": -1.0029404163360596, + "logps/chosen": -155.35653686523438, + "logps/rejected": -84.18826293945312, + "loss": 0.9707, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.16237211227417, + "rewards/margins": 3.886261224746704, + "rewards/rejected": 3.276110887527466, + "step": 3571 + }, + { + "epoch": 0.58, + "learning_rate": 8.331172873228987e-06, + "logits/chosen": -1.4031319618225098, + "logits/rejected": -1.219671368598938, + "logps/chosen": -106.80401611328125, + "logps/rejected": -14.405838012695312, + "loss": 0.2198, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.9293365478515625, + "rewards/margins": 5.044928073883057, + "rewards/rejected": 0.8844084143638611, + "step": 3572 + }, + { + "epoch": 0.58, + "learning_rate": 8.33019266347315e-06, + "logits/chosen": -1.4774830341339111, + "logits/rejected": -1.6019179821014404, + "logps/chosen": -80.4444351196289, + "logps/rejected": -34.46819305419922, + "loss": 0.4242, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.147298574447632, + "rewards/margins": 1.7836755514144897, + "rewards/rejected": 0.3636230528354645, + "step": 3573 + }, + { + "epoch": 0.58, + "learning_rate": 8.329212223632511e-06, + "logits/chosen": -1.050108790397644, + "logits/rejected": -0.9488932490348816, + "logps/chosen": -87.31123352050781, + "logps/rejected": -55.0466423034668, + "loss": 0.3311, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.088580369949341, + "rewards/margins": 0.06494784355163574, + "rewards/rejected": 2.023632526397705, + "step": 3574 + }, + { + "epoch": 0.58, + "learning_rate": 8.328231553774809e-06, + "logits/chosen": -1.1155258417129517, + "logits/rejected": -1.117732286453247, + "logps/chosen": -98.58193969726562, + "logps/rejected": -76.56505584716797, + "loss": 0.4397, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2858383655548096, + "rewards/margins": -0.2388617992401123, + "rewards/rejected": 1.5247001647949219, + "step": 3575 + }, + { + "epoch": 0.58, + "learning_rate": 8.327250653967798e-06, + "logits/chosen": -0.5138430595397949, + "logits/rejected": -0.46869558095932007, + "logps/chosen": -40.89727020263672, + "logps/rejected": -16.163936614990234, + "loss": 0.7657, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6763527393341064, + "rewards/margins": 0.5396435260772705, + "rewards/rejected": 1.136709213256836, + "step": 3576 + }, + { + "epoch": 0.58, + "learning_rate": 8.326269524279251e-06, + "logits/chosen": -0.9422988891601562, + "logits/rejected": -0.8602176308631897, + "logps/chosen": -87.26953125, + "logps/rejected": -34.38984298706055, + "loss": 0.2109, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8142249584198, + "rewards/margins": 2.538067579269409, + "rewards/rejected": 0.2761573791503906, + "step": 3577 + }, + { + "epoch": 0.58, + "learning_rate": 8.325288164776952e-06, + "logits/chosen": -1.1244187355041504, + "logits/rejected": -1.150115728378296, + "logps/chosen": -50.47754669189453, + "logps/rejected": -52.76030731201172, + "loss": 1.1687, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9535598754882812, + "rewards/margins": -0.9705650806427002, + "rewards/rejected": 2.9241249561309814, + "step": 3578 + }, + { + "epoch": 0.58, + "learning_rate": 8.324306575528707e-06, + "logits/chosen": -1.2748395204544067, + "logits/rejected": -1.3625425100326538, + "logps/chosen": -118.93035125732422, + "logps/rejected": -150.91409301757812, + "loss": 0.6713, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.945053815841675, + "rewards/margins": -1.016394853591919, + "rewards/rejected": 3.9614486694335938, + "step": 3579 + }, + { + "epoch": 0.58, + "learning_rate": 8.32332475660233e-06, + "logits/chosen": -1.3329771757125854, + "logits/rejected": -1.2332724332809448, + "logps/chosen": -71.1369400024414, + "logps/rejected": -18.01244354248047, + "loss": 0.4995, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6923843622207642, + "rewards/margins": 0.3058176338672638, + "rewards/rejected": 0.38656672835350037, + "step": 3580 + }, + { + "epoch": 0.58, + "learning_rate": 8.322342708065659e-06, + "logits/chosen": -1.116237998008728, + "logits/rejected": -1.1211858987808228, + "logps/chosen": -74.15902709960938, + "logps/rejected": -80.76107788085938, + "loss": 0.7752, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7559837698936462, + "rewards/margins": -0.2984122633934021, + "rewards/rejected": 1.0543960332870483, + "step": 3581 + }, + { + "epoch": 0.58, + "learning_rate": 8.321360429986542e-06, + "logits/chosen": -0.8653913140296936, + "logits/rejected": -0.9751315116882324, + "logps/chosen": -111.79110717773438, + "logps/rejected": -132.4893798828125, + "loss": 1.3512, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.910552978515625, + "rewards/margins": -2.5467162132263184, + "rewards/rejected": 4.457269191741943, + "step": 3582 + }, + { + "epoch": 0.58, + "learning_rate": 8.320377922432847e-06, + "logits/chosen": -1.2166019678115845, + "logits/rejected": -1.2636914253234863, + "logps/chosen": -103.35203552246094, + "logps/rejected": -160.32371520996094, + "loss": 0.6974, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.370898723602295, + "rewards/margins": -1.1062331199645996, + "rewards/rejected": 6.4771318435668945, + "step": 3583 + }, + { + "epoch": 0.58, + "learning_rate": 8.319395185472456e-06, + "logits/chosen": -0.9550497531890869, + "logits/rejected": -0.9922373294830322, + "logps/chosen": -47.47472381591797, + "logps/rejected": -54.667388916015625, + "loss": 0.7444, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.942151665687561, + "rewards/margins": -1.100305199623108, + "rewards/rejected": 3.042456865310669, + "step": 3584 + }, + { + "epoch": 0.58, + "learning_rate": 8.318412219173266e-06, + "logits/chosen": -1.3609906435012817, + "logits/rejected": -1.3519352674484253, + "logps/chosen": -198.3021240234375, + "logps/rejected": -72.59950256347656, + "loss": 0.021, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.729342937469482, + "rewards/margins": 3.5429887771606445, + "rewards/rejected": 4.186354160308838, + "step": 3585 + }, + { + "epoch": 0.58, + "learning_rate": 8.31742902360319e-06, + "logits/chosen": -0.9606279730796814, + "logits/rejected": -0.8995246887207031, + "logps/chosen": -66.8166275024414, + "logps/rejected": -75.21985626220703, + "loss": 0.6515, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5576095581054688, + "rewards/margins": -0.9430336952209473, + "rewards/rejected": 3.500643253326416, + "step": 3586 + }, + { + "epoch": 0.58, + "learning_rate": 8.316445598830158e-06, + "logits/chosen": -1.053885579109192, + "logits/rejected": -1.030017375946045, + "logps/chosen": -75.45323181152344, + "logps/rejected": -87.34207153320312, + "loss": 0.9225, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.51806640625, + "rewards/margins": 2.015718936920166, + "rewards/rejected": 0.5023475885391235, + "step": 3587 + }, + { + "epoch": 0.58, + "learning_rate": 8.315461944922118e-06, + "logits/chosen": -0.8650799989700317, + "logits/rejected": -0.8641846776008606, + "logps/chosen": -110.27103424072266, + "logps/rejected": -114.38639831542969, + "loss": 2.0831, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7920310497283936, + "rewards/margins": 1.2486518621444702, + "rewards/rejected": 1.5433791875839233, + "step": 3588 + }, + { + "epoch": 0.58, + "learning_rate": 8.314478061947027e-06, + "logits/chosen": -0.6614581942558289, + "logits/rejected": -0.6226271390914917, + "logps/chosen": -29.857345581054688, + "logps/rejected": -19.306875228881836, + "loss": 0.6351, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.617056667804718, + "rewards/margins": 0.2551496624946594, + "rewards/rejected": 0.3619070053100586, + "step": 3589 + }, + { + "epoch": 0.58, + "learning_rate": 8.313493949972863e-06, + "logits/chosen": -0.9223896265029907, + "logits/rejected": -0.859632134437561, + "logps/chosen": -60.5788688659668, + "logps/rejected": -79.4100341796875, + "loss": 0.4772, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.558778762817383, + "rewards/margins": -0.23679471015930176, + "rewards/rejected": 2.7955734729766846, + "step": 3590 + }, + { + "epoch": 0.58, + "learning_rate": 8.312509609067621e-06, + "logits/chosen": -0.95671147108078, + "logits/rejected": -0.9429455399513245, + "logps/chosen": -84.3026351928711, + "logps/rejected": -139.86090087890625, + "loss": 0.3874, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.897469401359558, + "rewards/margins": -0.1552649736404419, + "rewards/rejected": 2.052734375, + "step": 3591 + }, + { + "epoch": 0.58, + "learning_rate": 8.311525039299308e-06, + "logits/chosen": -1.611922264099121, + "logits/rejected": -1.5745795965194702, + "logps/chosen": -71.0079116821289, + "logps/rejected": -137.87673950195312, + "loss": 0.7088, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.068000316619873, + "rewards/margins": -0.840451717376709, + "rewards/rejected": 5.908452033996582, + "step": 3592 + }, + { + "epoch": 0.58, + "learning_rate": 8.310540240735948e-06, + "logits/chosen": -1.246363878250122, + "logits/rejected": -1.320698857307434, + "logps/chosen": -78.41980743408203, + "logps/rejected": -69.52174377441406, + "loss": 0.4282, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4327125549316406, + "rewards/margins": -0.2210395336151123, + "rewards/rejected": 3.653752088546753, + "step": 3593 + }, + { + "epoch": 0.58, + "learning_rate": 8.309555213445583e-06, + "logits/chosen": -1.113417148590088, + "logits/rejected": -0.965665876865387, + "logps/chosen": -109.06129455566406, + "logps/rejected": -68.0580062866211, + "loss": 0.028, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.101176738739014, + "rewards/margins": 2.8692080974578857, + "rewards/rejected": 3.231968641281128, + "step": 3594 + }, + { + "epoch": 0.58, + "learning_rate": 8.308569957496268e-06, + "logits/chosen": -0.9831488132476807, + "logits/rejected": -1.0247437953948975, + "logps/chosen": -66.37710571289062, + "logps/rejected": -163.93875122070312, + "loss": 0.7077, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1522934436798096, + "rewards/margins": -1.1173248291015625, + "rewards/rejected": 2.269618272781372, + "step": 3595 + }, + { + "epoch": 0.58, + "learning_rate": 8.307584472956073e-06, + "logits/chosen": -1.024793267250061, + "logits/rejected": -0.9426535964012146, + "logps/chosen": -123.72491455078125, + "logps/rejected": -72.95989227294922, + "loss": 0.6325, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.403268337249756, + "rewards/margins": 4.198037147521973, + "rewards/rejected": 3.2052314281463623, + "step": 3596 + }, + { + "epoch": 0.58, + "learning_rate": 8.30659875989309e-06, + "logits/chosen": -0.9239441156387329, + "logits/rejected": -0.9181857705116272, + "logps/chosen": -47.015647888183594, + "logps/rejected": -32.496986389160156, + "loss": 0.528, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.32110291719436646, + "rewards/margins": -0.5948963165283203, + "rewards/rejected": 0.9159992337226868, + "step": 3597 + }, + { + "epoch": 0.58, + "learning_rate": 8.305612818375419e-06, + "logits/chosen": -1.0418407917022705, + "logits/rejected": -1.0313252210617065, + "logps/chosen": -77.81019592285156, + "logps/rejected": -59.24536895751953, + "loss": 0.1521, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.518308401107788, + "rewards/margins": 1.3573991060256958, + "rewards/rejected": 1.1609092950820923, + "step": 3598 + }, + { + "epoch": 0.58, + "learning_rate": 8.304626648471182e-06, + "logits/chosen": -0.8504645228385925, + "logits/rejected": -0.8504645228385925, + "logps/chosen": -48.68907928466797, + "logps/rejected": -48.68907928466797, + "loss": 0.3526, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5562232732772827, + "rewards/margins": 0.0, + "rewards/rejected": 1.5562232732772827, + "step": 3599 + }, + { + "epoch": 0.58, + "learning_rate": 8.30364025024851e-06, + "logits/chosen": -0.9894342422485352, + "logits/rejected": -0.9919606447219849, + "logps/chosen": -11.909529685974121, + "logps/rejected": -5.593132019042969, + "loss": 0.6694, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.547984778881073, + "rewards/margins": 0.1012585461139679, + "rewards/rejected": 0.4467262327671051, + "step": 3600 + }, + { + "epoch": 0.58, + "learning_rate": 8.302653623775556e-06, + "logits/chosen": -1.066288948059082, + "logits/rejected": -1.066288948059082, + "logps/chosen": -45.20465850830078, + "logps/rejected": -45.20465850830078, + "loss": 0.4649, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.626600742340088, + "rewards/margins": 0.0, + "rewards/rejected": 2.626600742340088, + "step": 3601 + }, + { + "epoch": 0.58, + "learning_rate": 8.301666769120488e-06, + "logits/chosen": -1.0480268001556396, + "logits/rejected": -1.2153890132904053, + "logps/chosen": -69.42596435546875, + "logps/rejected": -125.48484802246094, + "loss": 2.809, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5580108165740967, + "rewards/margins": -5.606210708618164, + "rewards/rejected": 9.16422176361084, + "step": 3602 + }, + { + "epoch": 0.58, + "learning_rate": 8.300679686351484e-06, + "logits/chosen": -1.0128077268600464, + "logits/rejected": -0.9537009596824646, + "logps/chosen": -117.37300109863281, + "logps/rejected": -68.31151580810547, + "loss": 0.0491, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.218299865722656, + "rewards/margins": 3.6333601474761963, + "rewards/rejected": 2.58493971824646, + "step": 3603 + }, + { + "epoch": 0.58, + "learning_rate": 8.299692375536749e-06, + "logits/chosen": -1.249573826789856, + "logits/rejected": -1.0813910961151123, + "logps/chosen": -70.24594116210938, + "logps/rejected": -21.321489334106445, + "loss": 0.1368, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2494583129882812, + "rewards/margins": 2.7754411697387695, + "rewards/rejected": 0.4740171432495117, + "step": 3604 + }, + { + "epoch": 0.59, + "learning_rate": 8.29870483674449e-06, + "logits/chosen": -1.2277430295944214, + "logits/rejected": -1.303995966911316, + "logps/chosen": -161.78329467773438, + "logps/rejected": -52.31597900390625, + "loss": 0.1605, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.701886177062988, + "rewards/margins": 2.0896265506744385, + "rewards/rejected": 2.61225962638855, + "step": 3605 + }, + { + "epoch": 0.59, + "learning_rate": 8.297717070042942e-06, + "logits/chosen": -1.0734103918075562, + "logits/rejected": -0.9901937246322632, + "logps/chosen": -34.50043869018555, + "logps/rejected": -7.174890041351318, + "loss": 0.2224, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.034501314163208, + "rewards/margins": 1.4515966176986694, + "rewards/rejected": 0.5829046964645386, + "step": 3606 + }, + { + "epoch": 0.59, + "learning_rate": 8.296729075500345e-06, + "logits/chosen": -1.3618431091308594, + "logits/rejected": -1.3840781450271606, + "logps/chosen": -94.03349304199219, + "logps/rejected": -36.88605880737305, + "loss": 0.2968, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6124465465545654, + "rewards/margins": 0.26981115341186523, + "rewards/rejected": 2.3426353931427, + "step": 3607 + }, + { + "epoch": 0.59, + "learning_rate": 8.295740853184963e-06, + "logits/chosen": -0.8365685939788818, + "logits/rejected": -0.8346179127693176, + "logps/chosen": -2.091599225997925, + "logps/rejected": -4.411052703857422, + "loss": 0.3628, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3408243954181671, + "rewards/margins": 0.019024550914764404, + "rewards/rejected": 0.3217998445034027, + "step": 3608 + }, + { + "epoch": 0.59, + "learning_rate": 8.294752403165075e-06, + "logits/chosen": -1.3144930601119995, + "logits/rejected": -1.2782196998596191, + "logps/chosen": -91.27586364746094, + "logps/rejected": -78.29261779785156, + "loss": 1.0375, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3891212940216064, + "rewards/margins": -1.9402854442596436, + "rewards/rejected": 4.32940673828125, + "step": 3609 + }, + { + "epoch": 0.59, + "learning_rate": 8.29376372550897e-06, + "logits/chosen": -0.8705814480781555, + "logits/rejected": -0.8755319118499756, + "logps/chosen": -53.93533706665039, + "logps/rejected": -93.49034881591797, + "loss": 1.9151, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5068908929824829, + "rewards/margins": -1.134345293045044, + "rewards/rejected": 1.6412361860275269, + "step": 3610 + }, + { + "epoch": 0.59, + "learning_rate": 8.292774820284956e-06, + "logits/chosen": -0.9819475412368774, + "logits/rejected": -0.9819475412368774, + "logps/chosen": -24.719667434692383, + "logps/rejected": -24.719667434692383, + "loss": 0.6507, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.009665108285844326, + "rewards/margins": 0.0, + "rewards/rejected": -0.009665108285844326, + "step": 3611 + }, + { + "epoch": 0.59, + "learning_rate": 8.29178568756136e-06, + "logits/chosen": -0.9952448606491089, + "logits/rejected": -1.0077133178710938, + "logps/chosen": -75.28819274902344, + "logps/rejected": -101.112060546875, + "loss": 1.0848, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.585790991783142, + "rewards/margins": 0.4799933433532715, + "rewards/rejected": 1.1057976484298706, + "step": 3612 + }, + { + "epoch": 0.59, + "learning_rate": 8.290796327406521e-06, + "logits/chosen": -1.0433769226074219, + "logits/rejected": -1.009645700454712, + "logps/chosen": -47.684593200683594, + "logps/rejected": -112.03071594238281, + "loss": 1.0289, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.385722041130066, + "rewards/margins": -0.5634555816650391, + "rewards/rejected": 1.949177622795105, + "step": 3613 + }, + { + "epoch": 0.59, + "learning_rate": 8.289806739888791e-06, + "logits/chosen": -0.8709517121315002, + "logits/rejected": -0.8709517121315002, + "logps/chosen": -20.417972564697266, + "logps/rejected": -20.417972564697266, + "loss": 1.1112, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.618172526359558, + "rewards/margins": 0.0, + "rewards/rejected": 1.618172526359558, + "step": 3614 + }, + { + "epoch": 0.59, + "learning_rate": 8.288816925076547e-06, + "logits/chosen": -1.3806655406951904, + "logits/rejected": -1.3952547311782837, + "logps/chosen": -83.12811279296875, + "logps/rejected": -67.66714477539062, + "loss": 0.4424, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.825801134109497, + "rewards/margins": 1.609283447265625, + "rewards/rejected": 1.216517686843872, + "step": 3615 + }, + { + "epoch": 0.59, + "learning_rate": 8.28782688303817e-06, + "logits/chosen": -0.810432493686676, + "logits/rejected": -0.810432493686676, + "logps/chosen": -22.302047729492188, + "logps/rejected": -22.302047729492188, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2677648067474365, + "rewards/margins": 0.0, + "rewards/rejected": 2.2677648067474365, + "step": 3616 + }, + { + "epoch": 0.59, + "learning_rate": 8.286836613842065e-06, + "logits/chosen": -0.8320448994636536, + "logits/rejected": -0.9121785759925842, + "logps/chosen": -110.89167785644531, + "logps/rejected": -127.67875671386719, + "loss": 1.293, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9292770624160767, + "rewards/margins": -2.492830276489258, + "rewards/rejected": 4.422107219696045, + "step": 3617 + }, + { + "epoch": 0.59, + "learning_rate": 8.28584611755665e-06, + "logits/chosen": -0.8078868985176086, + "logits/rejected": -0.8078868985176086, + "logps/chosen": -2.4373202323913574, + "logps/rejected": -2.4373202323913574, + "loss": 0.3608, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3170793056488037, + "rewards/margins": 0.0, + "rewards/rejected": 0.3170793056488037, + "step": 3618 + }, + { + "epoch": 0.59, + "learning_rate": 8.284855394250362e-06, + "logits/chosen": -0.6678931713104248, + "logits/rejected": -0.6678931713104248, + "logps/chosen": -61.618995666503906, + "logps/rejected": -61.618995666503906, + "loss": 0.9357, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2963783740997314, + "rewards/margins": 0.0, + "rewards/rejected": 3.2963783740997314, + "step": 3619 + }, + { + "epoch": 0.59, + "learning_rate": 8.283864443991645e-06, + "logits/chosen": -1.184861183166504, + "logits/rejected": -1.0221610069274902, + "logps/chosen": -89.57546997070312, + "logps/rejected": -47.59326171875, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.759744167327881, + "rewards/margins": 4.500129699707031, + "rewards/rejected": 2.2596147060394287, + "step": 3620 + }, + { + "epoch": 0.59, + "learning_rate": 8.28287326684897e-06, + "logits/chosen": -1.251222014427185, + "logits/rejected": -1.0605360269546509, + "logps/chosen": -211.3837432861328, + "logps/rejected": -36.0512580871582, + "loss": 0.0216, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.737602233886719, + "rewards/margins": 3.4447293281555176, + "rewards/rejected": 4.292872905731201, + "step": 3621 + }, + { + "epoch": 0.59, + "learning_rate": 8.281881862890813e-06, + "logits/chosen": -1.003663182258606, + "logits/rejected": -1.0418875217437744, + "logps/chosen": -186.64305114746094, + "logps/rejected": -101.55023193359375, + "loss": 0.0221, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.110777378082275, + "rewards/margins": 4.3039045333862305, + "rewards/rejected": 1.8068726062774658, + "step": 3622 + }, + { + "epoch": 0.59, + "learning_rate": 8.280890232185673e-06, + "logits/chosen": -0.9724187254905701, + "logits/rejected": -0.9157528281211853, + "logps/chosen": -113.23709869384766, + "logps/rejected": -75.3676528930664, + "loss": 0.2286, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.0078911781311035, + "rewards/margins": 0.9384210109710693, + "rewards/rejected": 3.069470167160034, + "step": 3623 + }, + { + "epoch": 0.59, + "learning_rate": 8.279898374802062e-06, + "logits/chosen": -0.9714276790618896, + "logits/rejected": -1.0921130180358887, + "logps/chosen": -118.4698486328125, + "logps/rejected": -103.3348388671875, + "loss": 2.7666, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3727325201034546, + "rewards/margins": -5.503819465637207, + "rewards/rejected": 6.876552104949951, + "step": 3624 + }, + { + "epoch": 0.59, + "learning_rate": 8.278906290808508e-06, + "logits/chosen": -1.270508885383606, + "logits/rejected": -1.219542145729065, + "logps/chosen": -75.52546691894531, + "logps/rejected": -49.361412048339844, + "loss": 1.0687, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9134705066680908, + "rewards/margins": -1.9081573486328125, + "rewards/rejected": 3.8216278553009033, + "step": 3625 + }, + { + "epoch": 0.59, + "learning_rate": 8.277913980273556e-06, + "logits/chosen": -0.9520442485809326, + "logits/rejected": -1.0995450019836426, + "logps/chosen": -43.546443939208984, + "logps/rejected": -105.32969665527344, + "loss": 0.9454, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.165715456008911, + "rewards/margins": -0.9029445648193359, + "rewards/rejected": 3.068660020828247, + "step": 3626 + }, + { + "epoch": 0.59, + "learning_rate": 8.276921443265761e-06, + "logits/chosen": -1.155240535736084, + "logits/rejected": -1.1469252109527588, + "logps/chosen": -53.118141174316406, + "logps/rejected": -66.26309967041016, + "loss": 0.717, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1845985651016235, + "rewards/margins": -1.0185829401016235, + "rewards/rejected": 2.203181505203247, + "step": 3627 + }, + { + "epoch": 0.59, + "learning_rate": 8.275928679853704e-06, + "logits/chosen": -1.2083128690719604, + "logits/rejected": -1.1862133741378784, + "logps/chosen": -67.4111328125, + "logps/rejected": -63.784610748291016, + "loss": 1.0182, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.362945556640625, + "rewards/margins": -1.6993191242218018, + "rewards/rejected": 3.0622646808624268, + "step": 3628 + }, + { + "epoch": 0.59, + "learning_rate": 8.27493569010597e-06, + "logits/chosen": -1.2306005954742432, + "logits/rejected": -1.2838765382766724, + "logps/chosen": -42.5850715637207, + "logps/rejected": -87.43568420410156, + "loss": 0.557, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.764713704586029, + "rewards/margins": -0.704194962978363, + "rewards/rejected": 1.468908667564392, + "step": 3629 + }, + { + "epoch": 0.59, + "learning_rate": 8.273942474091168e-06, + "logits/chosen": -0.8513854146003723, + "logits/rejected": -0.8429811000823975, + "logps/chosen": -58.273983001708984, + "logps/rejected": -37.92662811279297, + "loss": 0.8156, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.713328242301941, + "rewards/margins": 0.010734200477600098, + "rewards/rejected": 1.7025940418243408, + "step": 3630 + }, + { + "epoch": 0.59, + "learning_rate": 8.27294903187792e-06, + "logits/chosen": -0.7119784355163574, + "logits/rejected": -0.7175961136817932, + "logps/chosen": -3.4453601837158203, + "logps/rejected": -2.7122538089752197, + "loss": 0.661, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.23990760743618011, + "rewards/margins": -0.06589238345623016, + "rewards/rejected": 0.3057999908924103, + "step": 3631 + }, + { + "epoch": 0.59, + "learning_rate": 8.271955363534862e-06, + "logits/chosen": -0.9786986112594604, + "logits/rejected": -1.0173923969268799, + "logps/chosen": -44.549068450927734, + "logps/rejected": -44.965545654296875, + "loss": 0.6841, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6177352666854858, + "rewards/margins": -0.678421139717102, + "rewards/rejected": 2.296156406402588, + "step": 3632 + }, + { + "epoch": 0.59, + "learning_rate": 8.270961469130649e-06, + "logits/chosen": -1.1719400882720947, + "logits/rejected": -1.1719400882720947, + "logps/chosen": -5.120321750640869, + "logps/rejected": -5.120321750640869, + "loss": 1.1835, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5470715165138245, + "rewards/margins": 0.0, + "rewards/rejected": 0.5470715165138245, + "step": 3633 + }, + { + "epoch": 0.59, + "learning_rate": 8.269967348733947e-06, + "logits/chosen": -0.8129758834838867, + "logits/rejected": -0.7215462327003479, + "logps/chosen": -54.60261154174805, + "logps/rejected": -42.78710174560547, + "loss": 1.3893, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9726139307022095, + "rewards/margins": -1.7765017747879028, + "rewards/rejected": 3.7491157054901123, + "step": 3634 + }, + { + "epoch": 0.59, + "learning_rate": 8.268973002413444e-06, + "logits/chosen": -0.9869852662086487, + "logits/rejected": -1.0038222074508667, + "logps/chosen": -81.83240509033203, + "logps/rejected": -71.89322662353516, + "loss": 0.9544, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2949020862579346, + "rewards/margins": -0.34289395809173584, + "rewards/rejected": 1.6377960443496704, + "step": 3635 + }, + { + "epoch": 0.59, + "learning_rate": 8.267978430237835e-06, + "logits/chosen": -1.2637821435928345, + "logits/rejected": -1.281198501586914, + "logps/chosen": -70.60968780517578, + "logps/rejected": -131.27978515625, + "loss": 1.169, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.318645477294922, + "rewards/margins": -0.6127052307128906, + "rewards/rejected": 2.9313507080078125, + "step": 3636 + }, + { + "epoch": 0.59, + "learning_rate": 8.266983632275842e-06, + "logits/chosen": -1.1822030544281006, + "logits/rejected": -1.1142255067825317, + "logps/chosen": -60.7225456237793, + "logps/rejected": -18.873943328857422, + "loss": 0.187, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4945735931396484, + "rewards/margins": 0.9861785769462585, + "rewards/rejected": 0.5083950161933899, + "step": 3637 + }, + { + "epoch": 0.59, + "learning_rate": 8.265988608596189e-06, + "logits/chosen": -1.0793285369873047, + "logits/rejected": -0.9428990483283997, + "logps/chosen": -61.0528564453125, + "logps/rejected": -20.927906036376953, + "loss": 0.3081, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.258481740951538, + "rewards/margins": 1.6216042041778564, + "rewards/rejected": 0.6368774771690369, + "step": 3638 + }, + { + "epoch": 0.59, + "learning_rate": 8.264993359267627e-06, + "logits/chosen": -1.1065759658813477, + "logits/rejected": -0.9836918115615845, + "logps/chosen": -84.681396484375, + "logps/rejected": -52.065216064453125, + "loss": 0.7542, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6908950805664062, + "rewards/margins": -0.23077940940856934, + "rewards/rejected": 2.9216744899749756, + "step": 3639 + }, + { + "epoch": 0.59, + "learning_rate": 8.26399788435892e-06, + "logits/chosen": -0.8595405220985413, + "logits/rejected": -0.8595405220985413, + "logps/chosen": -2.44199800491333, + "logps/rejected": -2.44199800491333, + "loss": 0.3466, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2531648576259613, + "rewards/margins": 0.0, + "rewards/rejected": 0.2531648576259613, + "step": 3640 + }, + { + "epoch": 0.59, + "learning_rate": 8.263002183938841e-06, + "logits/chosen": -1.2006691694259644, + "logits/rejected": -1.1658930778503418, + "logps/chosen": -62.418609619140625, + "logps/rejected": -79.3564453125, + "loss": 0.37, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.007943868637085, + "rewards/margins": 0.08650219440460205, + "rewards/rejected": 1.921441674232483, + "step": 3641 + }, + { + "epoch": 0.59, + "learning_rate": 8.262006258076187e-06, + "logits/chosen": -1.2871180772781372, + "logits/rejected": -1.1730352640151978, + "logps/chosen": -102.6290283203125, + "logps/rejected": -81.426025390625, + "loss": 1.4432, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.285400390625, + "rewards/margins": -0.35271310806274414, + "rewards/rejected": 4.638113498687744, + "step": 3642 + }, + { + "epoch": 0.59, + "learning_rate": 8.261010106839766e-06, + "logits/chosen": -1.28307044506073, + "logits/rejected": -1.3088041543960571, + "logps/chosen": -68.44804382324219, + "logps/rejected": -122.03208923339844, + "loss": 0.5559, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.792299747467041, + "rewards/margins": 0.1416022777557373, + "rewards/rejected": 2.6506974697113037, + "step": 3643 + }, + { + "epoch": 0.59, + "learning_rate": 8.260013730298403e-06, + "logits/chosen": -1.0035977363586426, + "logits/rejected": -0.9332768321037292, + "logps/chosen": -107.02859497070312, + "logps/rejected": -64.69633483886719, + "loss": 0.4867, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9174530506134033, + "rewards/margins": -0.49080348014831543, + "rewards/rejected": 3.4082565307617188, + "step": 3644 + }, + { + "epoch": 0.59, + "learning_rate": 8.259017128520936e-06, + "logits/chosen": -0.7641477584838867, + "logits/rejected": -0.6515612602233887, + "logps/chosen": -62.25365447998047, + "logps/rejected": -42.13494873046875, + "loss": 1.1015, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1767539978027344, + "rewards/margins": 0.5944015383720398, + "rewards/rejected": 0.5823524594306946, + "step": 3645 + }, + { + "epoch": 0.59, + "learning_rate": 8.258020301576224e-06, + "logits/chosen": -0.7879962921142578, + "logits/rejected": -0.71427983045578, + "logps/chosen": -45.22829055786133, + "logps/rejected": -40.72522735595703, + "loss": 0.3218, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.994516372680664, + "rewards/margins": 1.0892726182937622, + "rewards/rejected": 1.9052437543869019, + "step": 3646 + }, + { + "epoch": 0.59, + "learning_rate": 8.257023249533137e-06, + "logits/chosen": -1.0129456520080566, + "logits/rejected": -1.0321093797683716, + "logps/chosen": -70.98037719726562, + "logps/rejected": -56.273983001708984, + "loss": 0.2126, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8027405738830566, + "rewards/margins": 0.6918256282806396, + "rewards/rejected": 2.110914945602417, + "step": 3647 + }, + { + "epoch": 0.59, + "learning_rate": 8.256025972460561e-06, + "logits/chosen": -0.8030633330345154, + "logits/rejected": -0.7212675213813782, + "logps/chosen": -80.99713134765625, + "logps/rejected": -71.32093811035156, + "loss": 1.2696, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9593521356582642, + "rewards/margins": -2.452056884765625, + "rewards/rejected": 4.4114089012146, + "step": 3648 + }, + { + "epoch": 0.59, + "learning_rate": 8.2550284704274e-06, + "logits/chosen": -0.9541094899177551, + "logits/rejected": -0.9552827477455139, + "logps/chosen": -6.7558088302612305, + "logps/rejected": -4.011384010314941, + "loss": 1.5805, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.052599240094423294, + "rewards/margins": -0.19827796518802643, + "rewards/rejected": 0.2508772015571594, + "step": 3649 + }, + { + "epoch": 0.59, + "learning_rate": 8.25403074350257e-06, + "logits/chosen": -0.9604385495185852, + "logits/rejected": -0.8358603119850159, + "logps/chosen": -52.16123580932617, + "logps/rejected": -15.433727264404297, + "loss": 0.212, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3144009113311768, + "rewards/margins": 0.6607744693756104, + "rewards/rejected": 0.6536264419555664, + "step": 3650 + }, + { + "epoch": 0.59, + "learning_rate": 8.253032791755004e-06, + "logits/chosen": -0.9823417663574219, + "logits/rejected": -0.8781678080558777, + "logps/chosen": -88.33082580566406, + "logps/rejected": -77.92591094970703, + "loss": 0.1758, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.226830244064331, + "rewards/margins": 1.72579026222229, + "rewards/rejected": 0.5010399222373962, + "step": 3651 + }, + { + "epoch": 0.59, + "learning_rate": 8.252034615253657e-06, + "logits/chosen": -0.989219605922699, + "logits/rejected": -1.1032987833023071, + "logps/chosen": -71.37213134765625, + "logps/rejected": -84.38078308105469, + "loss": 1.7345, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8471214771270752, + "rewards/margins": -3.431795358657837, + "rewards/rejected": 5.278916835784912, + "step": 3652 + }, + { + "epoch": 0.59, + "learning_rate": 8.251036214067485e-06, + "logits/chosen": -1.0075920820236206, + "logits/rejected": -1.0344141721725464, + "logps/chosen": -52.442588806152344, + "logps/rejected": -107.50843048095703, + "loss": 3.7996, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4877891540527344, + "rewards/margins": -5.1698150634765625, + "rewards/rejected": 7.657604217529297, + "step": 3653 + }, + { + "epoch": 0.59, + "learning_rate": 8.250037588265473e-06, + "logits/chosen": -1.2141354084014893, + "logits/rejected": -1.242273211479187, + "logps/chosen": -108.78639221191406, + "logps/rejected": -79.964111328125, + "loss": 1.0781, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5877487659454346, + "rewards/margins": -1.9953911304473877, + "rewards/rejected": 3.5831398963928223, + "step": 3654 + }, + { + "epoch": 0.59, + "learning_rate": 8.249038737916617e-06, + "logits/chosen": -1.1499148607254028, + "logits/rejected": -1.1613094806671143, + "logps/chosen": -115.86236572265625, + "logps/rejected": -54.78992462158203, + "loss": 0.1103, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.541586399078369, + "rewards/margins": 2.020636796951294, + "rewards/rejected": 2.520949602127075, + "step": 3655 + }, + { + "epoch": 0.59, + "learning_rate": 8.248039663089927e-06, + "logits/chosen": -1.4947441816329956, + "logits/rejected": -1.5027068853378296, + "logps/chosen": -164.17919921875, + "logps/rejected": -91.4398193359375, + "loss": 0.2972, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.433886766433716, + "rewards/margins": 1.1174018383026123, + "rewards/rejected": 2.3164849281311035, + "step": 3656 + }, + { + "epoch": 0.59, + "learning_rate": 8.247040363854428e-06, + "logits/chosen": -1.4511172771453857, + "logits/rejected": -1.3789546489715576, + "logps/chosen": -67.16124725341797, + "logps/rejected": -73.44700622558594, + "loss": 0.5367, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.993793487548828, + "rewards/margins": 0.686704158782959, + "rewards/rejected": 2.307089328765869, + "step": 3657 + }, + { + "epoch": 0.59, + "learning_rate": 8.246040840279165e-06, + "logits/chosen": -0.3502196669578552, + "logits/rejected": -0.3524513244628906, + "logps/chosen": -4.252894878387451, + "logps/rejected": -2.387223482131958, + "loss": 0.3662, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.18283496797084808, + "rewards/margins": -0.053854480385780334, + "rewards/rejected": 0.23668944835662842, + "step": 3658 + }, + { + "epoch": 0.59, + "learning_rate": 8.245041092433194e-06, + "logits/chosen": -1.0268840789794922, + "logits/rejected": -1.0268840789794922, + "logps/chosen": -69.29199981689453, + "logps/rejected": -69.29199981689453, + "loss": 0.7292, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6520973443984985, + "rewards/margins": 0.0, + "rewards/rejected": 1.6520973443984985, + "step": 3659 + }, + { + "epoch": 0.59, + "learning_rate": 8.244041120385589e-06, + "logits/chosen": -0.7694761753082275, + "logits/rejected": -0.7657777667045593, + "logps/chosen": -9.393807411193848, + "logps/rejected": -6.065830230712891, + "loss": 0.4681, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4279918670654297, + "rewards/margins": -0.019007772207260132, + "rewards/rejected": 0.4469996392726898, + "step": 3660 + }, + { + "epoch": 0.59, + "learning_rate": 8.243040924205436e-06, + "logits/chosen": -1.117472529411316, + "logits/rejected": -1.1139121055603027, + "logps/chosen": -78.54319763183594, + "logps/rejected": -95.43388366699219, + "loss": 0.4035, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.168247938156128, + "rewards/margins": 0.6102477312088013, + "rewards/rejected": 1.5580002069473267, + "step": 3661 + }, + { + "epoch": 0.59, + "learning_rate": 8.242040503961843e-06, + "logits/chosen": -0.8320045471191406, + "logits/rejected": -0.8169984817504883, + "logps/chosen": -18.31663703918457, + "logps/rejected": -2.1388537883758545, + "loss": 0.5768, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3777574598789215, + "rewards/margins": 0.006628096103668213, + "rewards/rejected": 0.3711293637752533, + "step": 3662 + }, + { + "epoch": 0.59, + "learning_rate": 8.241039859723928e-06, + "logits/chosen": -1.281855583190918, + "logits/rejected": -1.4121514558792114, + "logps/chosen": -267.4840087890625, + "logps/rejected": -151.43621826171875, + "loss": 0.53, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.550344944000244, + "rewards/margins": -0.609837532043457, + "rewards/rejected": 6.160182476043701, + "step": 3663 + }, + { + "epoch": 0.59, + "learning_rate": 8.240038991560823e-06, + "logits/chosen": -0.9050081372261047, + "logits/rejected": -0.7791141271591187, + "logps/chosen": -36.964393615722656, + "logps/rejected": -49.34918975830078, + "loss": 2.5488, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4219970703125, + "rewards/margins": 1.3932143449783325, + "rewards/rejected": 1.0287827253341675, + "step": 3664 + }, + { + "epoch": 0.59, + "learning_rate": 8.239037899541683e-06, + "logits/chosen": -0.971416175365448, + "logits/rejected": -0.9466843008995056, + "logps/chosen": -150.49594116210938, + "logps/rejected": -49.95372009277344, + "loss": 0.0251, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.927796840667725, + "rewards/margins": 3.6616921424865723, + "rewards/rejected": 1.2661045789718628, + "step": 3665 + }, + { + "epoch": 0.6, + "learning_rate": 8.238036583735673e-06, + "logits/chosen": -0.8040683269500732, + "logits/rejected": -0.8034688234329224, + "logps/chosen": -1.9263179302215576, + "logps/rejected": -12.788187980651855, + "loss": 0.4985, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29412949085235596, + "rewards/margins": 0.21935322880744934, + "rewards/rejected": 0.07477626949548721, + "step": 3666 + }, + { + "epoch": 0.6, + "learning_rate": 8.237035044211972e-06, + "logits/chosen": -0.8752327561378479, + "logits/rejected": -0.8466672897338867, + "logps/chosen": -109.4451675415039, + "logps/rejected": -73.85563659667969, + "loss": 0.1909, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.407355785369873, + "rewards/margins": 0.9194526672363281, + "rewards/rejected": 4.487903118133545, + "step": 3667 + }, + { + "epoch": 0.6, + "learning_rate": 8.23603328103978e-06, + "logits/chosen": -0.8810357451438904, + "logits/rejected": -0.7525648474693298, + "logps/chosen": -109.69406127929688, + "logps/rejected": -86.3785400390625, + "loss": 0.0685, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.249728679656982, + "rewards/margins": 2.1523959636688232, + "rewards/rejected": 3.097332715988159, + "step": 3668 + }, + { + "epoch": 0.6, + "learning_rate": 8.235031294288306e-06, + "logits/chosen": -1.4697715044021606, + "logits/rejected": -1.3326780796051025, + "logps/chosen": -98.98423767089844, + "logps/rejected": -16.09670639038086, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.170252799987793, + "rewards/margins": 7.139132976531982, + "rewards/rejected": 1.031119704246521, + "step": 3669 + }, + { + "epoch": 0.6, + "learning_rate": 8.234029084026782e-06, + "logits/chosen": -1.3929247856140137, + "logits/rejected": -1.444778561592102, + "logps/chosen": -164.67135620117188, + "logps/rejected": -128.59564208984375, + "loss": 0.7705, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.080863952636719, + "rewards/margins": 0.08700847625732422, + "rewards/rejected": 6.9938554763793945, + "step": 3670 + }, + { + "epoch": 0.6, + "learning_rate": 8.233026650324446e-06, + "logits/chosen": -1.0658113956451416, + "logits/rejected": -1.0881550312042236, + "logps/chosen": -153.90045166015625, + "logps/rejected": -74.20463562011719, + "loss": 0.0767, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4601166248321533, + "rewards/margins": 1.8240318298339844, + "rewards/rejected": 1.636084794998169, + "step": 3671 + }, + { + "epoch": 0.6, + "learning_rate": 8.232023993250561e-06, + "logits/chosen": -1.0126161575317383, + "logits/rejected": -1.005844235420227, + "logps/chosen": -69.66931915283203, + "logps/rejected": -51.52434539794922, + "loss": 0.214, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4552459716796875, + "rewards/margins": 1.3332068920135498, + "rewards/rejected": 1.1220390796661377, + "step": 3672 + }, + { + "epoch": 0.6, + "learning_rate": 8.231021112874401e-06, + "logits/chosen": -0.935823917388916, + "logits/rejected": -0.9375699162483215, + "logps/chosen": -87.75944519042969, + "logps/rejected": -74.20970916748047, + "loss": 0.2377, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1618354320526123, + "rewards/margins": 2.539377450942993, + "rewards/rejected": 0.6224579215049744, + "step": 3673 + }, + { + "epoch": 0.6, + "learning_rate": 8.230018009265255e-06, + "logits/chosen": -0.7366033792495728, + "logits/rejected": -0.6553540229797363, + "logps/chosen": -67.5908432006836, + "logps/rejected": -40.72596740722656, + "loss": 0.3488, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.001575469970703, + "rewards/margins": 1.3103049993515015, + "rewards/rejected": 0.6912704706192017, + "step": 3674 + }, + { + "epoch": 0.6, + "learning_rate": 8.229014682492425e-06, + "logits/chosen": -1.3059585094451904, + "logits/rejected": -1.3261473178863525, + "logps/chosen": -92.63965606689453, + "logps/rejected": -83.11174011230469, + "loss": 0.2993, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6399192810058594, + "rewards/margins": 0.2863593101501465, + "rewards/rejected": 2.353559970855713, + "step": 3675 + }, + { + "epoch": 0.6, + "learning_rate": 8.228011132625234e-06, + "logits/chosen": -1.1073378324508667, + "logits/rejected": -1.1017272472381592, + "logps/chosen": -95.63616943359375, + "logps/rejected": -83.12298583984375, + "loss": 0.2706, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.738320827484131, + "rewards/margins": 0.3535337448120117, + "rewards/rejected": 4.384787082672119, + "step": 3676 + }, + { + "epoch": 0.6, + "learning_rate": 8.227007359733018e-06, + "logits/chosen": -0.5550425052642822, + "logits/rejected": -0.7185045480728149, + "logps/chosen": -73.6610107421875, + "logps/rejected": -74.39176940917969, + "loss": 0.524, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9593108892440796, + "rewards/margins": -0.6044143438339233, + "rewards/rejected": 2.563725233078003, + "step": 3677 + }, + { + "epoch": 0.6, + "learning_rate": 8.226003363885128e-06, + "logits/chosen": -1.2302132844924927, + "logits/rejected": -1.1996432542800903, + "logps/chosen": -54.65399169921875, + "logps/rejected": -47.15337371826172, + "loss": 0.7507, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8396164178848267, + "rewards/margins": -0.33206260204315186, + "rewards/rejected": 2.1716790199279785, + "step": 3678 + }, + { + "epoch": 0.6, + "learning_rate": 8.22499914515093e-06, + "logits/chosen": -1.4092164039611816, + "logits/rejected": -1.5040940046310425, + "logps/chosen": -64.68781280517578, + "logps/rejected": -139.4572296142578, + "loss": 3.4302, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6824455261230469, + "rewards/margins": -6.45761775970459, + "rewards/rejected": 8.140063285827637, + "step": 3679 + }, + { + "epoch": 0.6, + "learning_rate": 8.223994703599806e-06, + "logits/chosen": -0.875626802444458, + "logits/rejected": -0.8754526972770691, + "logps/chosen": -1.8100377321243286, + "logps/rejected": -13.977252006530762, + "loss": 0.5845, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2405511438846588, + "rewards/margins": -0.10453677177429199, + "rewards/rejected": 0.3450879156589508, + "step": 3680 + }, + { + "epoch": 0.6, + "learning_rate": 8.222990039301153e-06, + "logits/chosen": -0.6862250566482544, + "logits/rejected": -0.718500554561615, + "logps/chosen": -1.2858206033706665, + "logps/rejected": -20.947227478027344, + "loss": 0.3913, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2606881260871887, + "rewards/margins": -0.1369839608669281, + "rewards/rejected": 0.3976720869541168, + "step": 3681 + }, + { + "epoch": 0.6, + "learning_rate": 8.221985152324385e-06, + "logits/chosen": -1.076117992401123, + "logits/rejected": -1.0806275606155396, + "logps/chosen": -9.260920524597168, + "logps/rejected": -2.907435417175293, + "loss": 0.8066, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.22117853164672852, + "rewards/margins": -0.1977536678314209, + "rewards/rejected": 0.4189321994781494, + "step": 3682 + }, + { + "epoch": 0.6, + "learning_rate": 8.220980042738931e-06, + "logits/chosen": -0.9907081723213196, + "logits/rejected": -0.937674343585968, + "logps/chosen": -57.717803955078125, + "logps/rejected": -69.08047485351562, + "loss": 1.3579, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.001028537750244, + "rewards/margins": -1.4060606956481934, + "rewards/rejected": 3.4070892333984375, + "step": 3683 + }, + { + "epoch": 0.6, + "learning_rate": 8.219974710614232e-06, + "logits/chosen": -1.3045135736465454, + "logits/rejected": -1.2526296377182007, + "logps/chosen": -146.11451721191406, + "logps/rejected": -86.71530151367188, + "loss": 1.3343, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5834624767303467, + "rewards/margins": -0.27509021759033203, + "rewards/rejected": 2.8585526943206787, + "step": 3684 + }, + { + "epoch": 0.6, + "learning_rate": 8.21896915601975e-06, + "logits/chosen": -0.761659562587738, + "logits/rejected": -0.8003969788551331, + "logps/chosen": -69.13321685791016, + "logps/rejected": -66.28252410888672, + "loss": 0.7416, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4485191106796265, + "rewards/margins": 0.07335805892944336, + "rewards/rejected": 1.375161051750183, + "step": 3685 + }, + { + "epoch": 0.6, + "learning_rate": 8.217963379024955e-06, + "logits/chosen": -0.8230567574501038, + "logits/rejected": -0.6531266570091248, + "logps/chosen": -63.05384063720703, + "logps/rejected": -30.681251525878906, + "loss": 0.3327, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3123191595077515, + "rewards/margins": 0.9592929482460022, + "rewards/rejected": 0.35302621126174927, + "step": 3686 + }, + { + "epoch": 0.6, + "learning_rate": 8.216957379699339e-06, + "logits/chosen": -0.9249072074890137, + "logits/rejected": -0.9033749103546143, + "logps/chosen": -156.14830017089844, + "logps/rejected": -81.18233489990234, + "loss": 0.3117, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.62017822265625, + "rewards/margins": 3.423290252685547, + "rewards/rejected": 1.1968879699707031, + "step": 3687 + }, + { + "epoch": 0.6, + "learning_rate": 8.21595115811241e-06, + "logits/chosen": -0.5209156274795532, + "logits/rejected": -0.5209156274795532, + "logps/chosen": -1.6909613609313965, + "logps/rejected": -1.6909613609313965, + "loss": 0.4629, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2633090019226074, + "rewards/margins": 0.0, + "rewards/rejected": 0.2633090019226074, + "step": 3688 + }, + { + "epoch": 0.6, + "learning_rate": 8.214944714333683e-06, + "logits/chosen": -1.0143061876296997, + "logits/rejected": -0.9732896685600281, + "logps/chosen": -52.39342498779297, + "logps/rejected": -53.43711853027344, + "loss": 1.6518, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4977127313613892, + "rewards/margins": -3.0108652114868164, + "rewards/rejected": 4.508577823638916, + "step": 3689 + }, + { + "epoch": 0.6, + "learning_rate": 8.213938048432697e-06, + "logits/chosen": -1.130632758140564, + "logits/rejected": -1.130632758140564, + "logps/chosen": -69.56619262695312, + "logps/rejected": -69.56619262695312, + "loss": 0.5196, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.909282684326172, + "rewards/margins": 0.0, + "rewards/rejected": 2.909282684326172, + "step": 3690 + }, + { + "epoch": 0.6, + "learning_rate": 8.212931160479003e-06, + "logits/chosen": -1.0749388933181763, + "logits/rejected": -1.0618009567260742, + "logps/chosen": -59.6002197265625, + "logps/rejected": -49.39122772216797, + "loss": 0.8585, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.575993299484253, + "rewards/margins": -1.4212219715118408, + "rewards/rejected": 3.9972152709960938, + "step": 3691 + }, + { + "epoch": 0.6, + "learning_rate": 8.211924050542165e-06, + "logits/chosen": -1.0124760866165161, + "logits/rejected": -0.7765843868255615, + "logps/chosen": -58.05536651611328, + "logps/rejected": -46.74137496948242, + "loss": 1.2728, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8041818141937256, + "rewards/margins": 1.6785907745361328, + "rewards/rejected": 2.1255910396575928, + "step": 3692 + }, + { + "epoch": 0.6, + "learning_rate": 8.21091671869177e-06, + "logits/chosen": -1.542894721031189, + "logits/rejected": -1.5803557634353638, + "logps/chosen": -67.2708969116211, + "logps/rejected": -53.68025207519531, + "loss": 1.5563, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9257591366767883, + "rewards/margins": -1.972665548324585, + "rewards/rejected": 2.8984246253967285, + "step": 3693 + }, + { + "epoch": 0.6, + "learning_rate": 8.209909164997409e-06, + "logits/chosen": -1.1813652515411377, + "logits/rejected": -1.350821614265442, + "logps/chosen": -128.98779296875, + "logps/rejected": -121.84600830078125, + "loss": 2.2822, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2542405128479004, + "rewards/margins": -2.6830368041992188, + "rewards/rejected": 4.937277317047119, + "step": 3694 + }, + { + "epoch": 0.6, + "learning_rate": 8.208901389528699e-06, + "logits/chosen": -0.7092471122741699, + "logits/rejected": -0.7092471122741699, + "logps/chosen": -0.557942271232605, + "logps/rejected": -0.557942271232605, + "loss": 1.7849, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.09246689081192017, + "rewards/margins": 0.0, + "rewards/rejected": 0.09246689081192017, + "step": 3695 + }, + { + "epoch": 0.6, + "learning_rate": 8.207893392355264e-06, + "logits/chosen": -1.116191029548645, + "logits/rejected": -0.9774516224861145, + "logps/chosen": -115.80471801757812, + "logps/rejected": -45.646331787109375, + "loss": 0.1584, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.643426418304443, + "rewards/margins": 1.1295666694641113, + "rewards/rejected": 4.513859748840332, + "step": 3696 + }, + { + "epoch": 0.6, + "learning_rate": 8.206885173546751e-06, + "logits/chosen": -0.9493253231048584, + "logits/rejected": -0.911186933517456, + "logps/chosen": -105.03683471679688, + "logps/rejected": -50.362308502197266, + "loss": 1.0771, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1140191555023193, + "rewards/margins": -0.0058383941650390625, + "rewards/rejected": 2.1198575496673584, + "step": 3697 + }, + { + "epoch": 0.6, + "learning_rate": 8.205876733172813e-06, + "logits/chosen": -0.9386415481567383, + "logits/rejected": -0.9734980463981628, + "logps/chosen": -40.40729522705078, + "logps/rejected": -93.56411743164062, + "loss": 0.6901, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4285202026367188, + "rewards/margins": -0.7615318298339844, + "rewards/rejected": 2.190052032470703, + "step": 3698 + }, + { + "epoch": 0.6, + "learning_rate": 8.204868071303131e-06, + "logits/chosen": -0.7699649930000305, + "logits/rejected": -0.780940592288971, + "logps/chosen": -1.5974392890930176, + "logps/rejected": -34.833133697509766, + "loss": 0.5486, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3380657732486725, + "rewards/margins": -0.5713210105895996, + "rewards/rejected": 0.9093868136405945, + "step": 3699 + }, + { + "epoch": 0.6, + "learning_rate": 8.203859188007388e-06, + "logits/chosen": -1.0875238180160522, + "logits/rejected": -1.049527883529663, + "logps/chosen": -96.9525146484375, + "logps/rejected": -94.25666046142578, + "loss": 0.4002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.92943274974823, + "rewards/margins": 0.053496599197387695, + "rewards/rejected": 1.8759361505508423, + "step": 3700 + }, + { + "epoch": 0.6, + "learning_rate": 8.20285008335529e-06, + "logits/chosen": -1.031423568725586, + "logits/rejected": -1.0704333782196045, + "logps/chosen": -126.79010009765625, + "logps/rejected": -82.27345275878906, + "loss": 0.4988, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.979266405105591, + "rewards/margins": 0.7177824974060059, + "rewards/rejected": 2.261483907699585, + "step": 3701 + }, + { + "epoch": 0.6, + "learning_rate": 8.201840757416558e-06, + "logits/chosen": -1.2991794347763062, + "logits/rejected": -0.6705231666564941, + "logps/chosen": -83.01566314697266, + "logps/rejected": -149.90110778808594, + "loss": 1.6625, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4697471857070923, + "rewards/margins": 0.3606163263320923, + "rewards/rejected": 1.109130859375, + "step": 3702 + }, + { + "epoch": 0.6, + "learning_rate": 8.200831210260924e-06, + "logits/chosen": -1.5027889013290405, + "logits/rejected": -1.3470641374588013, + "logps/chosen": -121.87884521484375, + "logps/rejected": -17.4053955078125, + "loss": 0.4183, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.279074668884277, + "rewards/margins": 8.399552345275879, + "rewards/rejected": 0.8795223236083984, + "step": 3703 + }, + { + "epoch": 0.6, + "learning_rate": 8.199821441958142e-06, + "logits/chosen": -0.9847660064697266, + "logits/rejected": -0.9986342191696167, + "logps/chosen": -80.05868530273438, + "logps/rejected": -96.22364807128906, + "loss": 0.22, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4031295776367188, + "rewards/margins": 0.5982093811035156, + "rewards/rejected": 1.8049201965332031, + "step": 3704 + }, + { + "epoch": 0.6, + "learning_rate": 8.198811452577974e-06, + "logits/chosen": -1.133398175239563, + "logits/rejected": -1.1182152032852173, + "logps/chosen": -156.07070922851562, + "logps/rejected": -95.14497375488281, + "loss": 0.2679, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.058809280395508, + "rewards/margins": 0.36026763916015625, + "rewards/rejected": 8.698541641235352, + "step": 3705 + }, + { + "epoch": 0.6, + "learning_rate": 8.197801242190204e-06, + "logits/chosen": -1.0211920738220215, + "logits/rejected": -1.0211920738220215, + "logps/chosen": -62.26670455932617, + "logps/rejected": -62.26670455932617, + "loss": 0.6763, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1053829193115234, + "rewards/margins": 0.0, + "rewards/rejected": 3.1053829193115234, + "step": 3706 + }, + { + "epoch": 0.6, + "learning_rate": 8.196790810864624e-06, + "logits/chosen": -1.3241043090820312, + "logits/rejected": -1.2711776494979858, + "logps/chosen": -101.09618377685547, + "logps/rejected": -189.48934936523438, + "loss": 0.7692, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.272341251373291, + "rewards/margins": -1.2600107192993164, + "rewards/rejected": 7.532351970672607, + "step": 3707 + }, + { + "epoch": 0.6, + "learning_rate": 8.195780158671047e-06, + "logits/chosen": -1.009334683418274, + "logits/rejected": -0.8474266529083252, + "logps/chosen": -72.78907775878906, + "logps/rejected": -33.74139404296875, + "loss": 0.3347, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.710836887359619, + "rewards/margins": 2.306807041168213, + "rewards/rejected": 0.40402984619140625, + "step": 3708 + }, + { + "epoch": 0.6, + "learning_rate": 8.1947692856793e-06, + "logits/chosen": -0.5916063189506531, + "logits/rejected": -0.6399379372596741, + "logps/chosen": -75.6246109008789, + "logps/rejected": -58.95545959472656, + "loss": 0.9707, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.221315860748291, + "rewards/margins": 0.4952981472015381, + "rewards/rejected": 2.726017713546753, + "step": 3709 + }, + { + "epoch": 0.6, + "learning_rate": 8.193758191959227e-06, + "logits/chosen": -1.3102073669433594, + "logits/rejected": -1.2758798599243164, + "logps/chosen": -80.48079681396484, + "logps/rejected": -143.54351806640625, + "loss": 0.8413, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.2082743644714355, + "rewards/margins": -1.3387107849121094, + "rewards/rejected": 7.546985149383545, + "step": 3710 + }, + { + "epoch": 0.6, + "learning_rate": 8.19274687758068e-06, + "logits/chosen": -1.1791545152664185, + "logits/rejected": -1.1233603954315186, + "logps/chosen": -63.60313415527344, + "logps/rejected": -72.9569091796875, + "loss": 0.1718, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.323133945465088, + "rewards/margins": 0.9022324085235596, + "rewards/rejected": 1.4209015369415283, + "step": 3711 + }, + { + "epoch": 0.6, + "learning_rate": 8.191735342613533e-06, + "logits/chosen": -1.2821224927902222, + "logits/rejected": -1.200777530670166, + "logps/chosen": -157.18649291992188, + "logps/rejected": -107.23763275146484, + "loss": 0.3617, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.886210918426514, + "rewards/margins": -0.04853248596191406, + "rewards/rejected": 6.934743404388428, + "step": 3712 + }, + { + "epoch": 0.6, + "learning_rate": 8.190723587127678e-06, + "logits/chosen": -1.103135585784912, + "logits/rejected": -1.1279120445251465, + "logps/chosen": -78.08845520019531, + "logps/rejected": -171.91726684570312, + "loss": 0.7661, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9644958972930908, + "rewards/margins": 1.0308289527893066, + "rewards/rejected": 0.933667004108429, + "step": 3713 + }, + { + "epoch": 0.6, + "learning_rate": 8.189711611193012e-06, + "logits/chosen": -1.1599740982055664, + "logits/rejected": -1.0023179054260254, + "logps/chosen": -56.216835021972656, + "logps/rejected": -55.22406768798828, + "loss": 0.359, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.334169864654541, + "rewards/margins": 0.5019562244415283, + "rewards/rejected": 2.8322136402130127, + "step": 3714 + }, + { + "epoch": 0.6, + "learning_rate": 8.188699414879454e-06, + "logits/chosen": -0.8378043174743652, + "logits/rejected": -0.8397362232208252, + "logps/chosen": -73.47489929199219, + "logps/rejected": -84.01228332519531, + "loss": 0.3211, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7911773920059204, + "rewards/margins": 0.1749328374862671, + "rewards/rejected": 1.6162445545196533, + "step": 3715 + }, + { + "epoch": 0.6, + "learning_rate": 8.187686998256939e-06, + "logits/chosen": -1.2691696882247925, + "logits/rejected": -1.2807424068450928, + "logps/chosen": -150.1270294189453, + "logps/rejected": -150.27664184570312, + "loss": 0.425, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.973118782043457, + "rewards/margins": -0.2808837890625, + "rewards/rejected": 7.254002571105957, + "step": 3716 + }, + { + "epoch": 0.6, + "learning_rate": 8.186674361395414e-06, + "logits/chosen": -1.0861806869506836, + "logits/rejected": -1.0491658449172974, + "logps/chosen": -47.138397216796875, + "logps/rejected": -79.99822235107422, + "loss": 0.2861, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.289004921913147, + "rewards/margins": 0.26743054389953613, + "rewards/rejected": 1.0215743780136108, + "step": 3717 + }, + { + "epoch": 0.6, + "learning_rate": 8.185661504364845e-06, + "logits/chosen": -1.2101802825927734, + "logits/rejected": -1.091764211654663, + "logps/chosen": -71.55409240722656, + "logps/rejected": -64.62820434570312, + "loss": 0.8422, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4124183654785156, + "rewards/margins": -1.381990909576416, + "rewards/rejected": 3.7944092750549316, + "step": 3718 + }, + { + "epoch": 0.6, + "learning_rate": 8.184648427235208e-06, + "logits/chosen": -1.1585031747817993, + "logits/rejected": -1.0426217317581177, + "logps/chosen": -160.7220916748047, + "logps/rejected": -94.06912231445312, + "loss": 0.0953, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.277604579925537, + "rewards/margins": 1.8091199398040771, + "rewards/rejected": 3.46848464012146, + "step": 3719 + }, + { + "epoch": 0.6, + "learning_rate": 8.183635130076496e-06, + "logits/chosen": -1.38620924949646, + "logits/rejected": -1.3328948020935059, + "logps/chosen": -138.90463256835938, + "logps/rejected": -164.71978759765625, + "loss": 0.1095, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.161197185516357, + "rewards/margins": 1.9741168022155762, + "rewards/rejected": 3.1870803833007812, + "step": 3720 + }, + { + "epoch": 0.6, + "learning_rate": 8.182621612958724e-06, + "logits/chosen": -1.341293454170227, + "logits/rejected": -1.295749545097351, + "logps/chosen": -49.31013488769531, + "logps/rejected": -69.58625793457031, + "loss": 0.5766, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.249218702316284, + "rewards/margins": 0.2871764898300171, + "rewards/rejected": 1.962042212486267, + "step": 3721 + }, + { + "epoch": 0.6, + "learning_rate": 8.181607875951911e-06, + "logits/chosen": -0.8579310774803162, + "logits/rejected": -0.918820858001709, + "logps/chosen": -50.118377685546875, + "logps/rejected": -236.60708618164062, + "loss": 2.2305, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.882958173751831, + "rewards/margins": -4.380866050720215, + "rewards/rejected": 7.263824462890625, + "step": 3722 + }, + { + "epoch": 0.6, + "learning_rate": 8.180593919126098e-06, + "logits/chosen": -0.9226903319358826, + "logits/rejected": -0.8411650061607361, + "logps/chosen": -151.09295654296875, + "logps/rejected": -116.00420379638672, + "loss": 0.1737, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7511414289474487, + "rewards/margins": 0.9365540146827698, + "rewards/rejected": 0.814587414264679, + "step": 3723 + }, + { + "epoch": 0.6, + "learning_rate": 8.179579742551341e-06, + "logits/chosen": -0.9474272131919861, + "logits/rejected": -1.0636537075042725, + "logps/chosen": -41.40513229370117, + "logps/rejected": -87.85455322265625, + "loss": 1.0441, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.939000368118286, + "rewards/margins": -1.8469388484954834, + "rewards/rejected": 4.7859392166137695, + "step": 3724 + }, + { + "epoch": 0.6, + "learning_rate": 8.178565346297709e-06, + "logits/chosen": -1.0286297798156738, + "logits/rejected": -0.9563832879066467, + "logps/chosen": -42.31031036376953, + "logps/rejected": -54.03907012939453, + "loss": 0.6005, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1967926025390625, + "rewards/margins": -0.6425659656524658, + "rewards/rejected": 1.8393585681915283, + "step": 3725 + }, + { + "epoch": 0.6, + "learning_rate": 8.177550730435289e-06, + "logits/chosen": -1.0200170278549194, + "logits/rejected": -0.9954548478126526, + "logps/chosen": -56.43749237060547, + "logps/rejected": -71.74046325683594, + "loss": 0.053, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.974386692047119, + "rewards/margins": 2.5911178588867188, + "rewards/rejected": 2.3832688331604004, + "step": 3726 + }, + { + "epoch": 0.6, + "learning_rate": 8.176535895034177e-06, + "logits/chosen": -1.1733719110488892, + "logits/rejected": -1.1129413843154907, + "logps/chosen": -88.2436752319336, + "logps/rejected": -97.43406677246094, + "loss": 0.1024, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.777756690979004, + "rewards/margins": 1.524507999420166, + "rewards/rejected": 3.253248691558838, + "step": 3727 + }, + { + "epoch": 0.61, + "learning_rate": 8.175520840164492e-06, + "logits/chosen": -1.2945631742477417, + "logits/rejected": -1.3614449501037598, + "logps/chosen": -147.2039794921875, + "logps/rejected": -115.54208374023438, + "loss": 1.8404, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.222357273101807, + "rewards/margins": -3.179699420928955, + "rewards/rejected": 8.402056694030762, + "step": 3728 + }, + { + "epoch": 0.61, + "learning_rate": 8.174505565896364e-06, + "logits/chosen": -1.1334776878356934, + "logits/rejected": -1.1007273197174072, + "logps/chosen": -27.452350616455078, + "logps/rejected": -102.84031677246094, + "loss": 0.4504, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3971989154815674, + "rewards/margins": -0.027632474899291992, + "rewards/rejected": 2.4248313903808594, + "step": 3729 + }, + { + "epoch": 0.61, + "learning_rate": 8.17349007229994e-06, + "logits/chosen": -1.3076865673065186, + "logits/rejected": -1.2599854469299316, + "logps/chosen": -130.40994262695312, + "logps/rejected": -143.6881103515625, + "loss": 0.1771, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.249249219894409, + "rewards/margins": 1.034164309501648, + "rewards/rejected": 1.2150849103927612, + "step": 3730 + }, + { + "epoch": 0.61, + "learning_rate": 8.172474359445381e-06, + "logits/chosen": -0.8971030116081238, + "logits/rejected": -0.8540742993354797, + "logps/chosen": -64.97508239746094, + "logps/rejected": -59.595333099365234, + "loss": 0.6518, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6242141723632812, + "rewards/margins": -0.9230495691299438, + "rewards/rejected": 1.547263741493225, + "step": 3731 + }, + { + "epoch": 0.61, + "learning_rate": 8.17145842740286e-06, + "logits/chosen": -1.388007640838623, + "logits/rejected": -1.388007640838623, + "logps/chosen": -29.614967346191406, + "logps/rejected": -29.614967346191406, + "loss": 0.5782, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.7109062671661377, + "rewards/margins": 0.0, + "rewards/rejected": 3.7109062671661377, + "step": 3732 + }, + { + "epoch": 0.61, + "learning_rate": 8.17044227624257e-06, + "logits/chosen": -1.3913911581039429, + "logits/rejected": -1.3810467720031738, + "logps/chosen": -45.668251037597656, + "logps/rejected": -68.96676635742188, + "loss": 0.1036, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6493927240371704, + "rewards/margins": 1.4704254865646362, + "rewards/rejected": 0.17896728217601776, + "step": 3733 + }, + { + "epoch": 0.61, + "learning_rate": 8.169425906034718e-06, + "logits/chosen": -0.6174512505531311, + "logits/rejected": -0.6300954222679138, + "logps/chosen": -3.0325071811676025, + "logps/rejected": -0.9622558355331421, + "loss": 0.7346, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31276386976242065, + "rewards/margins": 0.014346212148666382, + "rewards/rejected": 0.2984176576137543, + "step": 3734 + }, + { + "epoch": 0.61, + "learning_rate": 8.168409316849526e-06, + "logits/chosen": -1.6047974824905396, + "logits/rejected": -1.6037267446517944, + "logps/chosen": -68.97563171386719, + "logps/rejected": -42.554931640625, + "loss": 0.3509, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.112443685531616, + "rewards/margins": 1.3810365200042725, + "rewards/rejected": 0.7314071655273438, + "step": 3735 + }, + { + "epoch": 0.61, + "learning_rate": 8.16739250875723e-06, + "logits/chosen": -1.392768383026123, + "logits/rejected": -1.4089816808700562, + "logps/chosen": -36.04059600830078, + "logps/rejected": -73.4254379272461, + "loss": 0.8797, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8814849853515625, + "rewards/margins": -0.8662636280059814, + "rewards/rejected": 2.747748613357544, + "step": 3736 + }, + { + "epoch": 0.61, + "learning_rate": 8.166375481828082e-06, + "logits/chosen": -0.7446456551551819, + "logits/rejected": -0.7578347325325012, + "logps/chosen": -54.46891784667969, + "logps/rejected": -55.551780700683594, + "loss": 0.2667, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8565719723701477, + "rewards/margins": 0.524298906326294, + "rewards/rejected": 0.33227309584617615, + "step": 3737 + }, + { + "epoch": 0.61, + "learning_rate": 8.165358236132347e-06, + "logits/chosen": -1.1344175338745117, + "logits/rejected": -1.0871480703353882, + "logps/chosen": -84.00019073486328, + "logps/rejected": -57.583778381347656, + "loss": 0.4024, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.278459906578064, + "rewards/margins": 0.07236099243164062, + "rewards/rejected": 1.2060989141464233, + "step": 3738 + }, + { + "epoch": 0.61, + "learning_rate": 8.16434077174031e-06, + "logits/chosen": -0.9610482454299927, + "logits/rejected": -0.9683172702789307, + "logps/chosen": -4.922260284423828, + "logps/rejected": -4.8815999031066895, + "loss": 2.31, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.32774925231933594, + "rewards/margins": -0.20020586252212524, + "rewards/rejected": 0.5279551148414612, + "step": 3739 + }, + { + "epoch": 0.61, + "learning_rate": 8.163323088722268e-06, + "logits/chosen": -1.0980192422866821, + "logits/rejected": -1.102602243423462, + "logps/chosen": -54.12694549560547, + "logps/rejected": -103.33888244628906, + "loss": 0.5363, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.678512096405029, + "rewards/margins": -0.5983724594116211, + "rewards/rejected": 5.27688455581665, + "step": 3740 + }, + { + "epoch": 0.61, + "learning_rate": 8.16230518714853e-06, + "logits/chosen": -1.1848194599151611, + "logits/rejected": -1.142704725265503, + "logps/chosen": -9.371553421020508, + "logps/rejected": -17.597902297973633, + "loss": 0.2728, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9090871810913086, + "rewards/margins": 0.419369101524353, + "rewards/rejected": 1.4897180795669556, + "step": 3741 + }, + { + "epoch": 0.61, + "learning_rate": 8.161287067089426e-06, + "logits/chosen": -1.1900181770324707, + "logits/rejected": -1.104774832725525, + "logps/chosen": -61.20817184448242, + "logps/rejected": -90.9205322265625, + "loss": 0.2418, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7634129524230957, + "rewards/margins": 0.530571460723877, + "rewards/rejected": 2.2328414916992188, + "step": 3742 + }, + { + "epoch": 0.61, + "learning_rate": 8.1602687286153e-06, + "logits/chosen": -1.3117648363113403, + "logits/rejected": -1.116767406463623, + "logps/chosen": -143.42465209960938, + "logps/rejected": -67.72518157958984, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.71905517578125, + "rewards/margins": 3.990345001220703, + "rewards/rejected": 4.728710174560547, + "step": 3743 + }, + { + "epoch": 0.61, + "learning_rate": 8.159250171796505e-06, + "logits/chosen": -0.7276376485824585, + "logits/rejected": -0.7276376485824585, + "logps/chosen": -47.494476318359375, + "logps/rejected": -47.494476318359375, + "loss": 0.7475, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.3537612855434418, + "rewards/margins": 0.0, + "rewards/rejected": -0.3537612855434418, + "step": 3744 + }, + { + "epoch": 0.61, + "learning_rate": 8.158231396703418e-06, + "logits/chosen": -1.1487791538238525, + "logits/rejected": -1.141557216644287, + "logps/chosen": -63.34608840942383, + "logps/rejected": -53.52471923828125, + "loss": 0.6241, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3397908210754395, + "rewards/margins": 1.0974575281143188, + "rewards/rejected": 1.2423332929611206, + "step": 3745 + }, + { + "epoch": 0.61, + "learning_rate": 8.157212403406424e-06, + "logits/chosen": -0.8869359493255615, + "logits/rejected": -0.9264988303184509, + "logps/chosen": -48.794464111328125, + "logps/rejected": -64.56511688232422, + "loss": 0.8366, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2784485816955566, + "rewards/margins": -1.4568915367126465, + "rewards/rejected": 3.735340118408203, + "step": 3746 + }, + { + "epoch": 0.61, + "learning_rate": 8.156193191975927e-06, + "logits/chosen": -0.9764440655708313, + "logits/rejected": -0.9311251640319824, + "logps/chosen": -88.12115478515625, + "logps/rejected": -84.4681396484375, + "loss": 0.4835, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.203120708465576, + "rewards/margins": 2.1272966861724854, + "rewards/rejected": 3.075824022293091, + "step": 3747 + }, + { + "epoch": 0.61, + "learning_rate": 8.155173762482344e-06, + "logits/chosen": -0.9254829287528992, + "logits/rejected": -0.9254829287528992, + "logps/chosen": -0.67958664894104, + "logps/rejected": -0.67958664894104, + "loss": 1.55, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.21855786442756653, + "rewards/margins": 0.0, + "rewards/rejected": 0.21855786442756653, + "step": 3748 + }, + { + "epoch": 0.61, + "learning_rate": 8.15415411499611e-06, + "logits/chosen": -1.079232931137085, + "logits/rejected": -1.0505644083023071, + "logps/chosen": -41.46781921386719, + "logps/rejected": -49.25056457519531, + "loss": 0.9832, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.041064500808716, + "rewards/margins": -0.9512436389923096, + "rewards/rejected": 2.9923081398010254, + "step": 3749 + }, + { + "epoch": 0.61, + "learning_rate": 8.153134249587671e-06, + "logits/chosen": -1.2062658071517944, + "logits/rejected": -1.1807292699813843, + "logps/chosen": -43.844810485839844, + "logps/rejected": -22.9891357421875, + "loss": 0.451, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9629242420196533, + "rewards/margins": -0.3639216423034668, + "rewards/rejected": 2.32684588432312, + "step": 3750 + }, + { + "epoch": 0.61, + "learning_rate": 8.15211416632749e-06, + "logits/chosen": -1.2982178926467896, + "logits/rejected": -1.2771302461624146, + "logps/chosen": -72.1202621459961, + "logps/rejected": -79.44025421142578, + "loss": 1.5196, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0085244178771973, + "rewards/margins": 0.6636078357696533, + "rewards/rejected": 2.344916582107544, + "step": 3751 + }, + { + "epoch": 0.61, + "learning_rate": 8.151093865286046e-06, + "logits/chosen": -0.7163351774215698, + "logits/rejected": -0.7163351774215698, + "logps/chosen": -16.980178833007812, + "logps/rejected": -16.980178833007812, + "loss": 0.4702, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.27832069993019104, + "rewards/margins": 0.0, + "rewards/rejected": 0.27832069993019104, + "step": 3752 + }, + { + "epoch": 0.61, + "learning_rate": 8.150073346533833e-06, + "logits/chosen": -0.9940633773803711, + "logits/rejected": -0.9965254068374634, + "logps/chosen": -21.635799407958984, + "logps/rejected": -21.55544090270996, + "loss": 0.6227, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4767311215400696, + "rewards/margins": -0.06806051731109619, + "rewards/rejected": 0.5447916388511658, + "step": 3753 + }, + { + "epoch": 0.61, + "learning_rate": 8.149052610141357e-06, + "logits/chosen": -1.1908276081085205, + "logits/rejected": -1.2231956720352173, + "logps/chosen": -42.50967025756836, + "logps/rejected": -54.7176399230957, + "loss": 0.4998, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1295979022979736, + "rewards/margins": -0.47231531143188477, + "rewards/rejected": 2.6019132137298584, + "step": 3754 + }, + { + "epoch": 0.61, + "learning_rate": 8.148031656179142e-06, + "logits/chosen": -1.2220877408981323, + "logits/rejected": -1.1379472017288208, + "logps/chosen": -88.84780883789062, + "logps/rejected": -91.90725708007812, + "loss": 0.4142, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.570010662078857, + "rewards/margins": 2.662925958633423, + "rewards/rejected": 2.9070847034454346, + "step": 3755 + }, + { + "epoch": 0.61, + "learning_rate": 8.147010484717727e-06, + "logits/chosen": -0.9268561601638794, + "logits/rejected": -0.9033440351486206, + "logps/chosen": -21.01223373413086, + "logps/rejected": -72.2978744506836, + "loss": 0.3615, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1398632526397705, + "rewards/margins": 0.43766218423843384, + "rewards/rejected": 0.7022010684013367, + "step": 3756 + }, + { + "epoch": 0.61, + "learning_rate": 8.145989095827664e-06, + "logits/chosen": -1.0400421619415283, + "logits/rejected": -1.030281901359558, + "logps/chosen": -74.73831176757812, + "logps/rejected": -124.71880340576172, + "loss": 0.4762, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2951881885528564, + "rewards/margins": -0.2282959222793579, + "rewards/rejected": 1.5234841108322144, + "step": 3757 + }, + { + "epoch": 0.61, + "learning_rate": 8.144967489579523e-06, + "logits/chosen": -0.934664785861969, + "logits/rejected": -0.9468706250190735, + "logps/chosen": -41.194793701171875, + "logps/rejected": -38.69005584716797, + "loss": 1.6882, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4537800550460815, + "rewards/margins": -1.6272982358932495, + "rewards/rejected": 3.081078290939331, + "step": 3758 + }, + { + "epoch": 0.61, + "learning_rate": 8.143945666043887e-06, + "logits/chosen": -1.289632797241211, + "logits/rejected": -1.2951445579528809, + "logps/chosen": -91.09666442871094, + "logps/rejected": -83.41282653808594, + "loss": 0.3607, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.156749725341797, + "rewards/margins": 0.09459829330444336, + "rewards/rejected": 2.0621514320373535, + "step": 3759 + }, + { + "epoch": 0.61, + "learning_rate": 8.142923625291352e-06, + "logits/chosen": -1.441257119178772, + "logits/rejected": -1.443699598312378, + "logps/chosen": -68.72561645507812, + "logps/rejected": -86.51664733886719, + "loss": 0.2658, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1322662830352783, + "rewards/margins": 0.5071877241134644, + "rewards/rejected": 1.625078558921814, + "step": 3760 + }, + { + "epoch": 0.61, + "learning_rate": 8.141901367392535e-06, + "logits/chosen": -0.8356510996818542, + "logits/rejected": -0.8877942562103271, + "logps/chosen": -52.93570327758789, + "logps/rejected": -43.350433349609375, + "loss": 1.34, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3562848567962646, + "rewards/margins": 0.30329251289367676, + "rewards/rejected": 2.052992343902588, + "step": 3761 + }, + { + "epoch": 0.61, + "learning_rate": 8.14087889241806e-06, + "logits/chosen": -1.0107214450836182, + "logits/rejected": -0.875893771648407, + "logps/chosen": -58.22209548950195, + "logps/rejected": -30.610347747802734, + "loss": 0.2582, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.369257688522339, + "rewards/margins": 1.672302484512329, + "rewards/rejected": 0.696955144405365, + "step": 3762 + }, + { + "epoch": 0.61, + "learning_rate": 8.139856200438574e-06, + "logits/chosen": -1.160597562789917, + "logits/rejected": -1.0949963331222534, + "logps/chosen": -79.07963562011719, + "logps/rejected": -43.36650466918945, + "loss": 1.7838, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0162140130996704, + "rewards/margins": -0.713942289352417, + "rewards/rejected": 1.7301563024520874, + "step": 3763 + }, + { + "epoch": 0.61, + "learning_rate": 8.138833291524735e-06, + "logits/chosen": -0.9246635437011719, + "logits/rejected": -0.9996563196182251, + "logps/chosen": -66.41116333007812, + "logps/rejected": -134.093505859375, + "loss": 2.1568, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5089111328125, + "rewards/margins": -3.5611605644226074, + "rewards/rejected": 5.070071697235107, + "step": 3764 + }, + { + "epoch": 0.61, + "learning_rate": 8.137810165747215e-06, + "logits/chosen": -0.94280606508255, + "logits/rejected": -0.8861789107322693, + "logps/chosen": -219.8412628173828, + "logps/rejected": -95.13853454589844, + "loss": 0.3637, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.858863830566406, + "rewards/margins": 3.0227761268615723, + "rewards/rejected": 1.8360878229141235, + "step": 3765 + }, + { + "epoch": 0.61, + "learning_rate": 8.136786823176703e-06, + "logits/chosen": -0.9632181525230408, + "logits/rejected": -0.8734027147293091, + "logps/chosen": -100.314453125, + "logps/rejected": -96.35568237304688, + "loss": 1.0287, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4801056385040283, + "rewards/margins": -0.4772597551345825, + "rewards/rejected": 1.9573653936386108, + "step": 3766 + }, + { + "epoch": 0.61, + "learning_rate": 8.135763263883902e-06, + "logits/chosen": -1.1676021814346313, + "logits/rejected": -1.2710555791854858, + "logps/chosen": -95.68794250488281, + "logps/rejected": -161.91351318359375, + "loss": 0.7204, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.963270664215088, + "rewards/margins": 0.09922933578491211, + "rewards/rejected": 5.864041328430176, + "step": 3767 + }, + { + "epoch": 0.61, + "learning_rate": 8.13473948793953e-06, + "logits/chosen": -1.070321798324585, + "logits/rejected": -1.0529911518096924, + "logps/chosen": -68.3082275390625, + "logps/rejected": -49.65525817871094, + "loss": 0.324, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8348395824432373, + "rewards/margins": 1.423417568206787, + "rewards/rejected": 1.4114220142364502, + "step": 3768 + }, + { + "epoch": 0.61, + "learning_rate": 8.13371549541432e-06, + "logits/chosen": -1.3441044092178345, + "logits/rejected": -1.1631567478179932, + "logps/chosen": -104.52017211914062, + "logps/rejected": -56.939125061035156, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.179263591766357, + "rewards/margins": 4.3419952392578125, + "rewards/rejected": 2.837268114089966, + "step": 3769 + }, + { + "epoch": 0.61, + "learning_rate": 8.132691286379022e-06, + "logits/chosen": -1.0021631717681885, + "logits/rejected": -1.0021631717681885, + "logps/chosen": -59.82633972167969, + "logps/rejected": -59.82633972167969, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4115021228790283, + "rewards/margins": 0.0, + "rewards/rejected": 1.4115021228790283, + "step": 3770 + }, + { + "epoch": 0.61, + "learning_rate": 8.131666860904397e-06, + "logits/chosen": -1.0716277360916138, + "logits/rejected": -1.0154774188995361, + "logps/chosen": -54.959678649902344, + "logps/rejected": -75.75019836425781, + "loss": 1.1597, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7736839056015015, + "rewards/margins": -0.9378587007522583, + "rewards/rejected": 2.7115426063537598, + "step": 3771 + }, + { + "epoch": 0.61, + "learning_rate": 8.130642219061224e-06, + "logits/chosen": -1.0058863162994385, + "logits/rejected": -1.0091365575790405, + "logps/chosen": -98.2982406616211, + "logps/rejected": -91.73693084716797, + "loss": 0.3286, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7803176641464233, + "rewards/margins": 0.4909477233886719, + "rewards/rejected": 1.2893699407577515, + "step": 3772 + }, + { + "epoch": 0.61, + "learning_rate": 8.129617360920297e-06, + "logits/chosen": -1.3896688222885132, + "logits/rejected": -1.426304817199707, + "logps/chosen": -131.69247436523438, + "logps/rejected": -32.98694610595703, + "loss": 0.4036, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.2782487869262695, + "rewards/margins": 2.1132893562316895, + "rewards/rejected": 2.16495943069458, + "step": 3773 + }, + { + "epoch": 0.61, + "learning_rate": 8.128592286552422e-06, + "logits/chosen": -1.0660181045532227, + "logits/rejected": -1.138465166091919, + "logps/chosen": -77.10353088378906, + "logps/rejected": -94.13468933105469, + "loss": 1.6866, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.2657151222229, + "rewards/margins": -2.608969211578369, + "rewards/rejected": 6.8746843338012695, + "step": 3774 + }, + { + "epoch": 0.61, + "learning_rate": 8.127566996028423e-06, + "logits/chosen": -1.5757825374603271, + "logits/rejected": -1.4505584239959717, + "logps/chosen": -153.60630798339844, + "logps/rejected": -16.35247802734375, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.337669372558594, + "rewards/margins": 5.350529193878174, + "rewards/rejected": 0.9871402978897095, + "step": 3775 + }, + { + "epoch": 0.61, + "learning_rate": 8.126541489419138e-06, + "logits/chosen": -1.344740390777588, + "logits/rejected": -1.3415062427520752, + "logps/chosen": -45.719482421875, + "logps/rejected": -101.25981903076172, + "loss": 0.7002, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2783364057540894, + "rewards/margins": -0.025115966796875, + "rewards/rejected": 1.3034523725509644, + "step": 3776 + }, + { + "epoch": 0.61, + "learning_rate": 8.12551576679542e-06, + "logits/chosen": -1.457576870918274, + "logits/rejected": -1.448819875717163, + "logps/chosen": -55.11852264404297, + "logps/rejected": -59.9355583190918, + "loss": 0.2611, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.624969005584717, + "rewards/margins": 0.3803749084472656, + "rewards/rejected": 4.244594097137451, + "step": 3777 + }, + { + "epoch": 0.61, + "learning_rate": 8.124489828228136e-06, + "logits/chosen": -0.9589107036590576, + "logits/rejected": -0.9589107036590576, + "logps/chosen": -34.582672119140625, + "logps/rejected": -34.582672119140625, + "loss": 1.6872, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.073657274246216, + "rewards/margins": 0.0, + "rewards/rejected": 2.073657274246216, + "step": 3778 + }, + { + "epoch": 0.61, + "learning_rate": 8.12346367378817e-06, + "logits/chosen": -1.1630500555038452, + "logits/rejected": -1.1805976629257202, + "logps/chosen": -71.60425567626953, + "logps/rejected": -110.37919616699219, + "loss": 0.5201, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4602516889572144, + "rewards/margins": -0.18901288509368896, + "rewards/rejected": 1.6492645740509033, + "step": 3779 + }, + { + "epoch": 0.61, + "learning_rate": 8.122437303546418e-06, + "logits/chosen": -0.9993087649345398, + "logits/rejected": -0.999260425567627, + "logps/chosen": -4.1906304359436035, + "logps/rejected": -6.165853977203369, + "loss": 0.3368, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23651519417762756, + "rewards/margins": 0.1104094535112381, + "rewards/rejected": 0.12610574066638947, + "step": 3780 + }, + { + "epoch": 0.61, + "learning_rate": 8.121410717573794e-06, + "logits/chosen": -0.8615292906761169, + "logits/rejected": -0.6805441975593567, + "logps/chosen": -102.81822967529297, + "logps/rejected": -62.708290100097656, + "loss": 0.0383, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.850605010986328, + "rewards/margins": 2.750926971435547, + "rewards/rejected": 3.0996780395507812, + "step": 3781 + }, + { + "epoch": 0.61, + "learning_rate": 8.120383915941223e-06, + "logits/chosen": -0.9524636268615723, + "logits/rejected": -0.992199182510376, + "logps/chosen": -41.092742919921875, + "logps/rejected": -46.01563262939453, + "loss": 0.8065, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8199928998947144, + "rewards/margins": -1.1681641340255737, + "rewards/rejected": 2.988157033920288, + "step": 3782 + }, + { + "epoch": 0.61, + "learning_rate": 8.11935689871965e-06, + "logits/chosen": -1.1730245351791382, + "logits/rejected": -1.1825439929962158, + "logps/chosen": -81.55078125, + "logps/rejected": -87.15321350097656, + "loss": 1.6074, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5415337085723877, + "rewards/margins": -3.1715357303619385, + "rewards/rejected": 5.713069438934326, + "step": 3783 + }, + { + "epoch": 0.61, + "learning_rate": 8.11832966598003e-06, + "logits/chosen": -0.8936285376548767, + "logits/rejected": -0.9200056791305542, + "logps/chosen": -54.365745544433594, + "logps/rejected": -60.90568542480469, + "loss": 0.4283, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.930805206298828, + "rewards/margins": 1.7279640436172485, + "rewards/rejected": 1.2028411626815796, + "step": 3784 + }, + { + "epoch": 0.61, + "learning_rate": 8.117302217793336e-06, + "logits/chosen": -0.5321723222732544, + "logits/rejected": -0.5321723222732544, + "logps/chosen": -41.58612060546875, + "logps/rejected": -41.58612060546875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4694511592388153, + "rewards/margins": 0.0, + "rewards/rejected": 0.4694511592388153, + "step": 3785 + }, + { + "epoch": 0.61, + "learning_rate": 8.116274554230557e-06, + "logits/chosen": -1.0298874378204346, + "logits/rejected": -1.0804336071014404, + "logps/chosen": -116.71388244628906, + "logps/rejected": -186.26712036132812, + "loss": 1.3516, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.0238542556762695, + "rewards/margins": -2.464877128601074, + "rewards/rejected": 6.488731384277344, + "step": 3786 + }, + { + "epoch": 0.61, + "learning_rate": 8.11524667536269e-06, + "logits/chosen": -1.0376927852630615, + "logits/rejected": -1.084538459777832, + "logps/chosen": -36.81189727783203, + "logps/rejected": -39.07695770263672, + "loss": 0.9282, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.382171630859375, + "rewards/margins": -1.6783630847930908, + "rewards/rejected": 3.060534715652466, + "step": 3787 + }, + { + "epoch": 0.61, + "learning_rate": 8.114218581260756e-06, + "logits/chosen": -1.2535276412963867, + "logits/rejected": -1.2535276412963867, + "logps/chosen": -37.91912841796875, + "logps/rejected": -37.91912841796875, + "loss": 1.1918, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2268319129943848, + "rewards/margins": 0.0, + "rewards/rejected": 3.2268319129943848, + "step": 3788 + }, + { + "epoch": 0.61, + "learning_rate": 8.113190271995784e-06, + "logits/chosen": -0.8694642186164856, + "logits/rejected": -0.7727372646331787, + "logps/chosen": -55.91721725463867, + "logps/rejected": -39.64878463745117, + "loss": 0.6847, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9492619037628174, + "rewards/margins": 0.5476508140563965, + "rewards/rejected": 2.401611089706421, + "step": 3789 + }, + { + "epoch": 0.62, + "learning_rate": 8.112161747638823e-06, + "logits/chosen": -0.946861207485199, + "logits/rejected": -0.8614683151245117, + "logps/chosen": -72.41735076904297, + "logps/rejected": -11.00920295715332, + "loss": 1.6045, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.893715739250183, + "rewards/margins": 1.1738739013671875, + "rewards/rejected": 0.7198417782783508, + "step": 3790 + }, + { + "epoch": 0.62, + "learning_rate": 8.111133008260932e-06, + "logits/chosen": -1.1084474325180054, + "logits/rejected": -0.9755502939224243, + "logps/chosen": -54.17550277709961, + "logps/rejected": -27.945880889892578, + "loss": 0.8302, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6721977591514587, + "rewards/margins": -0.19595563411712646, + "rewards/rejected": 0.8681533932685852, + "step": 3791 + }, + { + "epoch": 0.62, + "learning_rate": 8.110104053933188e-06, + "logits/chosen": -1.0009140968322754, + "logits/rejected": -0.9975748658180237, + "logps/chosen": -9.150967597961426, + "logps/rejected": -2.4434995651245117, + "loss": 0.6781, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.27908316254615784, + "rewards/margins": -0.26862576603889465, + "rewards/rejected": 0.5477089285850525, + "step": 3792 + }, + { + "epoch": 0.62, + "learning_rate": 8.109074884726681e-06, + "logits/chosen": -0.32938769459724426, + "logits/rejected": -0.31231582164764404, + "logps/chosen": -4.4633941650390625, + "logps/rejected": -4.378756046295166, + "loss": 0.4657, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.04096880182623863, + "rewards/margins": -0.3952256739139557, + "rewards/rejected": 0.4361944794654846, + "step": 3793 + }, + { + "epoch": 0.62, + "learning_rate": 8.108045500712518e-06, + "logits/chosen": -1.2103996276855469, + "logits/rejected": -1.1834936141967773, + "logps/chosen": -80.59523010253906, + "logps/rejected": -61.07551956176758, + "loss": 0.3359, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6053131818771362, + "rewards/margins": 0.05482828617095947, + "rewards/rejected": 1.5504848957061768, + "step": 3794 + }, + { + "epoch": 0.62, + "learning_rate": 8.10701590196182e-06, + "logits/chosen": -0.9547373652458191, + "logits/rejected": -1.0417664051055908, + "logps/chosen": -73.54139709472656, + "logps/rejected": -139.10488891601562, + "loss": 2.465, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.531896948814392, + "rewards/margins": -3.4016036987304688, + "rewards/rejected": 4.93350076675415, + "step": 3795 + }, + { + "epoch": 0.62, + "learning_rate": 8.105986088545722e-06, + "logits/chosen": -0.6237193942070007, + "logits/rejected": -0.5703204274177551, + "logps/chosen": -65.44428253173828, + "logps/rejected": -60.22395706176758, + "loss": 0.046, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.280990123748779, + "rewards/margins": 2.7351346015930176, + "rewards/rejected": 1.5458554029464722, + "step": 3796 + }, + { + "epoch": 0.62, + "learning_rate": 8.104956060535375e-06, + "logits/chosen": -0.9958254098892212, + "logits/rejected": -0.9756004810333252, + "logps/chosen": -32.84952926635742, + "logps/rejected": -32.79594802856445, + "loss": 2.2506, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5237838625907898, + "rewards/margins": -0.3083351254463196, + "rewards/rejected": 0.8321189880371094, + "step": 3797 + }, + { + "epoch": 0.62, + "learning_rate": 8.103925818001944e-06, + "logits/chosen": -1.1809629201889038, + "logits/rejected": -1.2476861476898193, + "logps/chosen": -155.99658203125, + "logps/rejected": -9.974974632263184, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.543257236480713, + "rewards/margins": 4.562651634216309, + "rewards/rejected": 0.9806056022644043, + "step": 3798 + }, + { + "epoch": 0.62, + "learning_rate": 8.102895361016607e-06, + "logits/chosen": -0.8930872678756714, + "logits/rejected": -0.8989962935447693, + "logps/chosen": -186.939208984375, + "logps/rejected": -111.35548400878906, + "loss": 0.4633, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.0137529373168945, + "rewards/margins": -0.4093780517578125, + "rewards/rejected": 5.423130989074707, + "step": 3799 + }, + { + "epoch": 0.62, + "learning_rate": 8.10186468965056e-06, + "logits/chosen": -1.1554920673370361, + "logits/rejected": -1.0758665800094604, + "logps/chosen": -87.23657989501953, + "logps/rejected": -39.95833206176758, + "loss": 0.6025, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9319618940353394, + "rewards/margins": -0.383844256401062, + "rewards/rejected": 2.3158061504364014, + "step": 3800 + }, + { + "epoch": 0.62, + "learning_rate": 8.100833803975016e-06, + "logits/chosen": -1.0964897871017456, + "logits/rejected": -1.1563489437103271, + "logps/chosen": -84.1915283203125, + "logps/rejected": -67.96552276611328, + "loss": 1.1776, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5021331310272217, + "rewards/margins": -1.8115732669830322, + "rewards/rejected": 4.313706398010254, + "step": 3801 + }, + { + "epoch": 0.62, + "learning_rate": 8.099802704061194e-06, + "logits/chosen": -0.46314728260040283, + "logits/rejected": -0.4848531484603882, + "logps/chosen": -66.06051635742188, + "logps/rejected": -61.932979583740234, + "loss": 2.3096, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9377479553222656, + "rewards/margins": -0.7327938079833984, + "rewards/rejected": 1.670541763305664, + "step": 3802 + }, + { + "epoch": 0.62, + "learning_rate": 8.098771389980337e-06, + "logits/chosen": -1.049962043762207, + "logits/rejected": -1.0417202711105347, + "logps/chosen": -33.696617126464844, + "logps/rejected": -43.938785552978516, + "loss": 1.1196, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5856647491455078, + "rewards/margins": -1.5582025051116943, + "rewards/rejected": 2.143867254257202, + "step": 3803 + }, + { + "epoch": 0.62, + "learning_rate": 8.097739861803696e-06, + "logits/chosen": -0.9464225769042969, + "logits/rejected": -0.976895809173584, + "logps/chosen": -52.72706604003906, + "logps/rejected": -84.92698669433594, + "loss": 0.609, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1744812726974487, + "rewards/margins": 0.24224555492401123, + "rewards/rejected": 0.9322357177734375, + "step": 3804 + }, + { + "epoch": 0.62, + "learning_rate": 8.096708119602543e-06, + "logits/chosen": -0.7492639422416687, + "logits/rejected": -0.7492639422416687, + "logps/chosen": -36.09832763671875, + "logps/rejected": -36.09832763671875, + "loss": 0.3796, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7108734250068665, + "rewards/margins": 0.0, + "rewards/rejected": 0.7108734250068665, + "step": 3805 + }, + { + "epoch": 0.62, + "learning_rate": 8.09567616344816e-06, + "logits/chosen": -0.810238242149353, + "logits/rejected": -0.6945622563362122, + "logps/chosen": -49.294464111328125, + "logps/rejected": -53.28801727294922, + "loss": 1.8471, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.646892547607422, + "rewards/margins": -0.4398980140686035, + "rewards/rejected": 3.0867905616760254, + "step": 3806 + }, + { + "epoch": 0.62, + "learning_rate": 8.094643993411846e-06, + "logits/chosen": -1.1197994947433472, + "logits/rejected": -1.0974291563034058, + "logps/chosen": -48.99010467529297, + "logps/rejected": -16.601669311523438, + "loss": 0.6006, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6993149518966675, + "rewards/margins": 1.1399906873703003, + "rewards/rejected": 0.5593242645263672, + "step": 3807 + }, + { + "epoch": 0.62, + "learning_rate": 8.093611609564913e-06, + "logits/chosen": -0.8716269135475159, + "logits/rejected": -0.82716304063797, + "logps/chosen": -103.231201171875, + "logps/rejected": -64.43144226074219, + "loss": 0.3386, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7985153198242188, + "rewards/margins": 1.023284912109375, + "rewards/rejected": 1.7752304077148438, + "step": 3808 + }, + { + "epoch": 0.62, + "learning_rate": 8.092579011978691e-06, + "logits/chosen": -1.1477123498916626, + "logits/rejected": -1.135826826095581, + "logps/chosen": -57.13896179199219, + "logps/rejected": -47.550086975097656, + "loss": 0.2656, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.423372745513916, + "rewards/margins": 0.35900354385375977, + "rewards/rejected": 2.0643692016601562, + "step": 3809 + }, + { + "epoch": 0.62, + "learning_rate": 8.091546200724521e-06, + "logits/chosen": -1.0012620687484741, + "logits/rejected": -0.9857296943664551, + "logps/chosen": -77.6968765258789, + "logps/rejected": -29.76700782775879, + "loss": 0.3412, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6613410711288452, + "rewards/margins": 0.3640347719192505, + "rewards/rejected": 1.2973062992095947, + "step": 3810 + }, + { + "epoch": 0.62, + "learning_rate": 8.090513175873763e-06, + "logits/chosen": -0.9578589797019958, + "logits/rejected": -0.9520454406738281, + "logps/chosen": -39.51976776123047, + "logps/rejected": -42.696807861328125, + "loss": 0.308, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8770713806152344, + "rewards/margins": 0.3548743724822998, + "rewards/rejected": 3.5221970081329346, + "step": 3811 + }, + { + "epoch": 0.62, + "learning_rate": 8.089479937497784e-06, + "logits/chosen": -1.3485794067382812, + "logits/rejected": -1.29445219039917, + "logps/chosen": -111.47945404052734, + "logps/rejected": -84.50772094726562, + "loss": 0.1078, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.063270092010498, + "rewards/margins": 1.9126853942871094, + "rewards/rejected": 5.150584697723389, + "step": 3812 + }, + { + "epoch": 0.62, + "learning_rate": 8.088446485667976e-06, + "logits/chosen": -1.0355883836746216, + "logits/rejected": -1.0355883836746216, + "logps/chosen": -37.53710174560547, + "logps/rejected": -37.53710174560547, + "loss": 0.3496, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6648945212364197, + "rewards/margins": 0.0, + "rewards/rejected": 0.6648945212364197, + "step": 3813 + }, + { + "epoch": 0.62, + "learning_rate": 8.087412820455738e-06, + "logits/chosen": -1.3607707023620605, + "logits/rejected": -1.1686112880706787, + "logps/chosen": -142.32876586914062, + "logps/rejected": -15.608442306518555, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.943960666656494, + "rewards/margins": 5.057244300842285, + "rewards/rejected": 0.8867163062095642, + "step": 3814 + }, + { + "epoch": 0.62, + "learning_rate": 8.086378941932488e-06, + "logits/chosen": -1.0635067224502563, + "logits/rejected": -1.027954339981079, + "logps/chosen": -34.65088653564453, + "logps/rejected": -28.93472671508789, + "loss": 0.2793, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.019038438796997, + "rewards/margins": 0.46678435802459717, + "rewards/rejected": 1.5522540807724, + "step": 3815 + }, + { + "epoch": 0.62, + "learning_rate": 8.085344850169657e-06, + "logits/chosen": -0.5906230807304382, + "logits/rejected": -0.5906230807304382, + "logps/chosen": -3.228878974914551, + "logps/rejected": -3.228878974914551, + "loss": 0.9926, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.08588304370641708, + "rewards/margins": 0.0, + "rewards/rejected": 0.08588304370641708, + "step": 3816 + }, + { + "epoch": 0.62, + "learning_rate": 8.08431054523869e-06, + "logits/chosen": -0.7136486172676086, + "logits/rejected": -0.6963337659835815, + "logps/chosen": -26.535606384277344, + "logps/rejected": -2.221813917160034, + "loss": 0.8884, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.36620885133743286, + "rewards/margins": -0.10178050398826599, + "rewards/rejected": 0.46798935532569885, + "step": 3817 + }, + { + "epoch": 0.62, + "learning_rate": 8.083276027211049e-06, + "logits/chosen": -1.0511399507522583, + "logits/rejected": -0.991584837436676, + "logps/chosen": -74.67991638183594, + "logps/rejected": -125.54087829589844, + "loss": 0.1391, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.878472805023193, + "rewards/margins": 3.3181257247924805, + "rewards/rejected": 1.5603469610214233, + "step": 3818 + }, + { + "epoch": 0.62, + "learning_rate": 8.082241296158208e-06, + "logits/chosen": -1.087048888206482, + "logits/rejected": -1.0622398853302002, + "logps/chosen": -72.14241027832031, + "logps/rejected": -65.45943450927734, + "loss": 0.5088, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5646607875823975, + "rewards/margins": 0.7396286725997925, + "rewards/rejected": 1.825032114982605, + "step": 3819 + }, + { + "epoch": 0.62, + "learning_rate": 8.081206352151659e-06, + "logits/chosen": -1.0776140689849854, + "logits/rejected": -1.0677322149276733, + "logps/chosen": -103.79206848144531, + "logps/rejected": -90.34538269042969, + "loss": 0.6633, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1207473278045654, + "rewards/margins": -0.30876779556274414, + "rewards/rejected": 3.4295151233673096, + "step": 3820 + }, + { + "epoch": 0.62, + "learning_rate": 8.080171195262905e-06, + "logits/chosen": -0.9287109375, + "logits/rejected": -0.8779764175415039, + "logps/chosen": -85.01602172851562, + "logps/rejected": -65.72382354736328, + "loss": 1.0014, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2875884771347046, + "rewards/margins": -0.9950844049453735, + "rewards/rejected": 2.282672882080078, + "step": 3821 + }, + { + "epoch": 0.62, + "learning_rate": 8.079135825563467e-06, + "logits/chosen": -1.2252190113067627, + "logits/rejected": -1.2082960605621338, + "logps/chosen": -109.90731048583984, + "logps/rejected": -57.822998046875, + "loss": 0.6604, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9471061825752258, + "rewards/margins": -0.8457038998603821, + "rewards/rejected": 1.792810082435608, + "step": 3822 + }, + { + "epoch": 0.62, + "learning_rate": 8.078100243124876e-06, + "logits/chosen": -1.0302218198776245, + "logits/rejected": -1.0512325763702393, + "logps/chosen": -75.7927474975586, + "logps/rejected": -83.79922485351562, + "loss": 0.7362, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7428817749023438, + "rewards/margins": -0.9157371520996094, + "rewards/rejected": 2.658618927001953, + "step": 3823 + }, + { + "epoch": 0.62, + "learning_rate": 8.077064448018686e-06, + "logits/chosen": -1.1030240058898926, + "logits/rejected": -0.8711084127426147, + "logps/chosen": -155.0850830078125, + "logps/rejected": -52.66645050048828, + "loss": 0.0192, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.414045810699463, + "rewards/margins": 3.7590889930725098, + "rewards/rejected": 2.654956817626953, + "step": 3824 + }, + { + "epoch": 0.62, + "learning_rate": 8.076028440316458e-06, + "logits/chosen": -0.8560024499893188, + "logits/rejected": -0.8560024499893188, + "logps/chosen": -0.2995677888393402, + "logps/rejected": -0.2995677888393402, + "loss": 0.5427, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0840563178062439, + "rewards/margins": 0.0, + "rewards/rejected": 0.0840563178062439, + "step": 3825 + }, + { + "epoch": 0.62, + "learning_rate": 8.07499222008977e-06, + "logits/chosen": -1.1733943223953247, + "logits/rejected": -0.9691970944404602, + "logps/chosen": -61.10955047607422, + "logps/rejected": -20.545841217041016, + "loss": 0.1724, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1613166332244873, + "rewards/margins": 1.2756876945495605, + "rewards/rejected": 0.885628879070282, + "step": 3826 + }, + { + "epoch": 0.62, + "learning_rate": 8.073955787410215e-06, + "logits/chosen": -1.2081447839736938, + "logits/rejected": -1.19530189037323, + "logps/chosen": -81.73580169677734, + "logps/rejected": -108.72573852539062, + "loss": 0.2636, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.668932318687439, + "rewards/margins": 0.5961021184921265, + "rewards/rejected": 1.0728302001953125, + "step": 3827 + }, + { + "epoch": 0.62, + "learning_rate": 8.0729191423494e-06, + "logits/chosen": -1.114693284034729, + "logits/rejected": -1.076795220375061, + "logps/chosen": -95.9910659790039, + "logps/rejected": -55.25718688964844, + "loss": 0.0909, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.964545726776123, + "rewards/margins": 1.8434977531433105, + "rewards/rejected": 3.1210479736328125, + "step": 3828 + }, + { + "epoch": 0.62, + "learning_rate": 8.07188228497895e-06, + "logits/chosen": -1.2646774053573608, + "logits/rejected": -1.1240206956863403, + "logps/chosen": -56.02889633178711, + "logps/rejected": -33.44599151611328, + "loss": 0.6013, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.97521710395813, + "rewards/margins": -0.8348195552825928, + "rewards/rejected": 4.810036659240723, + "step": 3829 + }, + { + "epoch": 0.62, + "learning_rate": 8.0708452153705e-06, + "logits/chosen": -0.8592156171798706, + "logits/rejected": -1.0289838314056396, + "logps/chosen": -63.786136627197266, + "logps/rejected": -59.3231201171875, + "loss": 0.7996, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.393562078475952, + "rewards/margins": -0.2635035514831543, + "rewards/rejected": 2.6570656299591064, + "step": 3830 + }, + { + "epoch": 0.62, + "learning_rate": 8.069807933595704e-06, + "logits/chosen": -1.4346002340316772, + "logits/rejected": -1.465834617614746, + "logps/chosen": -207.39913940429688, + "logps/rejected": -89.92501068115234, + "loss": 0.2397, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.561642646789551, + "rewards/margins": 1.4810633659362793, + "rewards/rejected": 4.0805792808532715, + "step": 3831 + }, + { + "epoch": 0.62, + "learning_rate": 8.068770439726224e-06, + "logits/chosen": -1.229034185409546, + "logits/rejected": -0.9881589412689209, + "logps/chosen": -116.51957702636719, + "logps/rejected": -28.905284881591797, + "loss": 0.0699, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5919511318206787, + "rewards/margins": 2.1872503757476807, + "rewards/rejected": 0.4047008454799652, + "step": 3832 + }, + { + "epoch": 0.62, + "learning_rate": 8.067732733833745e-06, + "logits/chosen": -0.9492124915122986, + "logits/rejected": -0.9492124915122986, + "logps/chosen": -27.29352378845215, + "logps/rejected": -27.29352378845215, + "loss": 0.6835, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1737161874771118, + "rewards/margins": 0.0, + "rewards/rejected": 1.1737161874771118, + "step": 3833 + }, + { + "epoch": 0.62, + "learning_rate": 8.066694815989961e-06, + "logits/chosen": -0.9815859198570251, + "logits/rejected": -1.0053702592849731, + "logps/chosen": -67.07193756103516, + "logps/rejected": -45.27364730834961, + "loss": 0.7829, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6879539489746094, + "rewards/margins": -0.21206176280975342, + "rewards/rejected": 1.9000157117843628, + "step": 3834 + }, + { + "epoch": 0.62, + "learning_rate": 8.065656686266583e-06, + "logits/chosen": -1.2917996644973755, + "logits/rejected": -1.2568414211273193, + "logps/chosen": -74.6427001953125, + "logps/rejected": -99.44404602050781, + "loss": 0.5556, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9282302856445312, + "rewards/margins": -0.6082541942596436, + "rewards/rejected": 2.536484479904175, + "step": 3835 + }, + { + "epoch": 0.62, + "learning_rate": 8.064618344735335e-06, + "logits/chosen": -1.2384510040283203, + "logits/rejected": -1.2595726251602173, + "logps/chosen": -95.57735443115234, + "logps/rejected": -124.11095428466797, + "loss": 1.2676, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.473572731018066, + "rewards/margins": 0.6877899169921875, + "rewards/rejected": 4.785782814025879, + "step": 3836 + }, + { + "epoch": 0.62, + "learning_rate": 8.063579791467956e-06, + "logits/chosen": -1.5045387744903564, + "logits/rejected": -1.4821727275848389, + "logps/chosen": -83.60106658935547, + "logps/rejected": -51.77933120727539, + "loss": 0.645, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.78564453125, + "rewards/margins": 0.20272481441497803, + "rewards/rejected": 1.582919716835022, + "step": 3837 + }, + { + "epoch": 0.62, + "learning_rate": 8.062541026536204e-06, + "logits/chosen": -1.0684922933578491, + "logits/rejected": -1.0235358476638794, + "logps/chosen": -84.58999633789062, + "logps/rejected": -89.14567565917969, + "loss": 0.3711, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.324421763420105, + "rewards/margins": -0.09244680404663086, + "rewards/rejected": 1.4168685674667358, + "step": 3838 + }, + { + "epoch": 0.62, + "learning_rate": 8.061502050011842e-06, + "logits/chosen": -1.0801384449005127, + "logits/rejected": -1.08116614818573, + "logps/chosen": -42.85980224609375, + "logps/rejected": -53.359519958496094, + "loss": 0.2496, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1766395568847656, + "rewards/margins": 0.467107355594635, + "rewards/rejected": 0.7095322012901306, + "step": 3839 + }, + { + "epoch": 0.62, + "learning_rate": 8.060462861966658e-06, + "logits/chosen": -1.1153781414031982, + "logits/rejected": -1.1153781414031982, + "logps/chosen": -68.33184814453125, + "logps/rejected": -68.33184814453125, + "loss": 0.5531, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7046432495117188, + "rewards/margins": 0.0, + "rewards/rejected": 2.7046432495117188, + "step": 3840 + }, + { + "epoch": 0.62, + "learning_rate": 8.059423462472448e-06, + "logits/chosen": -0.7841304540634155, + "logits/rejected": -0.7152515649795532, + "logps/chosen": -36.206485748291016, + "logps/rejected": -48.55577850341797, + "loss": 0.2246, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8812462091445923, + "rewards/margins": 1.3409771919250488, + "rewards/rejected": 0.5402690768241882, + "step": 3841 + }, + { + "epoch": 0.62, + "learning_rate": 8.058383851601027e-06, + "logits/chosen": -0.9575527906417847, + "logits/rejected": -0.8595393300056458, + "logps/chosen": -57.649723052978516, + "logps/rejected": -59.90321350097656, + "loss": 0.2521, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.891367793083191, + "rewards/margins": 1.8014527559280396, + "rewards/rejected": 0.08991508930921555, + "step": 3842 + }, + { + "epoch": 0.62, + "learning_rate": 8.057344029424219e-06, + "logits/chosen": -1.0395755767822266, + "logits/rejected": -1.063278079032898, + "logps/chosen": -77.57746887207031, + "logps/rejected": -88.7548828125, + "loss": 0.5165, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.467071533203125, + "rewards/margins": -0.5786285400390625, + "rewards/rejected": 2.0457000732421875, + "step": 3843 + }, + { + "epoch": 0.62, + "learning_rate": 8.056303996013868e-06, + "logits/chosen": -1.5400290489196777, + "logits/rejected": -1.3372228145599365, + "logps/chosen": -119.1150131225586, + "logps/rejected": -72.95833587646484, + "loss": 0.153, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.718959808349609, + "rewards/margins": 2.498757839202881, + "rewards/rejected": 3.2202019691467285, + "step": 3844 + }, + { + "epoch": 0.62, + "learning_rate": 8.055263751441831e-06, + "logits/chosen": -1.2153130769729614, + "logits/rejected": -0.8574750423431396, + "logps/chosen": -207.62771606445312, + "logps/rejected": -24.728116989135742, + "loss": 0.6157, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.578985691070557, + "rewards/margins": 5.044565200805664, + "rewards/rejected": 0.5344204306602478, + "step": 3845 + }, + { + "epoch": 0.62, + "learning_rate": 8.054223295779976e-06, + "logits/chosen": -1.0489239692687988, + "logits/rejected": -0.934959888458252, + "logps/chosen": -86.71211242675781, + "logps/rejected": -22.96654510498047, + "loss": 0.3283, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6671135425567627, + "rewards/margins": 1.6737486124038696, + "rewards/rejected": -0.006635093595832586, + "step": 3846 + }, + { + "epoch": 0.62, + "learning_rate": 8.053182629100191e-06, + "logits/chosen": -1.3757569789886475, + "logits/rejected": -1.2295715808868408, + "logps/chosen": -114.9151840209961, + "logps/rejected": -91.14356994628906, + "loss": 0.1318, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.330413341522217, + "rewards/margins": 3.4892420768737793, + "rewards/rejected": 1.8411712646484375, + "step": 3847 + }, + { + "epoch": 0.62, + "learning_rate": 8.052141751474376e-06, + "logits/chosen": -1.5941338539123535, + "logits/rejected": -1.347468614578247, + "logps/chosen": -161.67672729492188, + "logps/rejected": -22.417240142822266, + "loss": 0.2567, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.467674255371094, + "rewards/margins": 5.1923723220825195, + "rewards/rejected": 1.2753018140792847, + "step": 3848 + }, + { + "epoch": 0.62, + "learning_rate": 8.051100662974446e-06, + "logits/chosen": -1.1527669429779053, + "logits/rejected": -1.1829110383987427, + "logps/chosen": -89.6033935546875, + "logps/rejected": -87.95396423339844, + "loss": 0.8627, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.411337375640869, + "rewards/margins": -1.4804534912109375, + "rewards/rejected": 3.8917908668518066, + "step": 3849 + }, + { + "epoch": 0.62, + "learning_rate": 8.05005936367233e-06, + "logits/chosen": -1.2046583890914917, + "logits/rejected": -1.3095688819885254, + "logps/chosen": -104.59028625488281, + "logps/rejected": -89.65324401855469, + "loss": 2.7252, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3477554321289062, + "rewards/margins": -4.324774265289307, + "rewards/rejected": 6.672529697418213, + "step": 3850 + }, + { + "epoch": 0.63, + "learning_rate": 8.04901785363997e-06, + "logits/chosen": -0.6761255264282227, + "logits/rejected": -0.6717633008956909, + "logps/chosen": -33.073631286621094, + "logps/rejected": -39.18951416015625, + "loss": 0.3335, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.554238498210907, + "rewards/margins": 0.07025334239006042, + "rewards/rejected": 0.48398515582084656, + "step": 3851 + }, + { + "epoch": 0.63, + "learning_rate": 8.047976132949328e-06, + "logits/chosen": -1.322605013847351, + "logits/rejected": -1.3600102663040161, + "logps/chosen": -95.98201751708984, + "logps/rejected": -122.07841491699219, + "loss": 4.4167, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9867851138114929, + "rewards/margins": -5.677576541900635, + "rewards/rejected": 6.664361476898193, + "step": 3852 + }, + { + "epoch": 0.63, + "learning_rate": 8.046934201672375e-06, + "logits/chosen": -1.034775733947754, + "logits/rejected": -1.0544567108154297, + "logps/chosen": -66.08026885986328, + "logps/rejected": -84.83747863769531, + "loss": 2.2987, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4310569763183594, + "rewards/margins": 0.2655327320098877, + "rewards/rejected": 2.1655242443084717, + "step": 3853 + }, + { + "epoch": 0.63, + "learning_rate": 8.045892059881101e-06, + "logits/chosen": -1.191918134689331, + "logits/rejected": -1.0289109945297241, + "logps/chosen": -85.41301727294922, + "logps/rejected": -44.99647521972656, + "loss": 0.1739, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.927807569503784, + "rewards/margins": 1.0457267761230469, + "rewards/rejected": 2.8820807933807373, + "step": 3854 + }, + { + "epoch": 0.63, + "learning_rate": 8.044849707647505e-06, + "logits/chosen": -1.0860445499420166, + "logits/rejected": -1.0816482305526733, + "logps/chosen": -0.9683611392974854, + "logps/rejected": -4.751728534698486, + "loss": 0.5506, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2139536589384079, + "rewards/margins": -0.059001341462135315, + "rewards/rejected": 0.2729550004005432, + "step": 3855 + }, + { + "epoch": 0.63, + "learning_rate": 8.043807145043604e-06, + "logits/chosen": -1.1164230108261108, + "logits/rejected": -1.1121852397918701, + "logps/chosen": -179.7017822265625, + "logps/rejected": -101.89686584472656, + "loss": 0.031, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.567147731781006, + "rewards/margins": 4.411308288574219, + "rewards/rejected": 3.155839681625366, + "step": 3856 + }, + { + "epoch": 0.63, + "learning_rate": 8.04276437214143e-06, + "logits/chosen": -1.0986993312835693, + "logits/rejected": -1.1034574508666992, + "logps/chosen": -66.0453872680664, + "logps/rejected": -72.70317077636719, + "loss": 0.6343, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.069255828857422, + "rewards/margins": -0.055965423583984375, + "rewards/rejected": 2.1252212524414062, + "step": 3857 + }, + { + "epoch": 0.63, + "learning_rate": 8.041721389013029e-06, + "logits/chosen": -0.8836131691932678, + "logits/rejected": -0.8026435375213623, + "logps/chosen": -42.53192901611328, + "logps/rejected": -27.9764404296875, + "loss": 0.5136, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1303714513778687, + "rewards/margins": -0.5088070631027222, + "rewards/rejected": 1.6391785144805908, + "step": 3858 + }, + { + "epoch": 0.63, + "learning_rate": 8.040678195730463e-06, + "logits/chosen": -1.1424140930175781, + "logits/rejected": -1.1796618700027466, + "logps/chosen": -119.70519256591797, + "logps/rejected": -75.49734497070312, + "loss": 0.7123, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.327503204345703, + "rewards/margins": -0.9849298000335693, + "rewards/rejected": 3.3124330043792725, + "step": 3859 + }, + { + "epoch": 0.63, + "learning_rate": 8.039634792365803e-06, + "logits/chosen": -1.0487208366394043, + "logits/rejected": -1.0160746574401855, + "logps/chosen": -20.92271614074707, + "logps/rejected": -18.774621963500977, + "loss": 0.6575, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3104723691940308, + "rewards/margins": 0.19706785678863525, + "rewards/rejected": 1.1134045124053955, + "step": 3860 + }, + { + "epoch": 0.63, + "learning_rate": 8.03859117899114e-06, + "logits/chosen": -1.1422048807144165, + "logits/rejected": -1.035574197769165, + "logps/chosen": -76.61833190917969, + "logps/rejected": -45.68336486816406, + "loss": 0.7332, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.146254062652588, + "rewards/margins": 0.08740234375, + "rewards/rejected": 3.058851718902588, + "step": 3861 + }, + { + "epoch": 0.63, + "learning_rate": 8.037547355678578e-06, + "logits/chosen": -1.1679505109786987, + "logits/rejected": -1.1532243490219116, + "logps/chosen": -57.35382843017578, + "logps/rejected": -40.09434127807617, + "loss": 1.0253, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.871911644935608, + "rewards/margins": 0.6888141632080078, + "rewards/rejected": 1.1830974817276, + "step": 3862 + }, + { + "epoch": 0.63, + "learning_rate": 8.036503322500236e-06, + "logits/chosen": -1.4563336372375488, + "logits/rejected": -1.4031140804290771, + "logps/chosen": -105.29320526123047, + "logps/rejected": -89.98876953125, + "loss": 0.2939, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3564049005508423, + "rewards/margins": 0.2656104564666748, + "rewards/rejected": 1.0907944440841675, + "step": 3863 + }, + { + "epoch": 0.63, + "learning_rate": 8.035459079528244e-06, + "logits/chosen": -1.3334848880767822, + "logits/rejected": -1.2830647230148315, + "logps/chosen": -87.54318237304688, + "logps/rejected": -31.516910552978516, + "loss": 0.1859, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4013099670410156, + "rewards/margins": 1.3741008043289185, + "rewards/rejected": 1.0272091627120972, + "step": 3864 + }, + { + "epoch": 0.63, + "learning_rate": 8.034414626834754e-06, + "logits/chosen": -0.9313752055168152, + "logits/rejected": -0.9313752055168152, + "logps/chosen": -78.16693115234375, + "logps/rejected": -78.16693115234375, + "loss": 0.359, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.591996908187866, + "rewards/margins": 0.0, + "rewards/rejected": 3.591996908187866, + "step": 3865 + }, + { + "epoch": 0.63, + "learning_rate": 8.033369964491924e-06, + "logits/chosen": -1.005253553390503, + "logits/rejected": -1.0543372631072998, + "logps/chosen": -74.16902160644531, + "logps/rejected": -117.97035217285156, + "loss": 0.4848, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1059327125549316, + "rewards/margins": -0.19072270393371582, + "rewards/rejected": 3.2966554164886475, + "step": 3866 + }, + { + "epoch": 0.63, + "learning_rate": 8.032325092571932e-06, + "logits/chosen": -1.1084973812103271, + "logits/rejected": -1.1569000482559204, + "logps/chosen": -67.96658325195312, + "logps/rejected": -60.77545166015625, + "loss": 1.5042, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0989320278167725, + "rewards/margins": -2.911956548690796, + "rewards/rejected": 5.010888576507568, + "step": 3867 + }, + { + "epoch": 0.63, + "learning_rate": 8.031280011146968e-06, + "logits/chosen": -0.8056901097297668, + "logits/rejected": -1.0302643775939941, + "logps/chosen": -90.35702514648438, + "logps/rejected": -101.5574951171875, + "loss": 1.1472, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.984419345855713, + "rewards/margins": -0.8679611682891846, + "rewards/rejected": 3.8523805141448975, + "step": 3868 + }, + { + "epoch": 0.63, + "learning_rate": 8.030234720289237e-06, + "logits/chosen": -0.9599796533584595, + "logits/rejected": -0.9845519661903381, + "logps/chosen": -43.20631408691406, + "logps/rejected": -50.64037322998047, + "loss": 1.8625, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6601853370666504, + "rewards/margins": -0.1710357666015625, + "rewards/rejected": 2.831221103668213, + "step": 3869 + }, + { + "epoch": 0.63, + "learning_rate": 8.02918922007096e-06, + "logits/chosen": -1.0916357040405273, + "logits/rejected": -1.1454652547836304, + "logps/chosen": -77.55428314208984, + "logps/rejected": -174.52410888671875, + "loss": 0.4418, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3827400207519531, + "rewards/margins": -0.34844285249710083, + "rewards/rejected": 0.731182873249054, + "step": 3870 + }, + { + "epoch": 0.63, + "learning_rate": 8.02814351056437e-06, + "logits/chosen": -1.2140709161758423, + "logits/rejected": -1.206740379333496, + "logps/chosen": -33.100196838378906, + "logps/rejected": -25.548948287963867, + "loss": 0.4397, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3448326587677, + "rewards/margins": 1.3314855098724365, + "rewards/rejected": 2.0133471488952637, + "step": 3871 + }, + { + "epoch": 0.63, + "learning_rate": 8.027097591841715e-06, + "logits/chosen": -0.8842201232910156, + "logits/rejected": -0.920939028263092, + "logps/chosen": -40.473876953125, + "logps/rejected": -75.31053161621094, + "loss": 0.9052, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1512531042099, + "rewards/margins": -0.23880434036254883, + "rewards/rejected": 1.3900574445724487, + "step": 3872 + }, + { + "epoch": 0.63, + "learning_rate": 8.02605146397526e-06, + "logits/chosen": -1.1573799848556519, + "logits/rejected": -1.2008450031280518, + "logps/chosen": -66.51874542236328, + "logps/rejected": -55.403778076171875, + "loss": 0.1447, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9329330921173096, + "rewards/margins": 1.3043495416641235, + "rewards/rejected": 1.628583550453186, + "step": 3873 + }, + { + "epoch": 0.63, + "learning_rate": 8.025005127037282e-06, + "logits/chosen": -1.105791449546814, + "logits/rejected": -1.0888276100158691, + "logps/chosen": -43.04881286621094, + "logps/rejected": -61.82321548461914, + "loss": 0.7355, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.652712345123291, + "rewards/margins": 0.7432751655578613, + "rewards/rejected": 1.9094371795654297, + "step": 3874 + }, + { + "epoch": 0.63, + "learning_rate": 8.023958581100072e-06, + "logits/chosen": -1.3863756656646729, + "logits/rejected": -1.2600116729736328, + "logps/chosen": -129.2711944580078, + "logps/rejected": -71.60929870605469, + "loss": 0.0777, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.303773403167725, + "rewards/margins": 2.1274564266204834, + "rewards/rejected": 2.176316976547241, + "step": 3875 + }, + { + "epoch": 0.63, + "learning_rate": 8.022911826235938e-06, + "logits/chosen": -1.0422940254211426, + "logits/rejected": -1.0746876001358032, + "logps/chosen": -125.27010345458984, + "logps/rejected": -74.90644073486328, + "loss": 0.1326, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.108919620513916, + "rewards/margins": 1.3384758234024048, + "rewards/rejected": 1.7704437971115112, + "step": 3876 + }, + { + "epoch": 0.63, + "learning_rate": 8.021864862517197e-06, + "logits/chosen": -1.0017679929733276, + "logits/rejected": -1.0900150537490845, + "logps/chosen": -63.503238677978516, + "logps/rejected": -97.66822814941406, + "loss": 1.5543, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6786854267120361, + "rewards/margins": -2.26019549369812, + "rewards/rejected": 3.9388809204101562, + "step": 3877 + }, + { + "epoch": 0.63, + "learning_rate": 8.02081769001619e-06, + "logits/chosen": -1.046937108039856, + "logits/rejected": -1.0165008306503296, + "logps/chosen": -32.69873046875, + "logps/rejected": -55.80841827392578, + "loss": 0.7825, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5296554565429688, + "rewards/margins": -1.3273727893829346, + "rewards/rejected": 3.8570282459259033, + "step": 3878 + }, + { + "epoch": 0.63, + "learning_rate": 8.019770308805263e-06, + "logits/chosen": -0.7721019387245178, + "logits/rejected": -0.8054893612861633, + "logps/chosen": -2.9016122817993164, + "logps/rejected": -39.70745849609375, + "loss": 0.3265, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.856826901435852, + "rewards/margins": 0.6027299165725708, + "rewards/rejected": 0.25409698486328125, + "step": 3879 + }, + { + "epoch": 0.63, + "learning_rate": 8.01872271895678e-06, + "logits/chosen": -1.4929029941558838, + "logits/rejected": -1.3286052942276, + "logps/chosen": -126.7662124633789, + "logps/rejected": -82.7151870727539, + "loss": 0.3214, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.2539286613464355, + "rewards/margins": 0.11524534225463867, + "rewards/rejected": 5.138683319091797, + "step": 3880 + }, + { + "epoch": 0.63, + "learning_rate": 8.017674920543122e-06, + "logits/chosen": -0.9575188755989075, + "logits/rejected": -0.9010075330734253, + "logps/chosen": -135.09336853027344, + "logps/rejected": -70.26046752929688, + "loss": 0.7286, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.747656345367432, + "rewards/margins": 4.017193794250488, + "rewards/rejected": 1.730462670326233, + "step": 3881 + }, + { + "epoch": 0.63, + "learning_rate": 8.016626913636681e-06, + "logits/chosen": -1.2398712635040283, + "logits/rejected": -1.1356254816055298, + "logps/chosen": -58.112728118896484, + "logps/rejected": -102.0995864868164, + "loss": 1.1185, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0272603034973145, + "rewards/margins": -0.9110872745513916, + "rewards/rejected": 2.938347578048706, + "step": 3882 + }, + { + "epoch": 0.63, + "learning_rate": 8.015578698309862e-06, + "logits/chosen": -0.8768525123596191, + "logits/rejected": -1.1496461629867554, + "logps/chosen": -122.80526733398438, + "logps/rejected": -43.0878791809082, + "loss": 0.2269, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.842588901519775, + "rewards/margins": 1.1975879669189453, + "rewards/rejected": 3.64500093460083, + "step": 3883 + }, + { + "epoch": 0.63, + "learning_rate": 8.01453027463509e-06, + "logits/chosen": -1.192334532737732, + "logits/rejected": -1.2191168069839478, + "logps/chosen": -86.26319885253906, + "logps/rejected": -70.74578094482422, + "loss": 0.6719, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.440805196762085, + "rewards/margins": 0.2640542984008789, + "rewards/rejected": 2.176750898361206, + "step": 3884 + }, + { + "epoch": 0.63, + "learning_rate": 8.013481642684799e-06, + "logits/chosen": -0.858206033706665, + "logits/rejected": -0.8500808477401733, + "logps/chosen": -70.78958129882812, + "logps/rejected": -27.90627098083496, + "loss": 3.1815, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4915001392364502, + "rewards/margins": -0.14717042446136475, + "rewards/rejected": 1.638670563697815, + "step": 3885 + }, + { + "epoch": 0.63, + "learning_rate": 8.01243280253144e-06, + "logits/chosen": -0.6963539719581604, + "logits/rejected": -0.7123043537139893, + "logps/chosen": -13.527975082397461, + "logps/rejected": -1.776121973991394, + "loss": 1.0561, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.12681542336940765, + "rewards/margins": -0.3254290223121643, + "rewards/rejected": 0.19861361384391785, + "step": 3886 + }, + { + "epoch": 0.63, + "learning_rate": 8.011383754247479e-06, + "logits/chosen": -1.0664963722229004, + "logits/rejected": -0.9185314178466797, + "logps/chosen": -107.86825561523438, + "logps/rejected": -47.80872344970703, + "loss": 0.0284, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.254605293273926, + "rewards/margins": 3.0277702808380127, + "rewards/rejected": 2.226835012435913, + "step": 3887 + }, + { + "epoch": 0.63, + "learning_rate": 8.010334497905394e-06, + "logits/chosen": -1.2973963022232056, + "logits/rejected": -1.2918612957000732, + "logps/chosen": -105.79174041748047, + "logps/rejected": -76.85855102539062, + "loss": 0.9836, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1904098987579346, + "rewards/margins": -0.44927752017974854, + "rewards/rejected": 1.639687418937683, + "step": 3888 + }, + { + "epoch": 0.63, + "learning_rate": 8.00928503357768e-06, + "logits/chosen": -1.2588108777999878, + "logits/rejected": -1.312889575958252, + "logps/chosen": -169.4742431640625, + "logps/rejected": -160.01649475097656, + "loss": 1.5646, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.67024564743042, + "rewards/margins": -3.0839004516601562, + "rewards/rejected": 7.754146099090576, + "step": 3889 + }, + { + "epoch": 0.63, + "learning_rate": 8.008235361336845e-06, + "logits/chosen": -1.1851214170455933, + "logits/rejected": -1.1738407611846924, + "logps/chosen": -60.13782501220703, + "logps/rejected": -58.525779724121094, + "loss": 0.1586, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6341941356658936, + "rewards/margins": 0.9909605979919434, + "rewards/rejected": 1.6432335376739502, + "step": 3890 + }, + { + "epoch": 0.63, + "learning_rate": 8.007185481255409e-06, + "logits/chosen": -1.1152063608169556, + "logits/rejected": -1.0981212854385376, + "logps/chosen": -64.27340698242188, + "logps/rejected": -81.6257553100586, + "loss": 0.2965, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2475967407226562, + "rewards/margins": 0.24634003639221191, + "rewards/rejected": 3.0012567043304443, + "step": 3891 + }, + { + "epoch": 0.63, + "learning_rate": 8.006135393405911e-06, + "logits/chosen": -1.148466944694519, + "logits/rejected": -1.110164999961853, + "logps/chosen": -69.2882080078125, + "logps/rejected": -70.12052917480469, + "loss": 0.4722, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0121140480041504, + "rewards/margins": 0.2161719799041748, + "rewards/rejected": 2.7959420680999756, + "step": 3892 + }, + { + "epoch": 0.63, + "learning_rate": 8.005085097860903e-06, + "logits/chosen": -1.1186258792877197, + "logits/rejected": -1.1186258792877197, + "logps/chosen": -45.25385284423828, + "logps/rejected": -45.25385284423828, + "loss": 2.5169, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0293662548065186, + "rewards/margins": 0.0, + "rewards/rejected": 2.0293662548065186, + "step": 3893 + }, + { + "epoch": 0.63, + "learning_rate": 8.004034594692946e-06, + "logits/chosen": -0.9615347981452942, + "logits/rejected": -0.9086998105049133, + "logps/chosen": -54.059505462646484, + "logps/rejected": -56.42742919921875, + "loss": 0.1686, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2309911251068115, + "rewards/margins": 1.2716526985168457, + "rewards/rejected": 1.9593384265899658, + "step": 3894 + }, + { + "epoch": 0.63, + "learning_rate": 8.002983883974625e-06, + "logits/chosen": -0.8196134567260742, + "logits/rejected": -0.8196134567260742, + "logps/chosen": -79.04096984863281, + "logps/rejected": -79.04096984863281, + "loss": 0.3675, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5005416870117188, + "rewards/margins": 0.0, + "rewards/rejected": 2.5005416870117188, + "step": 3895 + }, + { + "epoch": 0.63, + "learning_rate": 8.001932965778531e-06, + "logits/chosen": -1.1786677837371826, + "logits/rejected": -1.1468350887298584, + "logps/chosen": -52.1712532043457, + "logps/rejected": -10.22407341003418, + "loss": 0.2875, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1736767292022705, + "rewards/margins": 1.4541881084442139, + "rewards/rejected": 0.7194885611534119, + "step": 3896 + }, + { + "epoch": 0.63, + "learning_rate": 8.000881840177276e-06, + "logits/chosen": -1.4119873046875, + "logits/rejected": -1.446589708328247, + "logps/chosen": -136.1309814453125, + "logps/rejected": -170.109130859375, + "loss": 1.7628, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8017303943634033, + "rewards/margins": -3.2845003604888916, + "rewards/rejected": 5.086230754852295, + "step": 3897 + }, + { + "epoch": 0.63, + "learning_rate": 7.999830507243478e-06, + "logits/chosen": -1.17661452293396, + "logits/rejected": -1.1267597675323486, + "logps/chosen": -98.28992462158203, + "logps/rejected": -79.19500732421875, + "loss": 0.2736, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.998081207275391, + "rewards/margins": 0.3728294372558594, + "rewards/rejected": 4.625251770019531, + "step": 3898 + }, + { + "epoch": 0.63, + "learning_rate": 7.998778967049778e-06, + "logits/chosen": -1.2459473609924316, + "logits/rejected": -1.2040743827819824, + "logps/chosen": -48.98857116699219, + "logps/rejected": -53.22815704345703, + "loss": 0.6406, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.00590443611145, + "rewards/margins": 0.22234570980072021, + "rewards/rejected": 1.78355872631073, + "step": 3899 + }, + { + "epoch": 0.63, + "learning_rate": 7.997727219668827e-06, + "logits/chosen": -1.1462806463241577, + "logits/rejected": -1.170288324356079, + "logps/chosen": -86.39791870117188, + "logps/rejected": -98.39691162109375, + "loss": 2.2666, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.800088405609131, + "rewards/margins": -4.485873699188232, + "rewards/rejected": 9.285962104797363, + "step": 3900 + }, + { + "epoch": 0.63, + "learning_rate": 7.996675265173289e-06, + "logits/chosen": -1.0496052503585815, + "logits/rejected": -0.918592631816864, + "logps/chosen": -54.65163040161133, + "logps/rejected": -56.99639892578125, + "loss": 1.8735, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0143635272979736, + "rewards/margins": -1.1158185005187988, + "rewards/rejected": 3.1301820278167725, + "step": 3901 + }, + { + "epoch": 0.63, + "learning_rate": 7.995623103635843e-06, + "logits/chosen": -1.0204344987869263, + "logits/rejected": -0.9567539095878601, + "logps/chosen": -176.12733459472656, + "logps/rejected": -123.05762481689453, + "loss": 0.1683, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6007461547851562, + "rewards/margins": 1.4563560485839844, + "rewards/rejected": 1.1443901062011719, + "step": 3902 + }, + { + "epoch": 0.63, + "learning_rate": 7.994570735129188e-06, + "logits/chosen": -1.522457242012024, + "logits/rejected": -1.2473224401474, + "logps/chosen": -135.5847625732422, + "logps/rejected": -40.23115539550781, + "loss": 0.096, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.790966987609863, + "rewards/margins": 4.249439239501953, + "rewards/rejected": 0.5415279269218445, + "step": 3903 + }, + { + "epoch": 0.63, + "learning_rate": 7.993518159726028e-06, + "logits/chosen": -0.986151397228241, + "logits/rejected": -1.0575637817382812, + "logps/chosen": -80.84556579589844, + "logps/rejected": -114.83753967285156, + "loss": 0.3482, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4479012489318848, + "rewards/margins": 0.030091047286987305, + "rewards/rejected": 2.4178102016448975, + "step": 3904 + }, + { + "epoch": 0.63, + "learning_rate": 7.99246537749909e-06, + "logits/chosen": -1.172184705734253, + "logits/rejected": -1.1823476552963257, + "logps/chosen": -46.02006530761719, + "logps/rejected": -108.80404663085938, + "loss": 0.5177, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4456093311309814, + "rewards/margins": -0.562593936920166, + "rewards/rejected": 2.0082032680511475, + "step": 3905 + }, + { + "epoch": 0.63, + "learning_rate": 7.991412388521108e-06, + "logits/chosen": -1.311855673789978, + "logits/rejected": -1.1927249431610107, + "logps/chosen": -162.5841064453125, + "logps/rejected": -15.889554023742676, + "loss": 0.0337, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.074163913726807, + "rewards/margins": 4.509269714355469, + "rewards/rejected": 0.5648941993713379, + "step": 3906 + }, + { + "epoch": 0.63, + "learning_rate": 7.990359192864837e-06, + "logits/chosen": -0.756827175617218, + "logits/rejected": -0.756827175617218, + "logps/chosen": -33.81507873535156, + "logps/rejected": -33.81507873535156, + "loss": 1.1653, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3555946350097656, + "rewards/margins": 0.0, + "rewards/rejected": 2.3555946350097656, + "step": 3907 + }, + { + "epoch": 0.63, + "learning_rate": 7.989305790603038e-06, + "logits/chosen": -0.9731307029724121, + "logits/rejected": -0.9928808212280273, + "logps/chosen": -68.4115219116211, + "logps/rejected": -90.65800476074219, + "loss": 1.1586, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.123617649078369, + "rewards/margins": -0.8121383190155029, + "rewards/rejected": 2.935755968093872, + "step": 3908 + }, + { + "epoch": 0.63, + "learning_rate": 7.988252181808495e-06, + "logits/chosen": -1.1539840698242188, + "logits/rejected": -1.1769720315933228, + "logps/chosen": -103.62867736816406, + "logps/rejected": -147.72486877441406, + "loss": 0.3672, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.376405239105225, + "rewards/margins": 1.6556439399719238, + "rewards/rejected": 4.720761299133301, + "step": 3909 + }, + { + "epoch": 0.63, + "learning_rate": 7.987198366554002e-06, + "logits/chosen": -1.2539280652999878, + "logits/rejected": -1.1716023683547974, + "logps/chosen": -70.82785034179688, + "logps/rejected": -50.38145446777344, + "loss": 0.2096, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.681898593902588, + "rewards/margins": 0.996717095375061, + "rewards/rejected": 1.6851814985275269, + "step": 3910 + }, + { + "epoch": 0.63, + "learning_rate": 7.986144344912367e-06, + "logits/chosen": -1.0389537811279297, + "logits/rejected": -0.9863147139549255, + "logps/chosen": -67.27851867675781, + "logps/rejected": -44.840694427490234, + "loss": 0.4877, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4322525262832642, + "rewards/margins": 0.62557452917099, + "rewards/rejected": 0.8066779971122742, + "step": 3911 + }, + { + "epoch": 0.63, + "learning_rate": 7.985090116956412e-06, + "logits/chosen": -1.0835249423980713, + "logits/rejected": -1.0784136056900024, + "logps/chosen": -216.41709899902344, + "logps/rejected": -48.02265167236328, + "loss": 0.313, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.429698467254639, + "rewards/margins": 3.3731393814086914, + "rewards/rejected": 2.0565590858459473, + "step": 3912 + }, + { + "epoch": 0.64, + "learning_rate": 7.984035682758975e-06, + "logits/chosen": -1.2315086126327515, + "logits/rejected": -1.1284019947052002, + "logps/chosen": -82.92277526855469, + "logps/rejected": -45.21551513671875, + "loss": 0.0627, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.3418169021606445, + "rewards/margins": 2.2067155838012695, + "rewards/rejected": 2.135101318359375, + "step": 3913 + }, + { + "epoch": 0.64, + "learning_rate": 7.982981042392907e-06, + "logits/chosen": -0.8247467875480652, + "logits/rejected": -0.8577926158905029, + "logps/chosen": -40.30710983276367, + "logps/rejected": -60.97340393066406, + "loss": 0.3519, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0993905067443848, + "rewards/margins": -0.0172882080078125, + "rewards/rejected": 2.1166787147521973, + "step": 3914 + }, + { + "epoch": 0.64, + "learning_rate": 7.981926195931077e-06, + "logits/chosen": -0.8307114243507385, + "logits/rejected": -0.8922058343887329, + "logps/chosen": -68.06886291503906, + "logps/rejected": -81.49052429199219, + "loss": 1.9877, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7630136013031006, + "rewards/margins": -3.9211418628692627, + "rewards/rejected": 6.684155464172363, + "step": 3915 + }, + { + "epoch": 0.64, + "learning_rate": 7.98087114344636e-06, + "logits/chosen": -1.0090043544769287, + "logits/rejected": -0.9991745352745056, + "logps/chosen": -51.97058868408203, + "logps/rejected": -102.85595703125, + "loss": 1.1271, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.431047797203064, + "rewards/margins": 0.08422160148620605, + "rewards/rejected": 1.346826195716858, + "step": 3916 + }, + { + "epoch": 0.64, + "learning_rate": 7.979815885011652e-06, + "logits/chosen": -0.7787067294120789, + "logits/rejected": -0.7795971035957336, + "logps/chosen": -1.1812453269958496, + "logps/rejected": -1.96426260471344, + "loss": 0.6148, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13181547820568085, + "rewards/margins": 0.03250937908887863, + "rewards/rejected": 0.09930609911680222, + "step": 3917 + }, + { + "epoch": 0.64, + "learning_rate": 7.978760420699863e-06, + "logits/chosen": -0.8268766403198242, + "logits/rejected": -0.7462208271026611, + "logps/chosen": -40.344696044921875, + "logps/rejected": -32.68259811401367, + "loss": 0.7217, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1165788173675537, + "rewards/margins": 0.2664353847503662, + "rewards/rejected": 1.8501434326171875, + "step": 3918 + }, + { + "epoch": 0.64, + "learning_rate": 7.977704750583915e-06, + "logits/chosen": -1.032964825630188, + "logits/rejected": -1.0890675783157349, + "logps/chosen": -15.122289657592773, + "logps/rejected": -32.06405258178711, + "loss": 1.1677, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3939733505249023, + "rewards/margins": -2.1080358028411865, + "rewards/rejected": 3.502009153366089, + "step": 3919 + }, + { + "epoch": 0.64, + "learning_rate": 7.976648874736743e-06, + "logits/chosen": -1.011427402496338, + "logits/rejected": -1.0562167167663574, + "logps/chosen": -67.77769470214844, + "logps/rejected": -78.72908020019531, + "loss": 1.0337, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8952209949493408, + "rewards/margins": -1.916449785232544, + "rewards/rejected": 3.8116707801818848, + "step": 3920 + }, + { + "epoch": 0.64, + "learning_rate": 7.975592793231298e-06, + "logits/chosen": -0.8077276349067688, + "logits/rejected": -0.8405191898345947, + "logps/chosen": -31.8258113861084, + "logps/rejected": -52.07002258300781, + "loss": 1.7687, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4446130990982056, + "rewards/margins": -0.1137533187866211, + "rewards/rejected": 1.5583664178848267, + "step": 3921 + }, + { + "epoch": 0.64, + "learning_rate": 7.974536506140546e-06, + "logits/chosen": -1.3241573572158813, + "logits/rejected": -1.1076637506484985, + "logps/chosen": -134.78488159179688, + "logps/rejected": -27.82278823852539, + "loss": 0.0302, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.209677219390869, + "rewards/margins": 3.8948278427124023, + "rewards/rejected": 1.3148494958877563, + "step": 3922 + }, + { + "epoch": 0.64, + "learning_rate": 7.97348001353747e-06, + "logits/chosen": -1.2775737047195435, + "logits/rejected": -1.2801860570907593, + "logps/chosen": -87.81983947753906, + "logps/rejected": -73.52857971191406, + "loss": 1.086, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2353867292404175, + "rewards/margins": -2.0036978721618652, + "rewards/rejected": 3.2390847206115723, + "step": 3923 + }, + { + "epoch": 0.64, + "learning_rate": 7.972423315495058e-06, + "logits/chosen": -1.442212700843811, + "logits/rejected": -1.385146141052246, + "logps/chosen": -96.8829345703125, + "logps/rejected": -32.1663932800293, + "loss": 0.0203, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7007477283477783, + "rewards/margins": 3.310476303100586, + "rewards/rejected": -0.6097284555435181, + "step": 3924 + }, + { + "epoch": 0.64, + "learning_rate": 7.97136641208632e-06, + "logits/chosen": -0.7617202401161194, + "logits/rejected": -0.7287135720252991, + "logps/chosen": -68.15528869628906, + "logps/rejected": -64.32286834716797, + "loss": 0.9399, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6286888122558594, + "rewards/margins": 1.0228652954101562, + "rewards/rejected": 1.6058235168457031, + "step": 3925 + }, + { + "epoch": 0.64, + "learning_rate": 7.970309303384278e-06, + "logits/chosen": -1.1655503511428833, + "logits/rejected": -0.9124324917793274, + "logps/chosen": -120.63235473632812, + "logps/rejected": -16.268226623535156, + "loss": 0.0281, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.9866361618042, + "rewards/margins": 8.289884567260742, + "rewards/rejected": 0.6967514157295227, + "step": 3926 + }, + { + "epoch": 0.64, + "learning_rate": 7.969251989461969e-06, + "logits/chosen": -0.8862631916999817, + "logits/rejected": -0.8862631916999817, + "logps/chosen": -72.84652709960938, + "logps/rejected": -72.84652709960938, + "loss": 0.3677, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8129814863204956, + "rewards/margins": 0.0, + "rewards/rejected": 1.8129814863204956, + "step": 3927 + }, + { + "epoch": 0.64, + "learning_rate": 7.968194470392444e-06, + "logits/chosen": -0.7736749053001404, + "logits/rejected": -0.923392117023468, + "logps/chosen": -118.40658569335938, + "logps/rejected": -139.28802490234375, + "loss": 1.1249, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.545355200767517, + "rewards/margins": -2.1295838356018066, + "rewards/rejected": 3.674938917160034, + "step": 3928 + }, + { + "epoch": 0.64, + "learning_rate": 7.967136746248765e-06, + "logits/chosen": -1.3484920263290405, + "logits/rejected": -1.3262574672698975, + "logps/chosen": -30.792011260986328, + "logps/rejected": -32.240535736083984, + "loss": 0.2561, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2715176343917847, + "rewards/margins": 0.5879849195480347, + "rewards/rejected": 0.68353271484375, + "step": 3929 + }, + { + "epoch": 0.64, + "learning_rate": 7.966078817104012e-06, + "logits/chosen": -0.517558217048645, + "logits/rejected": -0.44459497928619385, + "logps/chosen": -38.43621826171875, + "logps/rejected": -47.43586730957031, + "loss": 0.3562, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3581318855285645, + "rewards/margins": 1.6618072986602783, + "rewards/rejected": 0.6963245272636414, + "step": 3930 + }, + { + "epoch": 0.64, + "learning_rate": 7.965020683031279e-06, + "logits/chosen": -1.0445868968963623, + "logits/rejected": -1.0802810192108154, + "logps/chosen": -49.88719940185547, + "logps/rejected": -51.19607162475586, + "loss": 0.2328, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.029170274734497, + "rewards/margins": 0.5584064722061157, + "rewards/rejected": 1.4707638025283813, + "step": 3931 + }, + { + "epoch": 0.64, + "learning_rate": 7.96396234410367e-06, + "logits/chosen": -1.0111716985702515, + "logits/rejected": -0.8668968677520752, + "logps/chosen": -106.273681640625, + "logps/rejected": -25.027095794677734, + "loss": 0.2598, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6045563220977783, + "rewards/margins": 1.7169475555419922, + "rewards/rejected": 0.8876087069511414, + "step": 3932 + }, + { + "epoch": 0.64, + "learning_rate": 7.96290380039431e-06, + "logits/chosen": -1.3573275804519653, + "logits/rejected": -1.328244686126709, + "logps/chosen": -150.3635711669922, + "logps/rejected": -113.46847534179688, + "loss": 0.0877, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3909454345703125, + "rewards/margins": 2.1003310680389404, + "rewards/rejected": 1.290614366531372, + "step": 3933 + }, + { + "epoch": 0.64, + "learning_rate": 7.961845051976334e-06, + "logits/chosen": -0.8762831091880798, + "logits/rejected": -0.9454395174980164, + "logps/chosen": -56.493404388427734, + "logps/rejected": -119.1718978881836, + "loss": 0.6238, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0061633586883545, + "rewards/margins": 0.6112498044967651, + "rewards/rejected": 1.3949135541915894, + "step": 3934 + }, + { + "epoch": 0.64, + "learning_rate": 7.960786098922888e-06, + "logits/chosen": -1.1805833578109741, + "logits/rejected": -1.041447639465332, + "logps/chosen": -46.00171661376953, + "logps/rejected": -19.890039443969727, + "loss": 0.3592, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0884109735488892, + "rewards/margins": 0.27163374423980713, + "rewards/rejected": 0.816777229309082, + "step": 3935 + }, + { + "epoch": 0.64, + "learning_rate": 7.959726941307137e-06, + "logits/chosen": -1.2111821174621582, + "logits/rejected": -1.2090824842453003, + "logps/chosen": -147.81173706054688, + "logps/rejected": -108.01589965820312, + "loss": 0.8258, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0754287242889404, + "rewards/margins": -0.6413743495941162, + "rewards/rejected": 2.7168030738830566, + "step": 3936 + }, + { + "epoch": 0.64, + "learning_rate": 7.958667579202261e-06, + "logits/chosen": -1.059792160987854, + "logits/rejected": -0.9528894424438477, + "logps/chosen": -124.79464721679688, + "logps/rejected": -48.24236297607422, + "loss": 0.2482, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.315512180328369, + "rewards/margins": 0.4696950912475586, + "rewards/rejected": 4.8458170890808105, + "step": 3937 + }, + { + "epoch": 0.64, + "learning_rate": 7.957608012681452e-06, + "logits/chosen": -1.0187121629714966, + "logits/rejected": -1.095762014389038, + "logps/chosen": -50.08721160888672, + "logps/rejected": -96.52543640136719, + "loss": 0.3646, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9217705726623535, + "rewards/margins": 0.4046974182128906, + "rewards/rejected": 3.517073154449463, + "step": 3938 + }, + { + "epoch": 0.64, + "learning_rate": 7.956548241817914e-06, + "logits/chosen": -1.1901154518127441, + "logits/rejected": -1.1198997497558594, + "logps/chosen": -39.643531799316406, + "logps/rejected": -30.473587036132812, + "loss": 1.5868, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9224671125411987, + "rewards/margins": -0.7723456621170044, + "rewards/rejected": 2.694812774658203, + "step": 3939 + }, + { + "epoch": 0.64, + "learning_rate": 7.955488266684865e-06, + "logits/chosen": -1.3241033554077148, + "logits/rejected": -1.3163889646530151, + "logps/chosen": -188.32028198242188, + "logps/rejected": -107.58123016357422, + "loss": 0.7747, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.563581943511963, + "rewards/margins": -0.7336158752441406, + "rewards/rejected": 7.2971978187561035, + "step": 3940 + }, + { + "epoch": 0.64, + "learning_rate": 7.954428087355544e-06, + "logits/chosen": -1.2573180198669434, + "logits/rejected": -0.9431111216545105, + "logps/chosen": -134.4493865966797, + "logps/rejected": -27.832611083984375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.809074401855469, + "rewards/margins": 7.702698230743408, + "rewards/rejected": 0.1063762679696083, + "step": 3941 + }, + { + "epoch": 0.64, + "learning_rate": 7.953367703903196e-06, + "logits/chosen": -0.9394932985305786, + "logits/rejected": -0.9013490080833435, + "logps/chosen": -43.01353073120117, + "logps/rejected": -2.323066234588623, + "loss": 0.6533, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.004383087158203125, + "rewards/margins": -0.4192967414855957, + "rewards/rejected": 0.42367982864379883, + "step": 3942 + }, + { + "epoch": 0.64, + "learning_rate": 7.952307116401086e-06, + "logits/chosen": -1.410462737083435, + "logits/rejected": -1.3401918411254883, + "logps/chosen": -128.85067749023438, + "logps/rejected": -188.73275756835938, + "loss": 0.6298, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.360970973968506, + "rewards/margins": -0.9225039482116699, + "rewards/rejected": 6.283474922180176, + "step": 3943 + }, + { + "epoch": 0.64, + "learning_rate": 7.951246324922488e-06, + "logits/chosen": -1.2993173599243164, + "logits/rejected": -1.2312209606170654, + "logps/chosen": -77.37747955322266, + "logps/rejected": -110.51795196533203, + "loss": 0.2838, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.148256778717041, + "rewards/margins": 0.689971923828125, + "rewards/rejected": 2.458284854888916, + "step": 3944 + }, + { + "epoch": 0.64, + "learning_rate": 7.950185329540693e-06, + "logits/chosen": -1.1488929986953735, + "logits/rejected": -1.0537062883377075, + "logps/chosen": -124.36514282226562, + "logps/rejected": -175.27972412109375, + "loss": 0.4206, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.396005630493164, + "rewards/margins": 0.03506183624267578, + "rewards/rejected": 8.360943794250488, + "step": 3945 + }, + { + "epoch": 0.64, + "learning_rate": 7.94912413032901e-06, + "logits/chosen": -1.6511693000793457, + "logits/rejected": -1.6924139261245728, + "logps/chosen": -152.9324188232422, + "logps/rejected": -77.10649871826172, + "loss": 1.3346, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9988784790039062, + "rewards/margins": -2.3912668228149414, + "rewards/rejected": 4.390145301818848, + "step": 3946 + }, + { + "epoch": 0.64, + "learning_rate": 7.948062727360753e-06, + "logits/chosen": -0.9977867603302002, + "logits/rejected": -1.0358580350875854, + "logps/chosen": -76.70394897460938, + "logps/rejected": -103.72989654541016, + "loss": 1.3862, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5822579860687256, + "rewards/margins": -2.1688153743743896, + "rewards/rejected": 5.751073360443115, + "step": 3947 + }, + { + "epoch": 0.64, + "learning_rate": 7.947001120709254e-06, + "logits/chosen": -1.043853759765625, + "logits/rejected": -1.1753572225570679, + "logps/chosen": -84.68710327148438, + "logps/rejected": -123.813720703125, + "loss": 1.4175, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.377707004547119, + "rewards/margins": -1.2503721714019775, + "rewards/rejected": 3.6280791759490967, + "step": 3948 + }, + { + "epoch": 0.64, + "learning_rate": 7.945939310447866e-06, + "logits/chosen": -1.0820263624191284, + "logits/rejected": -1.155655026435852, + "logps/chosen": -69.81076049804688, + "logps/rejected": -79.92855834960938, + "loss": 1.7575, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.718426465988159, + "rewards/margins": -1.9484031200408936, + "rewards/rejected": 4.666829586029053, + "step": 3949 + }, + { + "epoch": 0.64, + "learning_rate": 7.944877296649945e-06, + "logits/chosen": -1.3669850826263428, + "logits/rejected": -1.402596354484558, + "logps/chosen": -91.39867401123047, + "logps/rejected": -99.92427062988281, + "loss": 0.6127, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9481712579727173, + "rewards/margins": -0.868360161781311, + "rewards/rejected": 2.8165314197540283, + "step": 3950 + }, + { + "epoch": 0.64, + "learning_rate": 7.943815079388867e-06, + "logits/chosen": -0.9090657234191895, + "logits/rejected": -1.0200115442276, + "logps/chosen": -55.10049057006836, + "logps/rejected": -94.78812408447266, + "loss": 2.2493, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4186710119247437, + "rewards/margins": -2.777653694152832, + "rewards/rejected": 4.196324825286865, + "step": 3951 + }, + { + "epoch": 0.64, + "learning_rate": 7.942752658738023e-06, + "logits/chosen": -0.839972198009491, + "logits/rejected": -0.9806963801383972, + "logps/chosen": -75.2467041015625, + "logps/rejected": -115.29837799072266, + "loss": 1.5341, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7124404907226562, + "rewards/margins": -3.0194649696350098, + "rewards/rejected": 5.731905460357666, + "step": 3952 + }, + { + "epoch": 0.64, + "learning_rate": 7.941690034770812e-06, + "logits/chosen": -1.2347304821014404, + "logits/rejected": -1.3194833993911743, + "logps/chosen": -81.00797271728516, + "logps/rejected": -89.92912292480469, + "loss": 1.7793, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8173927068710327, + "rewards/margins": -2.9765024185180664, + "rewards/rejected": 4.793895244598389, + "step": 3953 + }, + { + "epoch": 0.64, + "learning_rate": 7.940627207560655e-06, + "logits/chosen": -0.8283079266548157, + "logits/rejected": -0.8819118738174438, + "logps/chosen": -56.4329948425293, + "logps/rejected": -57.54632568359375, + "loss": 0.8199, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.445058822631836, + "rewards/margins": -1.3394412994384766, + "rewards/rejected": 2.7845001220703125, + "step": 3954 + }, + { + "epoch": 0.64, + "learning_rate": 7.939564177180986e-06, + "logits/chosen": -1.302520513534546, + "logits/rejected": -1.450595498085022, + "logps/chosen": -43.754608154296875, + "logps/rejected": -125.87577819824219, + "loss": 2.0685, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.218065023422241, + "rewards/margins": -4.119610786437988, + "rewards/rejected": 6.33767557144165, + "step": 3955 + }, + { + "epoch": 0.64, + "learning_rate": 7.938500943705243e-06, + "logits/chosen": -0.7701758742332458, + "logits/rejected": -0.7668618559837341, + "logps/chosen": -2.1263034343719482, + "logps/rejected": -1.6939724683761597, + "loss": 0.8095, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23129451274871826, + "rewards/margins": 0.08425416052341461, + "rewards/rejected": 0.14704035222530365, + "step": 3956 + }, + { + "epoch": 0.64, + "learning_rate": 7.937437507206889e-06, + "logits/chosen": -0.7272621989250183, + "logits/rejected": -0.7220442891120911, + "logps/chosen": -21.67666244506836, + "logps/rejected": -18.555137634277344, + "loss": 0.5536, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3682266175746918, + "rewards/margins": -0.5729926824569702, + "rewards/rejected": 0.9412193298339844, + "step": 3957 + }, + { + "epoch": 0.64, + "learning_rate": 7.9363738677594e-06, + "logits/chosen": -1.1656721830368042, + "logits/rejected": -0.8141342401504517, + "logps/chosen": -86.23736572265625, + "logps/rejected": -63.58132553100586, + "loss": 0.9089, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2350265979766846, + "rewards/margins": 0.9253727197647095, + "rewards/rejected": 1.309653878211975, + "step": 3958 + }, + { + "epoch": 0.64, + "learning_rate": 7.935310025436258e-06, + "logits/chosen": -1.3381702899932861, + "logits/rejected": -1.3383344411849976, + "logps/chosen": -98.57009887695312, + "logps/rejected": -113.5128173828125, + "loss": 0.4268, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3969223499298096, + "rewards/margins": 0.4467254877090454, + "rewards/rejected": 0.9501968622207642, + "step": 3959 + }, + { + "epoch": 0.64, + "learning_rate": 7.93424598031097e-06, + "logits/chosen": -1.0381118059158325, + "logits/rejected": -0.9945800304412842, + "logps/chosen": -102.15708923339844, + "logps/rejected": -64.29874420166016, + "loss": 1.4336, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8023300170898438, + "rewards/margins": 0.1294463872909546, + "rewards/rejected": 0.6728836297988892, + "step": 3960 + }, + { + "epoch": 0.64, + "learning_rate": 7.933181732457047e-06, + "logits/chosen": -1.3916282653808594, + "logits/rejected": -1.4506914615631104, + "logps/chosen": -125.54893493652344, + "logps/rejected": -78.64080810546875, + "loss": 1.1318, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.4991455078125, + "rewards/margins": -0.35180044174194336, + "rewards/rejected": 5.850945949554443, + "step": 3961 + }, + { + "epoch": 0.64, + "learning_rate": 7.932117281948021e-06, + "logits/chosen": -1.1359034776687622, + "logits/rejected": -1.0602275133132935, + "logps/chosen": -131.6271209716797, + "logps/rejected": -70.04796600341797, + "loss": 0.4528, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.403886318206787, + "rewards/margins": -0.3735942840576172, + "rewards/rejected": 7.777480602264404, + "step": 3962 + }, + { + "epoch": 0.64, + "learning_rate": 7.931052628857436e-06, + "logits/chosen": -1.1116169691085815, + "logits/rejected": -1.1210256814956665, + "logps/chosen": -62.80646896362305, + "logps/rejected": -141.20440673828125, + "loss": 1.7584, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2630306482315063, + "rewards/margins": -1.006686806678772, + "rewards/rejected": 2.2697174549102783, + "step": 3963 + }, + { + "epoch": 0.64, + "learning_rate": 7.929987773258847e-06, + "logits/chosen": -0.8175296187400818, + "logits/rejected": -0.8182263374328613, + "logps/chosen": -5.9316020011901855, + "logps/rejected": -2.1973447799682617, + "loss": 1.1464, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.38647791743278503, + "rewards/margins": -0.238498717546463, + "rewards/rejected": 0.624976634979248, + "step": 3964 + }, + { + "epoch": 0.64, + "learning_rate": 7.928922715225827e-06, + "logits/chosen": -0.9481043815612793, + "logits/rejected": -0.9481043815612793, + "logps/chosen": -55.202613830566406, + "logps/rejected": -55.202613830566406, + "loss": 0.6826, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.16536369919776917, + "rewards/margins": 0.0, + "rewards/rejected": 0.16536369919776917, + "step": 3965 + }, + { + "epoch": 0.64, + "learning_rate": 7.92785745483196e-06, + "logits/chosen": -1.3662585020065308, + "logits/rejected": -1.3662585020065308, + "logps/chosen": -41.95309066772461, + "logps/rejected": -41.95309066772461, + "loss": 0.9372, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6311241388320923, + "rewards/margins": 0.0, + "rewards/rejected": 0.6311241388320923, + "step": 3966 + }, + { + "epoch": 0.64, + "learning_rate": 7.926791992150849e-06, + "logits/chosen": -0.8738020062446594, + "logits/rejected": -0.8101748824119568, + "logps/chosen": -94.3487777709961, + "logps/rejected": -41.799198150634766, + "loss": 0.233, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5255165100097656, + "rewards/margins": 0.5454097390174866, + "rewards/rejected": 0.980106770992279, + "step": 3967 + }, + { + "epoch": 0.64, + "learning_rate": 7.925726327256103e-06, + "logits/chosen": -1.123016119003296, + "logits/rejected": -1.105050802230835, + "logps/chosen": -61.73390197753906, + "logps/rejected": -80.021728515625, + "loss": 0.3024, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1299827098846436, + "rewards/margins": 0.24417340755462646, + "rewards/rejected": 1.885809302330017, + "step": 3968 + }, + { + "epoch": 0.64, + "learning_rate": 7.924660460221352e-06, + "logits/chosen": -0.9141106009483337, + "logits/rejected": -0.7782359719276428, + "logps/chosen": -73.48663330078125, + "logps/rejected": -61.804527282714844, + "loss": 0.0275, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.928719997406006, + "rewards/margins": 3.314070701599121, + "rewards/rejected": 2.6146492958068848, + "step": 3969 + }, + { + "epoch": 0.64, + "learning_rate": 7.923594391120237e-06, + "logits/chosen": -1.1193552017211914, + "logits/rejected": -1.020933985710144, + "logps/chosen": -176.47006225585938, + "logps/rejected": -148.9426727294922, + "loss": 1.1195, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.841238498687744, + "rewards/margins": 0.9060103893280029, + "rewards/rejected": 3.935228109359741, + "step": 3970 + }, + { + "epoch": 0.64, + "learning_rate": 7.92252812002641e-06, + "logits/chosen": -1.2624343633651733, + "logits/rejected": -1.3306198120117188, + "logps/chosen": -88.30288696289062, + "logps/rejected": -129.6583251953125, + "loss": 0.5973, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1830841302871704, + "rewards/margins": -0.8306335210800171, + "rewards/rejected": 2.0137176513671875, + "step": 3971 + }, + { + "epoch": 0.64, + "learning_rate": 7.921461647013546e-06, + "logits/chosen": -1.4741336107254028, + "logits/rejected": -1.4746692180633545, + "logps/chosen": -83.59627532958984, + "logps/rejected": -157.1182098388672, + "loss": 2.799, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3719300031661987, + "rewards/margins": -2.731187343597412, + "rewards/rejected": 4.1031174659729, + "step": 3972 + }, + { + "epoch": 0.64, + "learning_rate": 7.920394972155326e-06, + "logits/chosen": -1.061444640159607, + "logits/rejected": -1.0330325365066528, + "logps/chosen": -93.93789672851562, + "logps/rejected": -67.18348693847656, + "loss": 0.1306, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.671520233154297, + "rewards/margins": 1.3474944829940796, + "rewards/rejected": 1.3240257501602173, + "step": 3973 + }, + { + "epoch": 0.65, + "learning_rate": 7.919328095525446e-06, + "logits/chosen": -1.3887025117874146, + "logits/rejected": -1.2650128602981567, + "logps/chosen": -131.90829467773438, + "logps/rejected": -73.32243347167969, + "loss": 0.2541, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.9803056716918945, + "rewards/margins": 1.9704818725585938, + "rewards/rejected": 4.009823799133301, + "step": 3974 + }, + { + "epoch": 0.65, + "learning_rate": 7.918261017197615e-06, + "logits/chosen": -1.0761452913284302, + "logits/rejected": -1.0666732788085938, + "logps/chosen": -45.971168518066406, + "logps/rejected": -52.34779357910156, + "loss": 0.3952, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4959625005722046, + "rewards/margins": 0.25366735458374023, + "rewards/rejected": 1.2422951459884644, + "step": 3975 + }, + { + "epoch": 0.65, + "learning_rate": 7.917193737245563e-06, + "logits/chosen": -1.0239306688308716, + "logits/rejected": -1.0456875562667847, + "logps/chosen": -88.43941497802734, + "logps/rejected": -106.50665283203125, + "loss": 0.5272, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.397613525390625, + "rewards/margins": -0.4068315029144287, + "rewards/rejected": 2.8044450283050537, + "step": 3976 + }, + { + "epoch": 0.65, + "learning_rate": 7.916126255743024e-06, + "logits/chosen": -1.0991469621658325, + "logits/rejected": -1.0900393724441528, + "logps/chosen": -43.81465148925781, + "logps/rejected": -59.99817657470703, + "loss": 0.3507, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8646904230117798, + "rewards/margins": 0.032700300216674805, + "rewards/rejected": 1.831990122795105, + "step": 3977 + }, + { + "epoch": 0.65, + "learning_rate": 7.915058572763757e-06, + "logits/chosen": -0.6746648550033569, + "logits/rejected": -0.6728711128234863, + "logps/chosen": -1.7267448902130127, + "logps/rejected": -1.4529718160629272, + "loss": 0.3763, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.41672953963279724, + "rewards/margins": -0.010800808668136597, + "rewards/rejected": 0.42753034830093384, + "step": 3978 + }, + { + "epoch": 0.65, + "learning_rate": 7.913990688381523e-06, + "logits/chosen": -1.148300051689148, + "logits/rejected": -1.1609649658203125, + "logps/chosen": -122.96234130859375, + "logps/rejected": -128.33094787597656, + "loss": 0.5747, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3758392333984375, + "rewards/margins": 0.7843841314315796, + "rewards/rejected": 1.591455101966858, + "step": 3979 + }, + { + "epoch": 0.65, + "learning_rate": 7.912922602670105e-06, + "logits/chosen": -1.1846481561660767, + "logits/rejected": -1.2079718112945557, + "logps/chosen": -143.04776000976562, + "logps/rejected": -88.69845581054688, + "loss": 0.1134, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.849359035491943, + "rewards/margins": 1.5354399681091309, + "rewards/rejected": 3.3139190673828125, + "step": 3980 + }, + { + "epoch": 0.65, + "learning_rate": 7.911854315703298e-06, + "logits/chosen": -0.8217802047729492, + "logits/rejected": -0.7776433229446411, + "logps/chosen": -209.72088623046875, + "logps/rejected": -42.88807678222656, + "loss": 1.6912, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.827597141265869, + "rewards/margins": 5.641010284423828, + "rewards/rejected": 1.1865867376327515, + "step": 3981 + }, + { + "epoch": 0.65, + "learning_rate": 7.910785827554909e-06, + "logits/chosen": -1.4001294374465942, + "logits/rejected": -1.4893288612365723, + "logps/chosen": -132.65696716308594, + "logps/rejected": -176.3135986328125, + "loss": 0.3653, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.503318786621094, + "rewards/margins": 0.02718353271484375, + "rewards/rejected": 5.47613525390625, + "step": 3982 + }, + { + "epoch": 0.65, + "learning_rate": 7.909717138298762e-06, + "logits/chosen": -0.7874782681465149, + "logits/rejected": -0.7835392355918884, + "logps/chosen": -7.048890113830566, + "logps/rejected": -2.2672019004821777, + "loss": 0.7323, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5687403082847595, + "rewards/margins": -0.1509607434272766, + "rewards/rejected": 0.7197010517120361, + "step": 3983 + }, + { + "epoch": 0.65, + "learning_rate": 7.908648248008693e-06, + "logits/chosen": -0.9172347784042358, + "logits/rejected": -0.9189088344573975, + "logps/chosen": -78.01631164550781, + "logps/rejected": -103.27376556396484, + "loss": 1.6439, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.515576124191284, + "rewards/margins": -2.4291388988494873, + "rewards/rejected": 4.9447150230407715, + "step": 3984 + }, + { + "epoch": 0.65, + "learning_rate": 7.90757915675855e-06, + "logits/chosen": -1.2954940795898438, + "logits/rejected": -1.3347045183181763, + "logps/chosen": -65.26841735839844, + "logps/rejected": -215.13082885742188, + "loss": 3.2008, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1654975414276123, + "rewards/margins": -6.387090682983398, + "rewards/rejected": 9.55258846282959, + "step": 3985 + }, + { + "epoch": 0.65, + "learning_rate": 7.906509864622202e-06, + "logits/chosen": -0.6153544187545776, + "logits/rejected": -0.6148954033851624, + "logps/chosen": -3.6837198734283447, + "logps/rejected": -1.9175012111663818, + "loss": 0.6363, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.22010424733161926, + "rewards/margins": -0.13489580154418945, + "rewards/rejected": 0.3550000488758087, + "step": 3986 + }, + { + "epoch": 0.65, + "learning_rate": 7.905440371673522e-06, + "logits/chosen": -0.9250564575195312, + "logits/rejected": -0.824122428894043, + "logps/chosen": -62.65927505493164, + "logps/rejected": -39.36707305908203, + "loss": 0.3633, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7222301959991455, + "rewards/margins": 1.0458812713623047, + "rewards/rejected": 1.6763489246368408, + "step": 3987 + }, + { + "epoch": 0.65, + "learning_rate": 7.904370677986404e-06, + "logits/chosen": -0.9358455538749695, + "logits/rejected": -0.9625266790390015, + "logps/chosen": -58.10602951049805, + "logps/rejected": -100.72994232177734, + "loss": 0.4444, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2305790185928345, + "rewards/margins": 0.42089349031448364, + "rewards/rejected": 0.8096855282783508, + "step": 3988 + }, + { + "epoch": 0.65, + "learning_rate": 7.903300783634755e-06, + "logits/chosen": -1.3064072132110596, + "logits/rejected": -1.2211976051330566, + "logps/chosen": -74.11077880859375, + "logps/rejected": -23.89679527282715, + "loss": 0.1284, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9905388355255127, + "rewards/margins": 1.281433343887329, + "rewards/rejected": 0.7091054916381836, + "step": 3989 + }, + { + "epoch": 0.65, + "learning_rate": 7.902230688692492e-06, + "logits/chosen": -1.1759086847305298, + "logits/rejected": -1.1734957695007324, + "logps/chosen": -56.32465744018555, + "logps/rejected": -47.711082458496094, + "loss": 0.4066, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.645090103149414, + "rewards/margins": 0.442518949508667, + "rewards/rejected": 1.202571153640747, + "step": 3990 + }, + { + "epoch": 0.65, + "learning_rate": 7.90116039323355e-06, + "logits/chosen": -0.9398636817932129, + "logits/rejected": -0.8900417685508728, + "logps/chosen": -38.607215881347656, + "logps/rejected": -76.64940643310547, + "loss": 0.5881, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.025781273841858, + "rewards/margins": -0.757550835609436, + "rewards/rejected": 1.783332109451294, + "step": 3991 + }, + { + "epoch": 0.65, + "learning_rate": 7.900089897331875e-06, + "logits/chosen": -1.0258054733276367, + "logits/rejected": -0.9839504361152649, + "logps/chosen": -60.24578857421875, + "logps/rejected": -80.02466583251953, + "loss": 0.526, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.14019775390625, + "rewards/margins": -0.5367240905761719, + "rewards/rejected": 2.676921844482422, + "step": 3992 + }, + { + "epoch": 0.65, + "learning_rate": 7.89901920106143e-06, + "logits/chosen": -0.4908847212791443, + "logits/rejected": -0.3880205750465393, + "logps/chosen": -47.81897735595703, + "logps/rejected": -1.2438048124313354, + "loss": 0.5464, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5330169796943665, + "rewards/margins": 0.09010478854179382, + "rewards/rejected": 0.44291219115257263, + "step": 3993 + }, + { + "epoch": 0.65, + "learning_rate": 7.897948304496189e-06, + "logits/chosen": -0.9982830882072449, + "logits/rejected": -0.9982830882072449, + "logps/chosen": -108.55072021484375, + "logps/rejected": -108.55072021484375, + "loss": 0.368, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.488241672515869, + "rewards/margins": 0.0, + "rewards/rejected": 4.488241672515869, + "step": 3994 + }, + { + "epoch": 0.65, + "learning_rate": 7.89687720771014e-06, + "logits/chosen": -1.169638752937317, + "logits/rejected": -1.1316049098968506, + "logps/chosen": -69.11711883544922, + "logps/rejected": -55.23875427246094, + "loss": 0.1581, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7929633855819702, + "rewards/margins": 1.4755332469940186, + "rewards/rejected": 0.3174301087856293, + "step": 3995 + }, + { + "epoch": 0.65, + "learning_rate": 7.895805910777288e-06, + "logits/chosen": -1.2134116888046265, + "logits/rejected": -1.3084909915924072, + "logps/chosen": -100.9415283203125, + "logps/rejected": -128.1461639404297, + "loss": 2.153, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9295578002929688, + "rewards/margins": -4.2919769287109375, + "rewards/rejected": 7.221534729003906, + "step": 3996 + }, + { + "epoch": 0.65, + "learning_rate": 7.894734413771646e-06, + "logits/chosen": -0.786307692527771, + "logits/rejected": -0.8093978762626648, + "logps/chosen": -1.4224721193313599, + "logps/rejected": -30.765945434570312, + "loss": 1.0466, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2501469552516937, + "rewards/margins": -0.450959712266922, + "rewards/rejected": 0.7011066675186157, + "step": 3997 + }, + { + "epoch": 0.65, + "learning_rate": 7.893662716767247e-06, + "logits/chosen": -0.9701523184776306, + "logits/rejected": -0.9627497792243958, + "logps/chosen": -48.93484878540039, + "logps/rejected": -95.11053466796875, + "loss": 0.1939, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3842358589172363, + "rewards/margins": 0.7685314416885376, + "rewards/rejected": 1.6157044172286987, + "step": 3998 + }, + { + "epoch": 0.65, + "learning_rate": 7.892590819838135e-06, + "logits/chosen": -1.0587102174758911, + "logits/rejected": -0.9066492319107056, + "logps/chosen": -79.37078857421875, + "logps/rejected": -84.60700225830078, + "loss": 2.3205, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9553269147872925, + "rewards/margins": -1.4906119108200073, + "rewards/rejected": 3.4459388256073, + "step": 3999 + }, + { + "epoch": 0.65, + "learning_rate": 7.891518723058367e-06, + "logits/chosen": -1.1545889377593994, + "logits/rejected": -1.1308907270431519, + "logps/chosen": -137.70599365234375, + "logps/rejected": -71.94020080566406, + "loss": 0.9107, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8093109130859375, + "rewards/margins": 0.8000259399414062, + "rewards/rejected": 1.0092849731445312, + "step": 4000 + }, + { + "epoch": 0.65, + "learning_rate": 7.890446426502015e-06, + "logits/chosen": -0.651064395904541, + "logits/rejected": -0.6320760846138, + "logps/chosen": -83.7306137084961, + "logps/rejected": -46.074222564697266, + "loss": 0.6397, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2095222473144531, + "rewards/margins": -0.20104563236236572, + "rewards/rejected": 1.4105678796768188, + "step": 4001 + }, + { + "epoch": 0.65, + "learning_rate": 7.889373930243166e-06, + "logits/chosen": -1.1201763153076172, + "logits/rejected": -0.9383255243301392, + "logps/chosen": -59.254249572753906, + "logps/rejected": -29.268491744995117, + "loss": 0.0669, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.325826406478882, + "rewards/margins": 2.159369707107544, + "rewards/rejected": 0.16645680367946625, + "step": 4002 + }, + { + "epoch": 0.65, + "learning_rate": 7.888301234355915e-06, + "logits/chosen": -0.9860442280769348, + "logits/rejected": -0.9581464529037476, + "logps/chosen": -52.609352111816406, + "logps/rejected": -51.75501251220703, + "loss": 1.349, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.98466956615448, + "rewards/margins": -0.09048998355865479, + "rewards/rejected": 2.0751595497131348, + "step": 4003 + }, + { + "epoch": 0.65, + "learning_rate": 7.887228338914379e-06, + "logits/chosen": -0.8630712032318115, + "logits/rejected": -0.8280065059661865, + "logps/chosen": -89.47212219238281, + "logps/rejected": -55.71038818359375, + "loss": 0.1866, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0030105113983154, + "rewards/margins": 0.8229018449783325, + "rewards/rejected": 1.180108666419983, + "step": 4004 + }, + { + "epoch": 0.65, + "learning_rate": 7.886155243992683e-06, + "logits/chosen": -1.2506093978881836, + "logits/rejected": -1.2447649240493774, + "logps/chosen": -73.50450134277344, + "logps/rejected": -45.2688102722168, + "loss": 1.1807, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6799278259277344, + "rewards/margins": -0.6227893829345703, + "rewards/rejected": 2.3027172088623047, + "step": 4005 + }, + { + "epoch": 0.65, + "learning_rate": 7.885081949664971e-06, + "logits/chosen": -1.2245676517486572, + "logits/rejected": -1.2686235904693604, + "logps/chosen": -165.36248779296875, + "logps/rejected": -82.22076416015625, + "loss": 0.115, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.577649116516113, + "rewards/margins": 1.3536224365234375, + "rewards/rejected": 5.224026679992676, + "step": 4006 + }, + { + "epoch": 0.65, + "learning_rate": 7.884008456005394e-06, + "logits/chosen": -1.0569379329681396, + "logits/rejected": -1.0973856449127197, + "logps/chosen": -106.03878784179688, + "logps/rejected": -102.17782592773438, + "loss": 2.8036, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.58447265625, + "rewards/margins": -5.504460334777832, + "rewards/rejected": 8.088932991027832, + "step": 4007 + }, + { + "epoch": 0.65, + "learning_rate": 7.882934763088121e-06, + "logits/chosen": -1.02704918384552, + "logits/rejected": -1.0000859498977661, + "logps/chosen": -125.52409362792969, + "logps/rejected": -88.24967956542969, + "loss": 0.5395, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.281498908996582, + "rewards/margins": 1.174241542816162, + "rewards/rejected": 4.10725736618042, + "step": 4008 + }, + { + "epoch": 0.65, + "learning_rate": 7.881860870987336e-06, + "logits/chosen": -1.0710384845733643, + "logits/rejected": -0.9723858833312988, + "logps/chosen": -90.39692687988281, + "logps/rejected": -70.74198913574219, + "loss": 0.0261, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.710483074188232, + "rewards/margins": 3.343130588531494, + "rewards/rejected": 1.3673523664474487, + "step": 4009 + }, + { + "epoch": 0.65, + "learning_rate": 7.880786779777233e-06, + "logits/chosen": -1.0723826885223389, + "logits/rejected": -1.076828956604004, + "logps/chosen": -42.91851043701172, + "logps/rejected": -114.70643615722656, + "loss": 0.1245, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3626419305801392, + "rewards/margins": 1.331990122795105, + "rewards/rejected": 0.03065185621380806, + "step": 4010 + }, + { + "epoch": 0.65, + "learning_rate": 7.87971248953202e-06, + "logits/chosen": -1.1052976846694946, + "logits/rejected": -1.1261179447174072, + "logps/chosen": -68.8996353149414, + "logps/rejected": -88.45738220214844, + "loss": 0.4317, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8827149868011475, + "rewards/margins": -0.3056907653808594, + "rewards/rejected": 3.188405752182007, + "step": 4011 + }, + { + "epoch": 0.65, + "learning_rate": 7.878638000325925e-06, + "logits/chosen": -0.5784704089164734, + "logits/rejected": -0.5804036855697632, + "logps/chosen": -10.263962745666504, + "logps/rejected": -1.7332788705825806, + "loss": 0.3399, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3531312048435211, + "rewards/margins": 0.07072272896766663, + "rewards/rejected": 0.2824084758758545, + "step": 4012 + }, + { + "epoch": 0.65, + "learning_rate": 7.87756331223318e-06, + "logits/chosen": -1.1968997716903687, + "logits/rejected": -1.200325608253479, + "logps/chosen": -66.64387512207031, + "logps/rejected": -56.463111877441406, + "loss": 0.6408, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6535156965255737, + "rewards/margins": -0.9076255559921265, + "rewards/rejected": 2.5611412525177, + "step": 4013 + }, + { + "epoch": 0.65, + "learning_rate": 7.876488425328037e-06, + "logits/chosen": -1.088342308998108, + "logits/rejected": -1.0920237302780151, + "logps/chosen": -125.61224365234375, + "logps/rejected": -67.81346893310547, + "loss": 0.2732, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.811920166015625, + "rewards/margins": 0.4653282165527344, + "rewards/rejected": 3.3465919494628906, + "step": 4014 + }, + { + "epoch": 0.65, + "learning_rate": 7.875413339684764e-06, + "logits/chosen": -1.5799996852874756, + "logits/rejected": -1.6534184217453003, + "logps/chosen": -193.25169372558594, + "logps/rejected": -208.1243896484375, + "loss": 2.3303, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.3376054763793945, + "rewards/margins": -1.2438583374023438, + "rewards/rejected": 8.581463813781738, + "step": 4015 + }, + { + "epoch": 0.65, + "learning_rate": 7.874338055377634e-06, + "logits/chosen": -1.0068267583847046, + "logits/rejected": -1.0037869215011597, + "logps/chosen": -82.09414672851562, + "logps/rejected": -128.61993408203125, + "loss": 0.3006, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4798431396484375, + "rewards/margins": 0.1958785057067871, + "rewards/rejected": 2.2839646339416504, + "step": 4016 + }, + { + "epoch": 0.65, + "learning_rate": 7.873262572480943e-06, + "logits/chosen": -1.137944221496582, + "logits/rejected": -1.1713309288024902, + "logps/chosen": -69.90765380859375, + "logps/rejected": -104.0510482788086, + "loss": 0.8676, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.690333604812622, + "rewards/margins": -1.1307549476623535, + "rewards/rejected": 2.8210885524749756, + "step": 4017 + }, + { + "epoch": 0.65, + "learning_rate": 7.872186891068997e-06, + "logits/chosen": -1.0616323947906494, + "logits/rejected": -1.1153011322021484, + "logps/chosen": -41.93207550048828, + "logps/rejected": -73.24039459228516, + "loss": 0.5168, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4655334949493408, + "rewards/margins": -0.22704088687896729, + "rewards/rejected": 1.692574381828308, + "step": 4018 + }, + { + "epoch": 0.65, + "learning_rate": 7.87111101121611e-06, + "logits/chosen": -0.6276582479476929, + "logits/rejected": -0.6346274614334106, + "logps/chosen": -1.4391976594924927, + "logps/rejected": -2.719303607940674, + "loss": 0.4147, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3054150640964508, + "rewards/margins": 0.01862207055091858, + "rewards/rejected": 0.2867929935455322, + "step": 4019 + }, + { + "epoch": 0.65, + "learning_rate": 7.870034932996622e-06, + "logits/chosen": -0.7320036292076111, + "logits/rejected": -0.756783664226532, + "logps/chosen": -6.376450061798096, + "logps/rejected": -35.60542678833008, + "loss": 0.734, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30874237418174744, + "rewards/margins": 0.12037719786167145, + "rewards/rejected": 0.188365176320076, + "step": 4020 + }, + { + "epoch": 0.65, + "learning_rate": 7.868958656484875e-06, + "logits/chosen": -1.0820554494857788, + "logits/rejected": -1.0293844938278198, + "logps/chosen": -96.84329223632812, + "logps/rejected": -108.53469848632812, + "loss": 0.0739, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.529791355133057, + "rewards/margins": 3.655813694000244, + "rewards/rejected": 2.8739776611328125, + "step": 4021 + }, + { + "epoch": 0.65, + "learning_rate": 7.86788218175523e-06, + "logits/chosen": -1.3072867393493652, + "logits/rejected": -1.349448323249817, + "logps/chosen": -91.75775146484375, + "logps/rejected": -203.2462921142578, + "loss": 2.1739, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4703705310821533, + "rewards/margins": -1.64316725730896, + "rewards/rejected": 5.113537788391113, + "step": 4022 + }, + { + "epoch": 0.65, + "learning_rate": 7.866805508882064e-06, + "logits/chosen": -1.016018033027649, + "logits/rejected": -1.049007773399353, + "logps/chosen": -178.94195556640625, + "logps/rejected": -87.35090637207031, + "loss": 1.6617, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.564401149749756, + "rewards/margins": 2.7611889839172363, + "rewards/rejected": 1.80321204662323, + "step": 4023 + }, + { + "epoch": 0.65, + "learning_rate": 7.865728637939764e-06, + "logits/chosen": -1.0907758474349976, + "logits/rejected": -1.124819040298462, + "logps/chosen": -79.54132080078125, + "logps/rejected": -79.78909301757812, + "loss": 0.3366, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4835762977600098, + "rewards/margins": 0.38026881217956543, + "rewards/rejected": 2.1033074855804443, + "step": 4024 + }, + { + "epoch": 0.65, + "learning_rate": 7.864651569002731e-06, + "logits/chosen": -0.9325492978096008, + "logits/rejected": -0.9325492978096008, + "logps/chosen": -82.66294860839844, + "logps/rejected": -82.66294860839844, + "loss": 0.3504, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0717437267303467, + "rewards/margins": 0.0, + "rewards/rejected": 2.0717437267303467, + "step": 4025 + }, + { + "epoch": 0.65, + "learning_rate": 7.86357430214538e-06, + "logits/chosen": -1.0872435569763184, + "logits/rejected": -1.0750205516815186, + "logps/chosen": -73.39093780517578, + "logps/rejected": -63.89896774291992, + "loss": 0.3045, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.266050100326538, + "rewards/margins": 0.8898640871047974, + "rewards/rejected": 1.3761860132217407, + "step": 4026 + }, + { + "epoch": 0.65, + "learning_rate": 7.86249683744214e-06, + "logits/chosen": -1.0453635454177856, + "logits/rejected": -0.9473844766616821, + "logps/chosen": -95.58658599853516, + "logps/rejected": -40.36785125732422, + "loss": 0.039, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3435966968536377, + "rewards/margins": 2.647052049636841, + "rewards/rejected": 0.6965446472167969, + "step": 4027 + }, + { + "epoch": 0.65, + "learning_rate": 7.861419174967452e-06, + "logits/chosen": -1.0042616128921509, + "logits/rejected": -1.0018688440322876, + "logps/chosen": -53.963531494140625, + "logps/rejected": -56.70047378540039, + "loss": 1.0991, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5999497175216675, + "rewards/margins": 0.9272171854972839, + "rewards/rejected": 0.6727325320243835, + "step": 4028 + }, + { + "epoch": 0.65, + "learning_rate": 7.860341314795775e-06, + "logits/chosen": -0.934670627117157, + "logits/rejected": -0.9722179770469666, + "logps/chosen": -111.70875549316406, + "logps/rejected": -89.34664916992188, + "loss": 2.2382, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.566418409347534, + "rewards/margins": -4.202447891235352, + "rewards/rejected": 6.768866062164307, + "step": 4029 + }, + { + "epoch": 0.65, + "learning_rate": 7.859263257001578e-06, + "logits/chosen": -1.072582721710205, + "logits/rejected": -1.0507392883300781, + "logps/chosen": -89.59664916992188, + "logps/rejected": -52.99660110473633, + "loss": 0.0459, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.481305122375488, + "rewards/margins": 2.3536367416381836, + "rewards/rejected": 3.1276683807373047, + "step": 4030 + }, + { + "epoch": 0.65, + "learning_rate": 7.858185001659344e-06, + "logits/chosen": -1.1480810642242432, + "logits/rejected": -1.1229724884033203, + "logps/chosen": -58.3261833190918, + "logps/rejected": -47.502708435058594, + "loss": 0.45, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.952487587928772, + "rewards/margins": 0.1051098108291626, + "rewards/rejected": 1.8473777770996094, + "step": 4031 + }, + { + "epoch": 0.65, + "learning_rate": 7.857106548843571e-06, + "logits/chosen": -1.2958686351776123, + "logits/rejected": -1.3062219619750977, + "logps/chosen": -218.233642578125, + "logps/rejected": -94.24214172363281, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.211669921875, + "rewards/margins": 4.246926784515381, + "rewards/rejected": 1.9647430181503296, + "step": 4032 + }, + { + "epoch": 0.65, + "learning_rate": 7.856027898628771e-06, + "logits/chosen": -1.4161568880081177, + "logits/rejected": -1.428505778312683, + "logps/chosen": -49.723777770996094, + "logps/rejected": -68.83056640625, + "loss": 0.4324, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.091604709625244, + "rewards/margins": 0.11157691478729248, + "rewards/rejected": 1.9800277948379517, + "step": 4033 + }, + { + "epoch": 0.65, + "learning_rate": 7.854949051089467e-06, + "logits/chosen": -1.0011425018310547, + "logits/rejected": -0.9932675957679749, + "logps/chosen": -31.068204879760742, + "logps/rejected": -13.77759838104248, + "loss": 0.8491, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7914045453071594, + "rewards/margins": 0.13982439041137695, + "rewards/rejected": 0.6515801548957825, + "step": 4034 + }, + { + "epoch": 0.65, + "learning_rate": 7.853870006300196e-06, + "logits/chosen": -1.2241368293762207, + "logits/rejected": -1.234954595565796, + "logps/chosen": -192.98443603515625, + "logps/rejected": -87.5329818725586, + "loss": 0.5459, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.857837200164795, + "rewards/margins": 1.3743281364440918, + "rewards/rejected": 4.483509063720703, + "step": 4035 + }, + { + "epoch": 0.66, + "learning_rate": 7.852790764335511e-06, + "logits/chosen": -0.888683557510376, + "logits/rejected": -0.9356359243392944, + "logps/chosen": -55.742530822753906, + "logps/rejected": -68.63768768310547, + "loss": 1.445, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2176384925842285, + "rewards/margins": 0.04357767105102539, + "rewards/rejected": 2.174060821533203, + "step": 4036 + }, + { + "epoch": 0.66, + "learning_rate": 7.851711325269979e-06, + "logits/chosen": -1.1359913349151611, + "logits/rejected": -1.147628664970398, + "logps/chosen": -89.98170471191406, + "logps/rejected": -75.30355072021484, + "loss": 0.42, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.933271884918213, + "rewards/margins": 0.15999293327331543, + "rewards/rejected": 3.7732789516448975, + "step": 4037 + }, + { + "epoch": 0.66, + "learning_rate": 7.850631689178177e-06, + "logits/chosen": -0.7226632833480835, + "logits/rejected": -0.6389182209968567, + "logps/chosen": -76.06024169921875, + "logps/rejected": -80.80813598632812, + "loss": 1.6465, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7015488147735596, + "rewards/margins": -0.11483383178710938, + "rewards/rejected": 2.816382646560669, + "step": 4038 + }, + { + "epoch": 0.66, + "learning_rate": 7.849551856134699e-06, + "logits/chosen": -1.0029664039611816, + "logits/rejected": -0.883062481880188, + "logps/chosen": -41.33472442626953, + "logps/rejected": -13.323282241821289, + "loss": 0.8594, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6056755781173706, + "rewards/margins": 0.7645224332809448, + "rewards/rejected": 0.8411531448364258, + "step": 4039 + }, + { + "epoch": 0.66, + "learning_rate": 7.848471826214148e-06, + "logits/chosen": -1.2057626247406006, + "logits/rejected": -1.181996464729309, + "logps/chosen": -52.23507308959961, + "logps/rejected": -96.54402923583984, + "loss": 2.6884, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6546810865402222, + "rewards/margins": 0.09883928298950195, + "rewards/rejected": 1.5558418035507202, + "step": 4040 + }, + { + "epoch": 0.66, + "learning_rate": 7.847391599491147e-06, + "logits/chosen": -0.4900146424770355, + "logits/rejected": -0.5439905524253845, + "logps/chosen": -17.135934829711914, + "logps/rejected": -26.54095458984375, + "loss": 0.4254, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.680777370929718, + "rewards/margins": 0.06272584199905396, + "rewards/rejected": 0.6180515289306641, + "step": 4041 + }, + { + "epoch": 0.66, + "learning_rate": 7.846311176040331e-06, + "logits/chosen": -1.0700929164886475, + "logits/rejected": -0.995759904384613, + "logps/chosen": -46.35259246826172, + "logps/rejected": -52.459312438964844, + "loss": 0.3976, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6006065607070923, + "rewards/margins": -0.03690493106842041, + "rewards/rejected": 1.6375114917755127, + "step": 4042 + }, + { + "epoch": 0.66, + "learning_rate": 7.845230555936342e-06, + "logits/chosen": -0.9122933745384216, + "logits/rejected": -0.9586975574493408, + "logps/chosen": -368.4774169921875, + "logps/rejected": -97.54603576660156, + "loss": 0.0731, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.532769680023193, + "rewards/margins": 2.2971084117889404, + "rewards/rejected": 3.235661268234253, + "step": 4043 + }, + { + "epoch": 0.66, + "learning_rate": 7.844149739253845e-06, + "logits/chosen": -1.1937522888183594, + "logits/rejected": -1.075943946838379, + "logps/chosen": -67.26590728759766, + "logps/rejected": -15.522408485412598, + "loss": 0.7147, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7903175354003906, + "rewards/margins": 1.1137244701385498, + "rewards/rejected": 0.6765931248664856, + "step": 4044 + }, + { + "epoch": 0.66, + "learning_rate": 7.843068726067513e-06, + "logits/chosen": -0.883307933807373, + "logits/rejected": -0.9103922247886658, + "logps/chosen": -47.04907989501953, + "logps/rejected": -50.483821868896484, + "loss": 0.9827, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9997280240058899, + "rewards/margins": -0.028889477252960205, + "rewards/rejected": 1.02861750125885, + "step": 4045 + }, + { + "epoch": 0.66, + "learning_rate": 7.841987516452032e-06, + "logits/chosen": -1.1577956676483154, + "logits/rejected": -0.8255302309989929, + "logps/chosen": -133.8873748779297, + "logps/rejected": -95.692626953125, + "loss": 0.3528, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.5280656814575195, + "rewards/margins": 1.0469927787780762, + "rewards/rejected": 4.481072902679443, + "step": 4046 + }, + { + "epoch": 0.66, + "learning_rate": 7.840906110482107e-06, + "logits/chosen": -0.737311840057373, + "logits/rejected": -0.7424339652061462, + "logps/chosen": -32.61629104614258, + "logps/rejected": -33.39934539794922, + "loss": 0.4903, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2956902980804443, + "rewards/margins": -0.4527456760406494, + "rewards/rejected": 2.7484359741210938, + "step": 4047 + }, + { + "epoch": 0.66, + "learning_rate": 7.839824508232448e-06, + "logits/chosen": -0.685705840587616, + "logits/rejected": -0.6742202043533325, + "logps/chosen": -80.72845458984375, + "logps/rejected": -98.70069122314453, + "loss": 1.0113, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5826950073242188, + "rewards/margins": -1.860257625579834, + "rewards/rejected": 4.442952632904053, + "step": 4048 + }, + { + "epoch": 0.66, + "learning_rate": 7.838742709777789e-06, + "logits/chosen": -0.8526175022125244, + "logits/rejected": -0.8020020127296448, + "logps/chosen": -86.71229553222656, + "logps/rejected": -57.05078887939453, + "loss": 0.2936, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.054139614105225, + "rewards/margins": 2.2059950828552246, + "rewards/rejected": 2.84814453125, + "step": 4049 + }, + { + "epoch": 0.66, + "learning_rate": 7.837660715192867e-06, + "logits/chosen": -1.2831885814666748, + "logits/rejected": -1.106587529182434, + "logps/chosen": -121.27436828613281, + "logps/rejected": -75.2961654663086, + "loss": 0.8785, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.822035312652588, + "rewards/margins": 1.0589455366134644, + "rewards/rejected": 1.7630897760391235, + "step": 4050 + }, + { + "epoch": 0.66, + "learning_rate": 7.83657852455244e-06, + "logits/chosen": -1.1685818433761597, + "logits/rejected": -1.1670938730239868, + "logps/chosen": -53.39957046508789, + "logps/rejected": -133.83348083496094, + "loss": 0.5198, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.774081826210022, + "rewards/margins": 0.1351490020751953, + "rewards/rejected": 1.6389328241348267, + "step": 4051 + }, + { + "epoch": 0.66, + "learning_rate": 7.835496137931278e-06, + "logits/chosen": -0.8863881230354309, + "logits/rejected": -0.9119289517402649, + "logps/chosen": -51.304229736328125, + "logps/rejected": -81.56987762451172, + "loss": 0.4772, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.450958251953125, + "rewards/margins": 1.2084189653396606, + "rewards/rejected": 1.2425392866134644, + "step": 4052 + }, + { + "epoch": 0.66, + "learning_rate": 7.834413555404162e-06, + "logits/chosen": -0.7665000557899475, + "logits/rejected": -0.7665000557899475, + "logps/chosen": -73.79224395751953, + "logps/rejected": -73.79224395751953, + "loss": 0.4582, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0874733924865723, + "rewards/margins": 0.0, + "rewards/rejected": 3.0874733924865723, + "step": 4053 + }, + { + "epoch": 0.66, + "learning_rate": 7.833330777045886e-06, + "logits/chosen": -0.9058898687362671, + "logits/rejected": -0.7803190350532532, + "logps/chosen": -65.97109985351562, + "logps/rejected": -22.88625717163086, + "loss": 0.9181, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.356146216392517, + "rewards/margins": 0.9172669649124146, + "rewards/rejected": 0.43887922167778015, + "step": 4054 + }, + { + "epoch": 0.66, + "learning_rate": 7.832247802931267e-06, + "logits/chosen": -0.692427933216095, + "logits/rejected": -0.7416743636131287, + "logps/chosen": -59.30980682373047, + "logps/rejected": -52.77790832519531, + "loss": 0.2706, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4352989196777344, + "rewards/margins": 0.4082832336425781, + "rewards/rejected": 2.0270156860351562, + "step": 4055 + }, + { + "epoch": 0.66, + "learning_rate": 7.831164633135123e-06, + "logits/chosen": -0.8627668023109436, + "logits/rejected": -0.8321475386619568, + "logps/chosen": -56.41806411743164, + "logps/rejected": -52.254676818847656, + "loss": 0.8572, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7413337826728821, + "rewards/margins": -0.036083221435546875, + "rewards/rejected": 0.777417004108429, + "step": 4056 + }, + { + "epoch": 0.66, + "learning_rate": 7.83008126773229e-06, + "logits/chosen": -1.0038341283798218, + "logits/rejected": -0.9293395280838013, + "logps/chosen": -128.85047912597656, + "logps/rejected": -107.17281341552734, + "loss": 0.5931, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.278872966766357, + "rewards/margins": -0.8071842193603516, + "rewards/rejected": 7.086057186126709, + "step": 4057 + }, + { + "epoch": 0.66, + "learning_rate": 7.82899770679762e-06, + "logits/chosen": -0.9777312278747559, + "logits/rejected": -0.998124361038208, + "logps/chosen": -103.30352783203125, + "logps/rejected": -65.20161437988281, + "loss": 0.4752, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0988495349884033, + "rewards/margins": -0.37053680419921875, + "rewards/rejected": 3.469386339187622, + "step": 4058 + }, + { + "epoch": 0.66, + "learning_rate": 7.827913950405977e-06, + "logits/chosen": -1.1868480443954468, + "logits/rejected": -1.1868480443954468, + "logps/chosen": -53.103546142578125, + "logps/rejected": -53.103546142578125, + "loss": 0.3553, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.037493944168091, + "rewards/margins": 0.0, + "rewards/rejected": 3.037493944168091, + "step": 4059 + }, + { + "epoch": 0.66, + "learning_rate": 7.826829998632236e-06, + "logits/chosen": -1.1724224090576172, + "logits/rejected": -1.1275070905685425, + "logps/chosen": -81.33560180664062, + "logps/rejected": -74.39097595214844, + "loss": 1.3314, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.157958984375, + "rewards/margins": -1.8286943435668945, + "rewards/rejected": 4.9866533279418945, + "step": 4060 + }, + { + "epoch": 0.66, + "learning_rate": 7.825745851551294e-06, + "logits/chosen": -0.9805187582969666, + "logits/rejected": -0.9842492341995239, + "logps/chosen": -13.204684257507324, + "logps/rejected": -15.579988479614258, + "loss": 0.6679, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.169291377067566, + "rewards/margins": 0.10376882553100586, + "rewards/rejected": 1.06552255153656, + "step": 4061 + }, + { + "epoch": 0.66, + "learning_rate": 7.824661509238049e-06, + "logits/chosen": -1.1751809120178223, + "logits/rejected": -1.2641282081604004, + "logps/chosen": -72.70860290527344, + "logps/rejected": -92.43914031982422, + "loss": 1.5426, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6005616188049316, + "rewards/margins": -2.966320037841797, + "rewards/rejected": 5.5668816566467285, + "step": 4062 + }, + { + "epoch": 0.66, + "learning_rate": 7.82357697176742e-06, + "logits/chosen": -0.980278730392456, + "logits/rejected": -0.980278730392456, + "logps/chosen": -1.2224242687225342, + "logps/rejected": -1.2224242687225342, + "loss": 0.6434, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2241591215133667, + "rewards/margins": 0.0, + "rewards/rejected": 0.2241591215133667, + "step": 4063 + }, + { + "epoch": 0.66, + "learning_rate": 7.82249223921434e-06, + "logits/chosen": -0.9193297028541565, + "logits/rejected": -0.8785491585731506, + "logps/chosen": -19.388057708740234, + "logps/rejected": -18.86532211303711, + "loss": 1.8174, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.186113715171814, + "rewards/margins": 0.10521578788757324, + "rewards/rejected": 1.0808979272842407, + "step": 4064 + }, + { + "epoch": 0.66, + "learning_rate": 7.821407311653752e-06, + "logits/chosen": -1.102586269378662, + "logits/rejected": -1.1399091482162476, + "logps/chosen": -146.61734008789062, + "logps/rejected": -90.49504089355469, + "loss": 0.1783, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.443145751953125, + "rewards/margins": 0.8587632179260254, + "rewards/rejected": 4.5843825340271, + "step": 4065 + }, + { + "epoch": 0.66, + "learning_rate": 7.820322189160618e-06, + "logits/chosen": -1.2635215520858765, + "logits/rejected": -1.2498126029968262, + "logps/chosen": -91.53981018066406, + "logps/rejected": -82.20025634765625, + "loss": 0.2531, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.159370422363281, + "rewards/margins": 0.5701072216033936, + "rewards/rejected": 3.5892632007598877, + "step": 4066 + }, + { + "epoch": 0.66, + "learning_rate": 7.819236871809904e-06, + "logits/chosen": -0.6940708756446838, + "logits/rejected": -0.6427006721496582, + "logps/chosen": -48.406288146972656, + "logps/rejected": -42.325313568115234, + "loss": 1.4714, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.428804874420166, + "rewards/margins": 0.6418229341506958, + "rewards/rejected": 1.7869819402694702, + "step": 4067 + }, + { + "epoch": 0.66, + "learning_rate": 7.8181513596766e-06, + "logits/chosen": -1.0845061540603638, + "logits/rejected": -1.0928350687026978, + "logps/chosen": -94.29855346679688, + "logps/rejected": -111.25743103027344, + "loss": 0.2636, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5280702114105225, + "rewards/margins": 0.5375199317932129, + "rewards/rejected": 2.9905502796173096, + "step": 4068 + }, + { + "epoch": 0.66, + "learning_rate": 7.8170656528357e-06, + "logits/chosen": -0.9605954885482788, + "logits/rejected": -0.9790159463882446, + "logps/chosen": -115.20133972167969, + "logps/rejected": -76.36128234863281, + "loss": 1.0277, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8765275478363037, + "rewards/margins": -0.1786360740661621, + "rewards/rejected": 3.055163621902466, + "step": 4069 + }, + { + "epoch": 0.66, + "learning_rate": 7.815979751362221e-06, + "logits/chosen": -1.0930476188659668, + "logits/rejected": -1.1894729137420654, + "logps/chosen": -76.98101806640625, + "logps/rejected": -88.73210144042969, + "loss": 1.8801, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9968308210372925, + "rewards/margins": -3.6034650802612305, + "rewards/rejected": 5.6002960205078125, + "step": 4070 + }, + { + "epoch": 0.66, + "learning_rate": 7.814893655331186e-06, + "logits/chosen": -0.6774574518203735, + "logits/rejected": -0.6685897707939148, + "logps/chosen": -3.894944667816162, + "logps/rejected": -6.90199613571167, + "loss": 0.474, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3149454593658447, + "rewards/margins": 0.2042146623134613, + "rewards/rejected": 0.11073078960180283, + "step": 4071 + }, + { + "epoch": 0.66, + "learning_rate": 7.813807364817635e-06, + "logits/chosen": -1.2438730001449585, + "logits/rejected": -0.8929877877235413, + "logps/chosen": -59.250675201416016, + "logps/rejected": -103.32090759277344, + "loss": 0.17, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.735375881195068, + "rewards/margins": 1.164860486984253, + "rewards/rejected": 3.5705153942108154, + "step": 4072 + }, + { + "epoch": 0.66, + "learning_rate": 7.812720879896616e-06, + "logits/chosen": -1.022221565246582, + "logits/rejected": -1.0239434242248535, + "logps/chosen": -53.7152214050293, + "logps/rejected": -79.82347106933594, + "loss": 0.6596, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8111011981964111, + "rewards/margins": -0.011478781700134277, + "rewards/rejected": 1.8225799798965454, + "step": 4073 + }, + { + "epoch": 0.66, + "learning_rate": 7.811634200643202e-06, + "logits/chosen": -0.6715745329856873, + "logits/rejected": -0.6354051828384399, + "logps/chosen": -22.82661247253418, + "logps/rejected": -3.3872222900390625, + "loss": 0.3499, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9738821387290955, + "rewards/margins": 0.6119522452354431, + "rewards/rejected": 0.36192989349365234, + "step": 4074 + }, + { + "epoch": 0.66, + "learning_rate": 7.810547327132467e-06, + "logits/chosen": -0.9429360032081604, + "logits/rejected": -0.9172648787498474, + "logps/chosen": -64.18413543701172, + "logps/rejected": -84.62858581542969, + "loss": 0.1073, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8988234996795654, + "rewards/margins": 1.795292615890503, + "rewards/rejected": 1.1035308837890625, + "step": 4075 + }, + { + "epoch": 0.66, + "learning_rate": 7.809460259439506e-06, + "logits/chosen": -1.2774529457092285, + "logits/rejected": -1.3242775201797485, + "logps/chosen": -135.35874938964844, + "logps/rejected": -153.23129272460938, + "loss": 0.0733, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.717903137207031, + "rewards/margins": 1.9875106811523438, + "rewards/rejected": 6.7303924560546875, + "step": 4076 + }, + { + "epoch": 0.66, + "learning_rate": 7.808372997639423e-06, + "logits/chosen": -1.2536840438842773, + "logits/rejected": -1.1842235326766968, + "logps/chosen": -105.20995330810547, + "logps/rejected": -38.75457000732422, + "loss": 0.5914, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4738380908966064, + "rewards/margins": -0.8136932849884033, + "rewards/rejected": 2.2875313758850098, + "step": 4077 + }, + { + "epoch": 0.66, + "learning_rate": 7.807285541807342e-06, + "logits/chosen": -1.0140289068222046, + "logits/rejected": -0.9759604930877686, + "logps/chosen": -57.4286003112793, + "logps/rejected": -73.728759765625, + "loss": 0.9218, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3449971675872803, + "rewards/margins": -0.33414411544799805, + "rewards/rejected": 3.6791412830352783, + "step": 4078 + }, + { + "epoch": 0.66, + "learning_rate": 7.806197892018391e-06, + "logits/chosen": -0.6364582180976868, + "logits/rejected": -0.6774641275405884, + "logps/chosen": -85.3558349609375, + "logps/rejected": -67.58551025390625, + "loss": 1.4883, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.507088541984558, + "rewards/margins": 0.45364463329315186, + "rewards/rejected": 1.0534439086914062, + "step": 4079 + }, + { + "epoch": 0.66, + "learning_rate": 7.805110048347719e-06, + "logits/chosen": -1.1746059656143188, + "logits/rejected": -1.1142276525497437, + "logps/chosen": -75.87882232666016, + "logps/rejected": -58.78776931762695, + "loss": 1.5756, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7018417119979858, + "rewards/margins": -0.025115609169006348, + "rewards/rejected": 1.7269573211669922, + "step": 4080 + }, + { + "epoch": 0.66, + "learning_rate": 7.804022010870484e-06, + "logits/chosen": -1.0168853998184204, + "logits/rejected": -0.7371770143508911, + "logps/chosen": -274.4896545410156, + "logps/rejected": -15.44233512878418, + "loss": 0.0312, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.746762275695801, + "rewards/margins": 3.9162821769714355, + "rewards/rejected": 0.8304800391197205, + "step": 4081 + }, + { + "epoch": 0.66, + "learning_rate": 7.80293377966186e-06, + "logits/chosen": -1.3478537797927856, + "logits/rejected": -1.3676471710205078, + "logps/chosen": -90.88246154785156, + "logps/rejected": -63.53767776489258, + "loss": 0.487, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.615287780761719, + "rewards/margins": 0.007701396942138672, + "rewards/rejected": 4.60758638381958, + "step": 4082 + }, + { + "epoch": 0.66, + "learning_rate": 7.801845354797033e-06, + "logits/chosen": -0.9764230847358704, + "logits/rejected": -0.9836128354072571, + "logps/chosen": -91.31466674804688, + "logps/rejected": -185.64773559570312, + "loss": 0.1609, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.226536512374878, + "rewards/margins": 0.9904067516326904, + "rewards/rejected": 1.2361297607421875, + "step": 4083 + }, + { + "epoch": 0.66, + "learning_rate": 7.800756736351204e-06, + "logits/chosen": -0.8515661358833313, + "logits/rejected": -0.82213294506073, + "logps/chosen": -61.12623596191406, + "logps/rejected": -88.069580078125, + "loss": 0.9653, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7981605529785156, + "rewards/margins": -1.5533103942871094, + "rewards/rejected": 3.351470947265625, + "step": 4084 + }, + { + "epoch": 0.66, + "learning_rate": 7.799667924399585e-06, + "logits/chosen": -1.182690978050232, + "logits/rejected": -1.2201403379440308, + "logps/chosen": -112.70480346679688, + "logps/rejected": -110.69947814941406, + "loss": 1.4244, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.391424655914307, + "rewards/margins": -1.0334882736206055, + "rewards/rejected": 5.424912929534912, + "step": 4085 + }, + { + "epoch": 0.66, + "learning_rate": 7.798578919017404e-06, + "logits/chosen": -1.0310111045837402, + "logits/rejected": -1.0791817903518677, + "logps/chosen": -61.01709747314453, + "logps/rejected": -75.31114196777344, + "loss": 0.7005, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.292860507965088, + "rewards/margins": -0.923590898513794, + "rewards/rejected": 3.216451406478882, + "step": 4086 + }, + { + "epoch": 0.66, + "learning_rate": 7.797489720279899e-06, + "logits/chosen": -0.9406718015670776, + "logits/rejected": -0.9406718015670776, + "logps/chosen": -35.311859130859375, + "logps/rejected": -35.311859130859375, + "loss": 0.4739, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6332054138183594, + "rewards/margins": 0.0, + "rewards/rejected": 1.6332054138183594, + "step": 4087 + }, + { + "epoch": 0.66, + "learning_rate": 7.796400328262325e-06, + "logits/chosen": -1.5925569534301758, + "logits/rejected": -1.5468697547912598, + "logps/chosen": -63.644920349121094, + "logps/rejected": -41.79241943359375, + "loss": 0.1063, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.302668809890747, + "rewards/margins": 1.5192551612854004, + "rewards/rejected": 0.7834137082099915, + "step": 4088 + }, + { + "epoch": 0.66, + "learning_rate": 7.795310743039948e-06, + "logits/chosen": -0.9850870370864868, + "logits/rejected": -0.9643687605857849, + "logps/chosen": -22.65323829650879, + "logps/rejected": -44.62220001220703, + "loss": 0.7473, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.100583076477051, + "rewards/margins": -0.20599818229675293, + "rewards/rejected": 2.3065812587738037, + "step": 4089 + }, + { + "epoch": 0.66, + "learning_rate": 7.794220964688048e-06, + "logits/chosen": -1.2601803541183472, + "logits/rejected": -1.255416989326477, + "logps/chosen": -43.96989059448242, + "logps/rejected": -54.60555648803711, + "loss": 0.9744, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1084446907043457, + "rewards/margins": -0.7369856834411621, + "rewards/rejected": 2.845430374145508, + "step": 4090 + }, + { + "epoch": 0.66, + "learning_rate": 7.793130993281919e-06, + "logits/chosen": -1.492962121963501, + "logits/rejected": -1.3980612754821777, + "logps/chosen": -95.09785461425781, + "logps/rejected": -202.23468017578125, + "loss": 1.2186, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1874077320098877, + "rewards/margins": -2.240821123123169, + "rewards/rejected": 4.428228855133057, + "step": 4091 + }, + { + "epoch": 0.66, + "learning_rate": 7.792040828896868e-06, + "logits/chosen": -1.092045545578003, + "logits/rejected": -1.1314630508422852, + "logps/chosen": -82.780029296875, + "logps/rejected": -75.06718444824219, + "loss": 0.4719, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.516361951828003, + "rewards/margins": 0.6235227584838867, + "rewards/rejected": 2.892839193344116, + "step": 4092 + }, + { + "epoch": 0.66, + "learning_rate": 7.790950471608211e-06, + "logits/chosen": -1.2308369874954224, + "logits/rejected": -1.1518712043762207, + "logps/chosen": -180.6039276123047, + "logps/rejected": -132.6279296875, + "loss": 0.1811, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.655082702636719, + "rewards/margins": 0.8624157905578613, + "rewards/rejected": 5.792666912078857, + "step": 4093 + }, + { + "epoch": 0.66, + "learning_rate": 7.789859921491288e-06, + "logits/chosen": -1.3037680387496948, + "logits/rejected": -1.2942471504211426, + "logps/chosen": -135.03256225585938, + "logps/rejected": -123.90530395507812, + "loss": 0.4416, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.099609375, + "rewards/margins": -0.15665912628173828, + "rewards/rejected": 6.256268501281738, + "step": 4094 + }, + { + "epoch": 0.66, + "learning_rate": 7.78876917862144e-06, + "logits/chosen": -1.5220690965652466, + "logits/rejected": -1.5000561475753784, + "logps/chosen": -66.00933837890625, + "logps/rejected": -54.61115264892578, + "loss": 0.5784, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.157143592834473, + "rewards/margins": 4.062005996704102, + "rewards/rejected": 2.09513783454895, + "step": 4095 + }, + { + "epoch": 0.66, + "learning_rate": 7.78767824307403e-06, + "logits/chosen": -1.0084662437438965, + "logits/rejected": -1.0335124731063843, + "logps/chosen": -48.33197784423828, + "logps/rejected": -131.7489013671875, + "loss": 0.7189, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6019614934921265, + "rewards/margins": 0.5279746055603027, + "rewards/rejected": 1.0739868879318237, + "step": 4096 + }, + { + "epoch": 0.66, + "learning_rate": 7.786587114924431e-06, + "logits/chosen": -0.6782735586166382, + "logits/rejected": -0.7378796339035034, + "logps/chosen": -51.73942565917969, + "logps/rejected": -108.22708129882812, + "loss": 0.1447, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0857269763946533, + "rewards/margins": 1.2316207885742188, + "rewards/rejected": 0.8541061282157898, + "step": 4097 + }, + { + "epoch": 0.67, + "learning_rate": 7.78549579424803e-06, + "logits/chosen": -1.29421067237854, + "logits/rejected": -1.3323156833648682, + "logps/chosen": -94.6387710571289, + "logps/rejected": -91.37454223632812, + "loss": 0.499, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.604865312576294, + "rewards/margins": -0.20350873470306396, + "rewards/rejected": 1.808374047279358, + "step": 4098 + }, + { + "epoch": 0.67, + "learning_rate": 7.784404281120225e-06, + "logits/chosen": -1.3558610677719116, + "logits/rejected": -1.3651467561721802, + "logps/chosen": -127.23726654052734, + "logps/rejected": -99.48442077636719, + "loss": 2.1726, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.013822317123413, + "rewards/margins": -4.238096237182617, + "rewards/rejected": 6.251918315887451, + "step": 4099 + }, + { + "epoch": 0.67, + "learning_rate": 7.783312575616432e-06, + "logits/chosen": -1.2747788429260254, + "logits/rejected": -1.2164092063903809, + "logps/chosen": -38.283016204833984, + "logps/rejected": -153.29678344726562, + "loss": 0.4888, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2376277446746826, + "rewards/margins": 1.1116359233856201, + "rewards/rejected": 1.1259918212890625, + "step": 4100 + }, + { + "epoch": 0.67, + "learning_rate": 7.782220677812074e-06, + "logits/chosen": -1.3334242105484009, + "logits/rejected": -1.307386875152588, + "logps/chosen": -143.05569458007812, + "logps/rejected": -98.5020751953125, + "loss": 1.4064, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.463876247406006, + "rewards/margins": -1.9757370948791504, + "rewards/rejected": 8.439613342285156, + "step": 4101 + }, + { + "epoch": 0.67, + "learning_rate": 7.781128587782595e-06, + "logits/chosen": -1.076780915260315, + "logits/rejected": -1.0663714408874512, + "logps/chosen": -89.99283599853516, + "logps/rejected": -137.21337890625, + "loss": 1.5618, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.939935326576233, + "rewards/margins": -3.062835693359375, + "rewards/rejected": 5.002770900726318, + "step": 4102 + }, + { + "epoch": 0.67, + "learning_rate": 7.780036305603445e-06, + "logits/chosen": -1.4057096242904663, + "logits/rejected": -1.3623498678207397, + "logps/chosen": -40.59809875488281, + "logps/rejected": -37.57777786254883, + "loss": 0.2544, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6346116065979004, + "rewards/margins": 0.4255053997039795, + "rewards/rejected": 2.209106206893921, + "step": 4103 + }, + { + "epoch": 0.67, + "learning_rate": 7.77894383135009e-06, + "logits/chosen": -1.0745970010757446, + "logits/rejected": -1.0745970010757446, + "logps/chosen": -115.24840545654297, + "logps/rejected": -115.24840545654297, + "loss": 1.2451, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1023385524749756, + "rewards/margins": 0.0, + "rewards/rejected": 3.1023385524749756, + "step": 4104 + }, + { + "epoch": 0.67, + "learning_rate": 7.777851165098012e-06, + "logits/chosen": -1.1609556674957275, + "logits/rejected": -1.2888652086257935, + "logps/chosen": -70.91365814208984, + "logps/rejected": -117.28260040283203, + "loss": 1.4291, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.340268850326538, + "rewards/margins": -2.0431807041168213, + "rewards/rejected": 5.383449554443359, + "step": 4105 + }, + { + "epoch": 0.67, + "learning_rate": 7.776758306922703e-06, + "logits/chosen": -1.283191204071045, + "logits/rejected": -1.192804217338562, + "logps/chosen": -78.86027526855469, + "logps/rejected": -32.230628967285156, + "loss": 0.5751, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2007896900177, + "rewards/margins": 2.2279515266418457, + "rewards/rejected": 0.9728382229804993, + "step": 4106 + }, + { + "epoch": 0.67, + "learning_rate": 7.775665256899667e-06, + "logits/chosen": -1.0036168098449707, + "logits/rejected": -0.934029221534729, + "logps/chosen": -66.42321014404297, + "logps/rejected": -38.46247482299805, + "loss": 0.8196, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7943428158760071, + "rewards/margins": -1.3700284957885742, + "rewards/rejected": 2.1643712520599365, + "step": 4107 + }, + { + "epoch": 0.67, + "learning_rate": 7.774572015104427e-06, + "logits/chosen": -0.990464448928833, + "logits/rejected": -1.0883928537368774, + "logps/chosen": -173.25668334960938, + "logps/rejected": -125.72047424316406, + "loss": 0.0551, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.237652778625488, + "rewards/margins": 3.055589437484741, + "rewards/rejected": 2.182063341140747, + "step": 4108 + }, + { + "epoch": 0.67, + "learning_rate": 7.773478581612514e-06, + "logits/chosen": -0.8805405497550964, + "logits/rejected": -0.8953830599784851, + "logps/chosen": -58.96333694458008, + "logps/rejected": -54.618194580078125, + "loss": 0.6992, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7002102136611938, + "rewards/margins": -1.1090441942214966, + "rewards/rejected": 2.8092544078826904, + "step": 4109 + }, + { + "epoch": 0.67, + "learning_rate": 7.772384956499475e-06, + "logits/chosen": -0.5955837965011597, + "logits/rejected": -0.6381538510322571, + "logps/chosen": -16.802339553833008, + "logps/rejected": -32.93166732788086, + "loss": 2.1473, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5513243079185486, + "rewards/margins": -0.42653560638427734, + "rewards/rejected": 0.9778599143028259, + "step": 4110 + }, + { + "epoch": 0.67, + "learning_rate": 7.771291139840867e-06, + "logits/chosen": -0.9813845157623291, + "logits/rejected": -0.9758561849594116, + "logps/chosen": -66.86450958251953, + "logps/rejected": -84.87942504882812, + "loss": 0.1886, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5422401428222656, + "rewards/margins": 0.7877250909805298, + "rewards/rejected": 1.7545150518417358, + "step": 4111 + }, + { + "epoch": 0.67, + "learning_rate": 7.770197131712262e-06, + "logits/chosen": -0.9108277559280396, + "logits/rejected": -0.6161225438117981, + "logps/chosen": -97.32964324951172, + "logps/rejected": -26.50066566467285, + "loss": 0.0493, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4739327430725098, + "rewards/margins": 2.8782401084899902, + "rewards/rejected": 0.5956926345825195, + "step": 4112 + }, + { + "epoch": 0.67, + "learning_rate": 7.769102932189249e-06, + "logits/chosen": -0.7420825362205505, + "logits/rejected": -0.7156261801719666, + "logps/chosen": -52.20482635498047, + "logps/rejected": -75.8763427734375, + "loss": 0.9177, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5279289484024048, + "rewards/margins": -0.7578743696212769, + "rewards/rejected": 2.2858033180236816, + "step": 4113 + }, + { + "epoch": 0.67, + "learning_rate": 7.768008541347423e-06, + "logits/chosen": -1.1508800983428955, + "logits/rejected": -1.1299196481704712, + "logps/chosen": -76.77366638183594, + "logps/rejected": -68.60942077636719, + "loss": 0.7094, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9063400030136108, + "rewards/margins": -1.0149673223495483, + "rewards/rejected": 2.921307325363159, + "step": 4114 + }, + { + "epoch": 0.67, + "learning_rate": 7.7669139592624e-06, + "logits/chosen": -1.057145357131958, + "logits/rejected": -0.8676489591598511, + "logps/chosen": -128.35873413085938, + "logps/rejected": -88.22502136230469, + "loss": 0.0958, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.956457614898682, + "rewards/margins": 1.7316789627075195, + "rewards/rejected": 5.224778652191162, + "step": 4115 + }, + { + "epoch": 0.67, + "learning_rate": 7.765819186009802e-06, + "logits/chosen": -1.229704737663269, + "logits/rejected": -1.2167242765426636, + "logps/chosen": -69.73556518554688, + "logps/rejected": -65.401123046875, + "loss": 0.759, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6586685180664062, + "rewards/margins": -0.5714836120605469, + "rewards/rejected": 2.230152130126953, + "step": 4116 + }, + { + "epoch": 0.67, + "learning_rate": 7.764724221665269e-06, + "logits/chosen": -0.6417826414108276, + "logits/rejected": -0.6392539143562317, + "logps/chosen": -1.994659185409546, + "logps/rejected": -9.287260055541992, + "loss": 0.6904, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23801878094673157, + "rewards/margins": 0.08878277242183685, + "rewards/rejected": 0.14923600852489471, + "step": 4117 + }, + { + "epoch": 0.67, + "learning_rate": 7.763629066304452e-06, + "logits/chosen": -1.0931140184402466, + "logits/rejected": -1.0484651327133179, + "logps/chosen": -78.94769287109375, + "logps/rejected": -61.419822692871094, + "loss": 1.899, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5174522399902344, + "rewards/margins": -2.3232994079589844, + "rewards/rejected": 3.8407516479492188, + "step": 4118 + }, + { + "epoch": 0.67, + "learning_rate": 7.762533720003016e-06, + "logits/chosen": -0.8109433054924011, + "logits/rejected": -0.20383328199386597, + "logps/chosen": -38.19136428833008, + "logps/rejected": -36.51949691772461, + "loss": 1.0934, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.643087387084961, + "rewards/margins": 1.2892882823944092, + "rewards/rejected": 0.35379907488822937, + "step": 4119 + }, + { + "epoch": 0.67, + "learning_rate": 7.76143818283664e-06, + "logits/chosen": -1.4902551174163818, + "logits/rejected": -1.5423870086669922, + "logps/chosen": -81.43572998046875, + "logps/rejected": -89.4513168334961, + "loss": 1.759, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2462738752365112, + "rewards/margins": -2.9357824325561523, + "rewards/rejected": 4.182056427001953, + "step": 4120 + }, + { + "epoch": 0.67, + "learning_rate": 7.760342454881013e-06, + "logits/chosen": -0.7687674164772034, + "logits/rejected": -0.7622455954551697, + "logps/chosen": -0.7402436137199402, + "logps/rejected": -4.839109897613525, + "loss": 0.7148, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.16904786229133606, + "rewards/margins": -0.09785535931587219, + "rewards/rejected": 0.26690322160720825, + "step": 4121 + }, + { + "epoch": 0.67, + "learning_rate": 7.759246536211843e-06, + "logits/chosen": -1.3822124004364014, + "logits/rejected": -1.3311337232589722, + "logps/chosen": -41.9058837890625, + "logps/rejected": -28.796619415283203, + "loss": 0.0947, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5689762830734253, + "rewards/margins": 1.819217562675476, + "rewards/rejected": -0.2502412796020508, + "step": 4122 + }, + { + "epoch": 0.67, + "learning_rate": 7.758150426904844e-06, + "logits/chosen": -0.7156274914741516, + "logits/rejected": -0.7436218857765198, + "logps/chosen": -53.63605499267578, + "logps/rejected": -80.29940795898438, + "loss": 0.6734, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.867853581905365, + "rewards/margins": -0.6712715029716492, + "rewards/rejected": 1.5391250848770142, + "step": 4123 + }, + { + "epoch": 0.67, + "learning_rate": 7.75705412703575e-06, + "logits/chosen": -1.0801668167114258, + "logits/rejected": -1.0505907535552979, + "logps/chosen": -97.78160095214844, + "logps/rejected": -56.19575500488281, + "loss": 0.5048, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.648259162902832, + "rewards/margins": -0.5426793098449707, + "rewards/rejected": 5.190938472747803, + "step": 4124 + }, + { + "epoch": 0.67, + "learning_rate": 7.755957636680303e-06, + "logits/chosen": -1.1537142992019653, + "logits/rejected": -1.1312247514724731, + "logps/chosen": -40.77953338623047, + "logps/rejected": -52.867759704589844, + "loss": 1.5788, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3869264125823975, + "rewards/margins": 0.4445030689239502, + "rewards/rejected": 2.9424233436584473, + "step": 4125 + }, + { + "epoch": 0.67, + "learning_rate": 7.75486095591426e-06, + "logits/chosen": -1.1048012971878052, + "logits/rejected": -1.1335662603378296, + "logps/chosen": -173.4135284423828, + "logps/rejected": -70.16588592529297, + "loss": 0.1012, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.489021301269531, + "rewards/margins": 1.5371253490447998, + "rewards/rejected": 3.9518959522247314, + "step": 4126 + }, + { + "epoch": 0.67, + "learning_rate": 7.75376408481339e-06, + "logits/chosen": -1.033195972442627, + "logits/rejected": -1.0918772220611572, + "logps/chosen": -116.41067504882812, + "logps/rejected": -106.14974975585938, + "loss": 0.8361, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.603869915008545, + "rewards/margins": -0.37445640563964844, + "rewards/rejected": 6.978326320648193, + "step": 4127 + }, + { + "epoch": 0.67, + "learning_rate": 7.752667023453481e-06, + "logits/chosen": -0.8058549165725708, + "logits/rejected": -0.8773077726364136, + "logps/chosen": -76.54586791992188, + "logps/rejected": -105.64009857177734, + "loss": 0.9343, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.446986436843872, + "rewards/margins": -1.5719962120056152, + "rewards/rejected": 3.0189826488494873, + "step": 4128 + }, + { + "epoch": 0.67, + "learning_rate": 7.751569771910326e-06, + "logits/chosen": -0.925037682056427, + "logits/rejected": -0.8731239438056946, + "logps/chosen": -62.661460876464844, + "logps/rejected": -43.41227340698242, + "loss": 0.1813, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.665013074874878, + "rewards/margins": 1.3685797452926636, + "rewards/rejected": 1.2964333295822144, + "step": 4129 + }, + { + "epoch": 0.67, + "learning_rate": 7.750472330259735e-06, + "logits/chosen": -1.1637920141220093, + "logits/rejected": -1.1337852478027344, + "logps/chosen": -34.7081298828125, + "logps/rejected": -11.045242309570312, + "loss": 0.0654, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0545144081115723, + "rewards/margins": 2.3398258686065674, + "rewards/rejected": 0.7146884799003601, + "step": 4130 + }, + { + "epoch": 0.67, + "learning_rate": 7.74937469857753e-06, + "logits/chosen": -1.0940176248550415, + "logits/rejected": -0.876962423324585, + "logps/chosen": -94.58563232421875, + "logps/rejected": -48.58763885498047, + "loss": 0.2918, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.488037109375, + "rewards/margins": 0.7055809497833252, + "rewards/rejected": 2.782456159591675, + "step": 4131 + }, + { + "epoch": 0.67, + "learning_rate": 7.74827687693955e-06, + "logits/chosen": -0.970073938369751, + "logits/rejected": -0.8272401690483093, + "logps/chosen": -61.27082443237305, + "logps/rejected": -56.954376220703125, + "loss": 0.3668, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.099915027618408, + "rewards/margins": 0.49939775466918945, + "rewards/rejected": 3.6005172729492188, + "step": 4132 + }, + { + "epoch": 0.67, + "learning_rate": 7.74717886542164e-06, + "logits/chosen": -1.0342024564743042, + "logits/rejected": -1.275674819946289, + "logps/chosen": -122.75535583496094, + "logps/rejected": -42.38562774658203, + "loss": 0.1641, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.9980669021606445, + "rewards/margins": 4.633267879486084, + "rewards/rejected": 1.36479914188385, + "step": 4133 + }, + { + "epoch": 0.67, + "learning_rate": 7.746080664099667e-06, + "logits/chosen": -1.2005939483642578, + "logits/rejected": -1.2005939483642578, + "logps/chosen": -28.71564483642578, + "logps/rejected": -28.71564483642578, + "loss": 2.0904, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.794114828109741, + "rewards/margins": 0.0, + "rewards/rejected": 2.794114828109741, + "step": 4134 + }, + { + "epoch": 0.67, + "learning_rate": 7.744982273049502e-06, + "logits/chosen": -0.9366518259048462, + "logits/rejected": -0.8774245977401733, + "logps/chosen": -69.59813690185547, + "logps/rejected": -44.08680725097656, + "loss": 0.9902, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2907204627990723, + "rewards/margins": -1.0609397888183594, + "rewards/rejected": 3.3516602516174316, + "step": 4135 + }, + { + "epoch": 0.67, + "learning_rate": 7.743883692347036e-06, + "logits/chosen": -1.1468887329101562, + "logits/rejected": -1.1118942499160767, + "logps/chosen": -65.72512817382812, + "logps/rejected": -72.33706665039062, + "loss": 1.1147, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8528244495391846, + "rewards/margins": -1.2938621044158936, + "rewards/rejected": 4.146686553955078, + "step": 4136 + }, + { + "epoch": 0.67, + "learning_rate": 7.74278492206817e-06, + "logits/chosen": -0.8036367297172546, + "logits/rejected": -0.8060488104820251, + "logps/chosen": -2.1243896484375, + "logps/rejected": -1.271671175956726, + "loss": 0.3872, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.17527027428150177, + "rewards/margins": -0.14526920020580292, + "rewards/rejected": 0.3205394744873047, + "step": 4137 + }, + { + "epoch": 0.67, + "learning_rate": 7.741685962288817e-06, + "logits/chosen": -0.9287634491920471, + "logits/rejected": -0.7990986704826355, + "logps/chosen": -45.38203048706055, + "logps/rejected": -40.26285171508789, + "loss": 0.6174, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3216159343719482, + "rewards/margins": -0.4126253128051758, + "rewards/rejected": 2.734241247177124, + "step": 4138 + }, + { + "epoch": 0.67, + "learning_rate": 7.740586813084907e-06, + "logits/chosen": -0.5901035070419312, + "logits/rejected": -0.5901035070419312, + "logps/chosen": -0.20715519785881042, + "logps/rejected": -0.20715519785881042, + "loss": 0.6198, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.10463998466730118, + "rewards/margins": 0.0, + "rewards/rejected": 0.10463998466730118, + "step": 4139 + }, + { + "epoch": 0.67, + "learning_rate": 7.73948747453238e-06, + "logits/chosen": -1.0815006494522095, + "logits/rejected": -1.0815006494522095, + "logps/chosen": -21.30414581298828, + "logps/rejected": -21.30414581298828, + "loss": 0.8568, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9210622906684875, + "rewards/margins": 0.0, + "rewards/rejected": 0.9210622906684875, + "step": 4140 + }, + { + "epoch": 0.67, + "learning_rate": 7.73838794670719e-06, + "logits/chosen": -1.206917405128479, + "logits/rejected": -1.1577908992767334, + "logps/chosen": -76.26559448242188, + "logps/rejected": -75.99253845214844, + "loss": 0.0216, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.561465740203857, + "rewards/margins": 3.4997270107269287, + "rewards/rejected": 3.0617387294769287, + "step": 4141 + }, + { + "epoch": 0.67, + "learning_rate": 7.737288229685303e-06, + "logits/chosen": -1.2930861711502075, + "logits/rejected": -1.3066380023956299, + "logps/chosen": -49.258907318115234, + "logps/rejected": -80.70435333251953, + "loss": 0.9473, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5806782245635986, + "rewards/margins": -0.9434452056884766, + "rewards/rejected": 2.524123430252075, + "step": 4142 + }, + { + "epoch": 0.67, + "learning_rate": 7.736188323542699e-06, + "logits/chosen": -1.1510355472564697, + "logits/rejected": -1.12894868850708, + "logps/chosen": -83.27162170410156, + "logps/rejected": -123.28865814208984, + "loss": 1.6935, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1900497674942017, + "rewards/margins": 0.6385261416435242, + "rewards/rejected": 0.5515236258506775, + "step": 4143 + }, + { + "epoch": 0.67, + "learning_rate": 7.735088228355373e-06, + "logits/chosen": -1.228739619255066, + "logits/rejected": -1.2461012601852417, + "logps/chosen": -72.69723510742188, + "logps/rejected": -59.11454772949219, + "loss": 2.6042, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.351660966873169, + "rewards/margins": -0.2700660228729248, + "rewards/rejected": 2.6217269897460938, + "step": 4144 + }, + { + "epoch": 0.67, + "learning_rate": 7.733987944199331e-06, + "logits/chosen": -1.103704810142517, + "logits/rejected": -1.1639190912246704, + "logps/chosen": -5.415612697601318, + "logps/rejected": -64.04257202148438, + "loss": 1.0397, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6672672629356384, + "rewards/margins": -0.43111151456832886, + "rewards/rejected": 1.0983787775039673, + "step": 4145 + }, + { + "epoch": 0.67, + "learning_rate": 7.732887471150589e-06, + "logits/chosen": -1.37870192527771, + "logits/rejected": -1.3160178661346436, + "logps/chosen": -42.7018928527832, + "logps/rejected": -30.45962905883789, + "loss": 0.2253, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6493278741836548, + "rewards/margins": 0.703948974609375, + "rewards/rejected": 0.9453788995742798, + "step": 4146 + }, + { + "epoch": 0.67, + "learning_rate": 7.731786809285184e-06, + "logits/chosen": -1.3912644386291504, + "logits/rejected": -1.186079502105713, + "logps/chosen": -125.79952239990234, + "logps/rejected": -52.61011505126953, + "loss": 0.1153, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.250626564025879, + "rewards/margins": 5.6028361320495605, + "rewards/rejected": 0.6477905511856079, + "step": 4147 + }, + { + "epoch": 0.67, + "learning_rate": 7.730685958679158e-06, + "logits/chosen": -1.2398754358291626, + "logits/rejected": -1.0647722482681274, + "logps/chosen": -111.7885971069336, + "logps/rejected": -26.771331787109375, + "loss": 0.4814, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.8307061195373535, + "rewards/margins": 3.74951171875, + "rewards/rejected": 3.0811944007873535, + "step": 4148 + }, + { + "epoch": 0.67, + "learning_rate": 7.729584919408571e-06, + "logits/chosen": -0.9471275806427002, + "logits/rejected": -1.0325344800949097, + "logps/chosen": -68.49325561523438, + "logps/rejected": -101.20045471191406, + "loss": 1.6575, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8819305896759033, + "rewards/margins": -3.2625277042388916, + "rewards/rejected": 5.144458293914795, + "step": 4149 + }, + { + "epoch": 0.67, + "learning_rate": 7.728483691549491e-06, + "logits/chosen": -1.1197178363800049, + "logits/rejected": -1.0308717489242554, + "logps/chosen": -41.20890808105469, + "logps/rejected": -34.94511413574219, + "loss": 0.5476, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9884270429611206, + "rewards/margins": 0.7705227136611938, + "rewards/rejected": 1.2179043292999268, + "step": 4150 + }, + { + "epoch": 0.67, + "learning_rate": 7.727382275178008e-06, + "logits/chosen": -1.1367555856704712, + "logits/rejected": -1.112707257270813, + "logps/chosen": -86.57179260253906, + "logps/rejected": -90.25521850585938, + "loss": 0.4414, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0278306007385254, + "rewards/margins": 0.2856346368789673, + "rewards/rejected": 1.742195963859558, + "step": 4151 + }, + { + "epoch": 0.67, + "learning_rate": 7.726280670370214e-06, + "logits/chosen": -0.7265112996101379, + "logits/rejected": -0.819096028804779, + "logps/chosen": -53.51812744140625, + "logps/rejected": -36.73220443725586, + "loss": 0.4046, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8941749334335327, + "rewards/margins": 0.41655266284942627, + "rewards/rejected": 1.4776222705841064, + "step": 4152 + }, + { + "epoch": 0.67, + "learning_rate": 7.725178877202225e-06, + "logits/chosen": -1.3148014545440674, + "logits/rejected": -1.2393723726272583, + "logps/chosen": -117.5152359008789, + "logps/rejected": -59.06686019897461, + "loss": 0.75, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5098412036895752, + "rewards/margins": -0.8967556953430176, + "rewards/rejected": 2.4065968990325928, + "step": 4153 + }, + { + "epoch": 0.67, + "learning_rate": 7.72407689575016e-06, + "logits/chosen": -1.3082741498947144, + "logits/rejected": -1.3293143510818481, + "logps/chosen": -222.20327758789062, + "logps/rejected": -144.23324584960938, + "loss": 0.4697, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.254245281219482, + "rewards/margins": -0.4141964912414551, + "rewards/rejected": 7.6684417724609375, + "step": 4154 + }, + { + "epoch": 0.67, + "learning_rate": 7.722974726090155e-06, + "logits/chosen": -0.7428123354911804, + "logits/rejected": -0.7380301356315613, + "logps/chosen": -42.22779083251953, + "logps/rejected": -81.07691192626953, + "loss": 1.714, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2806862592697144, + "rewards/margins": -2.6025137901306152, + "rewards/rejected": 3.883200168609619, + "step": 4155 + }, + { + "epoch": 0.67, + "learning_rate": 7.721872368298365e-06, + "logits/chosen": -1.0225366353988647, + "logits/rejected": -0.9780578017234802, + "logps/chosen": -92.23727416992188, + "logps/rejected": -95.04899597167969, + "loss": 0.2261, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2487633228302, + "rewards/margins": 1.3394310474395752, + "rewards/rejected": 1.909332275390625, + "step": 4156 + }, + { + "epoch": 0.67, + "learning_rate": 7.720769822450946e-06, + "logits/chosen": -1.0892361402511597, + "logits/rejected": -1.080385446548462, + "logps/chosen": -162.39984130859375, + "logps/rejected": -64.65460205078125, + "loss": 0.5946, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.414453029632568, + "rewards/margins": 3.4652960300445557, + "rewards/rejected": 1.9491569995880127, + "step": 4157 + }, + { + "epoch": 0.67, + "learning_rate": 7.719667088624078e-06, + "logits/chosen": -0.8646401166915894, + "logits/rejected": -0.8859816193580627, + "logps/chosen": -54.02974319458008, + "logps/rejected": -94.95984649658203, + "loss": 0.456, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9229183197021484, + "rewards/margins": -0.12322354316711426, + "rewards/rejected": 2.0461418628692627, + "step": 4158 + }, + { + "epoch": 0.68, + "learning_rate": 7.718564166893947e-06, + "logits/chosen": -1.2434760332107544, + "logits/rejected": -1.1857813596725464, + "logps/chosen": -87.21044921875, + "logps/rejected": -70.51638793945312, + "loss": 0.0366, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.744049072265625, + "rewards/margins": 2.8478384017944336, + "rewards/rejected": 1.8962105512619019, + "step": 4159 + }, + { + "epoch": 0.68, + "learning_rate": 7.717461057336755e-06, + "logits/chosen": -0.8865240812301636, + "logits/rejected": -0.8390069603919983, + "logps/chosen": -55.647239685058594, + "logps/rejected": -43.205318450927734, + "loss": 2.5262, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3767204284667969, + "rewards/margins": -0.011918306350708008, + "rewards/rejected": 1.3886387348175049, + "step": 4160 + }, + { + "epoch": 0.68, + "learning_rate": 7.716357760028718e-06, + "logits/chosen": -1.1074798107147217, + "logits/rejected": -1.0940473079681396, + "logps/chosen": -80.90257263183594, + "logps/rejected": -34.820953369140625, + "loss": 0.3556, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.772619605064392, + "rewards/margins": 0.006365180015563965, + "rewards/rejected": 1.7662544250488281, + "step": 4161 + }, + { + "epoch": 0.68, + "learning_rate": 7.715254275046062e-06, + "logits/chosen": -1.5385003089904785, + "logits/rejected": -1.5867124795913696, + "logps/chosen": -82.63642120361328, + "logps/rejected": -91.98556518554688, + "loss": 0.8882, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5967933535575867, + "rewards/margins": -1.5555999279022217, + "rewards/rejected": 2.152393341064453, + "step": 4162 + }, + { + "epoch": 0.68, + "learning_rate": 7.714150602465028e-06, + "logits/chosen": -1.136474847793579, + "logits/rejected": -1.1025327444076538, + "logps/chosen": -60.90092468261719, + "logps/rejected": -59.989830017089844, + "loss": 1.1564, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.183697462081909, + "rewards/margins": -1.9130961894989014, + "rewards/rejected": 4.0967936515808105, + "step": 4163 + }, + { + "epoch": 0.68, + "learning_rate": 7.713046742361867e-06, + "logits/chosen": -0.8379074335098267, + "logits/rejected": -0.8216598033905029, + "logps/chosen": -148.0310821533203, + "logps/rejected": -65.04881286621094, + "loss": 0.085, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.7611236572265625, + "rewards/margins": 1.7142834663391113, + "rewards/rejected": 5.046840190887451, + "step": 4164 + }, + { + "epoch": 0.68, + "learning_rate": 7.711942694812849e-06, + "logits/chosen": -1.0898829698562622, + "logits/rejected": -1.0761932134628296, + "logps/chosen": -89.09977722167969, + "logps/rejected": -58.573875427246094, + "loss": 0.5177, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7633583545684814, + "rewards/margins": 0.5753874778747559, + "rewards/rejected": 2.1879708766937256, + "step": 4165 + }, + { + "epoch": 0.68, + "learning_rate": 7.71083845989425e-06, + "logits/chosen": -1.137748122215271, + "logits/rejected": -1.1552585363388062, + "logps/chosen": -52.16483688354492, + "logps/rejected": -84.66737365722656, + "loss": 0.7404, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6888889074325562, + "rewards/margins": -0.25682950019836426, + "rewards/rejected": 1.9457184076309204, + "step": 4166 + }, + { + "epoch": 0.68, + "learning_rate": 7.709734037682364e-06, + "logits/chosen": -1.0740978717803955, + "logits/rejected": -1.1152678728103638, + "logps/chosen": -89.89662170410156, + "logps/rejected": -84.0135498046875, + "loss": 0.4399, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2043564319610596, + "rewards/margins": 0.947407603263855, + "rewards/rejected": 1.2569488286972046, + "step": 4167 + }, + { + "epoch": 0.68, + "learning_rate": 7.708629428253497e-06, + "logits/chosen": -0.9219570755958557, + "logits/rejected": -0.8782307505607605, + "logps/chosen": -47.72081756591797, + "logps/rejected": -80.93347930908203, + "loss": 0.4404, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9641467928886414, + "rewards/margins": -0.30226486921310425, + "rewards/rejected": 1.2664116621017456, + "step": 4168 + }, + { + "epoch": 0.68, + "learning_rate": 7.707524631683964e-06, + "logits/chosen": -1.4296863079071045, + "logits/rejected": -1.3626556396484375, + "logps/chosen": -104.38705444335938, + "logps/rejected": -44.8299560546875, + "loss": 0.3963, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8695564270019531, + "rewards/margins": -0.06706929206848145, + "rewards/rejected": 1.9366257190704346, + "step": 4169 + }, + { + "epoch": 0.68, + "learning_rate": 7.7064196480501e-06, + "logits/chosen": -0.7933391332626343, + "logits/rejected": -0.8005738258361816, + "logps/chosen": -2.494763135910034, + "logps/rejected": -0.9375553727149963, + "loss": 0.5557, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.20001986622810364, + "rewards/margins": -0.05325651168823242, + "rewards/rejected": 0.25327637791633606, + "step": 4170 + }, + { + "epoch": 0.68, + "learning_rate": 7.705314477428246e-06, + "logits/chosen": -1.111671805381775, + "logits/rejected": -1.111671805381775, + "logps/chosen": -36.76858901977539, + "logps/rejected": -36.76858901977539, + "loss": 0.4123, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2235122919082642, + "rewards/margins": 0.0, + "rewards/rejected": 1.2235122919082642, + "step": 4171 + }, + { + "epoch": 0.68, + "learning_rate": 7.704209119894759e-06, + "logits/chosen": -1.2828608751296997, + "logits/rejected": -1.3219314813613892, + "logps/chosen": -36.85681915283203, + "logps/rejected": -75.14185333251953, + "loss": 0.3186, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8921082019805908, + "rewards/margins": 0.12616658210754395, + "rewards/rejected": 1.7659416198730469, + "step": 4172 + }, + { + "epoch": 0.68, + "learning_rate": 7.70310357552601e-06, + "logits/chosen": -0.9319390058517456, + "logits/rejected": -0.948293149471283, + "logps/chosen": -58.05718994140625, + "logps/rejected": -55.63590621948242, + "loss": 0.7656, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.346971869468689, + "rewards/margins": -1.1677647829055786, + "rewards/rejected": 2.5147366523742676, + "step": 4173 + }, + { + "epoch": 0.68, + "learning_rate": 7.701997844398379e-06, + "logits/chosen": -1.109144926071167, + "logits/rejected": -1.1549192667007446, + "logps/chosen": -67.67826080322266, + "logps/rejected": -69.55802917480469, + "loss": 0.7796, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3918190002441406, + "rewards/margins": -1.2155311107635498, + "rewards/rejected": 3.6073501110076904, + "step": 4174 + }, + { + "epoch": 0.68, + "learning_rate": 7.700891926588265e-06, + "logits/chosen": -1.0637598037719727, + "logits/rejected": -1.1105698347091675, + "logps/chosen": -42.984832763671875, + "logps/rejected": -71.81324768066406, + "loss": 1.2272, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7683407068252563, + "rewards/margins": -1.5726262331008911, + "rewards/rejected": 2.3409669399261475, + "step": 4175 + }, + { + "epoch": 0.68, + "learning_rate": 7.699785822172074e-06, + "logits/chosen": -1.2981033325195312, + "logits/rejected": -1.3303250074386597, + "logps/chosen": -271.4908142089844, + "logps/rejected": -81.62024688720703, + "loss": 1.1282, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.975308418273926, + "rewards/margins": 4.133703231811523, + "rewards/rejected": 1.8416054248809814, + "step": 4176 + }, + { + "epoch": 0.68, + "learning_rate": 7.698679531226229e-06, + "logits/chosen": -0.6195260882377625, + "logits/rejected": -0.7609591484069824, + "logps/chosen": -75.40229034423828, + "logps/rejected": -95.24542236328125, + "loss": 2.4625, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1243263483047485, + "rewards/margins": -4.834655284881592, + "rewards/rejected": 5.958981513977051, + "step": 4177 + }, + { + "epoch": 0.68, + "learning_rate": 7.697573053827163e-06, + "logits/chosen": -1.0806647539138794, + "logits/rejected": -0.9965039491653442, + "logps/chosen": -99.24810791015625, + "logps/rejected": -58.66600036621094, + "loss": 0.0297, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.385327339172363, + "rewards/margins": 3.8023033142089844, + "rewards/rejected": 1.5830239057540894, + "step": 4178 + }, + { + "epoch": 0.68, + "learning_rate": 7.696466390051325e-06, + "logits/chosen": -1.1364731788635254, + "logits/rejected": -1.1973234415054321, + "logps/chosen": -41.78825759887695, + "logps/rejected": -59.144371032714844, + "loss": 3.6183, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3077282905578613, + "rewards/margins": -0.7133288383483887, + "rewards/rejected": 4.02105712890625, + "step": 4179 + }, + { + "epoch": 0.68, + "learning_rate": 7.695359539975173e-06, + "logits/chosen": -1.1144492626190186, + "logits/rejected": -1.0290244817733765, + "logps/chosen": -63.313934326171875, + "logps/rejected": -59.239952087402344, + "loss": 0.1836, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7089264392852783, + "rewards/margins": 1.1445626020431519, + "rewards/rejected": 1.5643638372421265, + "step": 4180 + }, + { + "epoch": 0.68, + "learning_rate": 7.694252503675181e-06, + "logits/chosen": -0.9449334144592285, + "logits/rejected": -0.7872663736343384, + "logps/chosen": -45.35614776611328, + "logps/rejected": -11.075793266296387, + "loss": 0.2971, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7055584192276, + "rewards/margins": 0.8809680342674255, + "rewards/rejected": 0.8245903849601746, + "step": 4181 + }, + { + "epoch": 0.68, + "learning_rate": 7.693145281227834e-06, + "logits/chosen": -0.9693682789802551, + "logits/rejected": -0.884823739528656, + "logps/chosen": -56.607208251953125, + "logps/rejected": -34.649169921875, + "loss": 0.2775, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.254171848297119, + "rewards/margins": 2.0635251998901367, + "rewards/rejected": 0.19064675271511078, + "step": 4182 + }, + { + "epoch": 0.68, + "learning_rate": 7.69203787270963e-06, + "logits/chosen": -0.9125913977622986, + "logits/rejected": -0.8524573445320129, + "logps/chosen": -25.543468475341797, + "logps/rejected": -39.44236755371094, + "loss": 0.4783, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6991214752197266, + "rewards/margins": -0.1473773717880249, + "rewards/rejected": 1.8464988470077515, + "step": 4183 + }, + { + "epoch": 0.68, + "learning_rate": 7.690930278197082e-06, + "logits/chosen": -1.2623354196548462, + "logits/rejected": -1.1505221128463745, + "logps/chosen": -104.309814453125, + "logps/rejected": -141.82025146484375, + "loss": 0.0594, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.567269802093506, + "rewards/margins": 2.16849946975708, + "rewards/rejected": 5.398770332336426, + "step": 4184 + }, + { + "epoch": 0.68, + "learning_rate": 7.689822497766712e-06, + "logits/chosen": -0.9901956915855408, + "logits/rejected": -0.8614225387573242, + "logps/chosen": -51.90226745605469, + "logps/rejected": -50.606292724609375, + "loss": 1.1055, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5177315473556519, + "rewards/margins": -2.0673890113830566, + "rewards/rejected": 3.585120439529419, + "step": 4185 + }, + { + "epoch": 0.68, + "learning_rate": 7.688714531495061e-06, + "logits/chosen": -0.9837056994438171, + "logits/rejected": -1.0034137964248657, + "logps/chosen": -92.3275146484375, + "logps/rejected": -92.15408325195312, + "loss": 0.1673, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6283440589904785, + "rewards/margins": 1.1451202630996704, + "rewards/rejected": 1.483223795890808, + "step": 4186 + }, + { + "epoch": 0.68, + "learning_rate": 7.687606379458677e-06, + "logits/chosen": -0.9507120251655579, + "logits/rejected": -0.8802892565727234, + "logps/chosen": -71.77759552001953, + "logps/rejected": -14.979509353637695, + "loss": 0.1326, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8468856811523438, + "rewards/margins": 1.2031941413879395, + "rewards/rejected": 0.6436914801597595, + "step": 4187 + }, + { + "epoch": 0.68, + "learning_rate": 7.686498041734121e-06, + "logits/chosen": -1.3630436658859253, + "logits/rejected": -1.3630436658859253, + "logps/chosen": -58.119232177734375, + "logps/rejected": -58.119232177734375, + "loss": 0.467, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.0179595947265625, + "rewards/margins": 0.0, + "rewards/rejected": 4.0179595947265625, + "step": 4188 + }, + { + "epoch": 0.68, + "learning_rate": 7.68538951839797e-06, + "logits/chosen": -0.6429746150970459, + "logits/rejected": -0.5657570958137512, + "logps/chosen": -38.723480224609375, + "logps/rejected": -16.449352264404297, + "loss": 0.2329, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.593335747718811, + "rewards/margins": 0.9721508026123047, + "rewards/rejected": 0.6211849451065063, + "step": 4189 + }, + { + "epoch": 0.68, + "learning_rate": 7.684280809526813e-06, + "logits/chosen": -1.0638234615325928, + "logits/rejected": -1.0954362154006958, + "logps/chosen": -100.13531494140625, + "logps/rejected": -94.9743423461914, + "loss": 1.1884, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7436057925224304, + "rewards/margins": -1.4781279563903809, + "rewards/rejected": 2.221733808517456, + "step": 4190 + }, + { + "epoch": 0.68, + "learning_rate": 7.683171915197251e-06, + "logits/chosen": -1.2729240655899048, + "logits/rejected": -1.2825919389724731, + "logps/chosen": -57.802032470703125, + "logps/rejected": -42.8911247253418, + "loss": 0.8796, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2568107843399048, + "rewards/margins": -0.652065634727478, + "rewards/rejected": 1.9088764190673828, + "step": 4191 + }, + { + "epoch": 0.68, + "learning_rate": 7.682062835485898e-06, + "logits/chosen": -0.9442195892333984, + "logits/rejected": -0.9425562620162964, + "logps/chosen": -51.10325622558594, + "logps/rejected": -46.3958854675293, + "loss": 0.674, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4209396839141846, + "rewards/margins": 1.4798054695129395, + "rewards/rejected": 0.9411342740058899, + "step": 4192 + }, + { + "epoch": 0.68, + "learning_rate": 7.680953570469381e-06, + "logits/chosen": -0.7797046899795532, + "logits/rejected": -0.7792373299598694, + "logps/chosen": -68.51591491699219, + "logps/rejected": -55.23271942138672, + "loss": 0.5137, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7226547002792358, + "rewards/margins": -0.18272101879119873, + "rewards/rejected": 1.9053757190704346, + "step": 4193 + }, + { + "epoch": 0.68, + "learning_rate": 7.67984412022434e-06, + "logits/chosen": -1.0543947219848633, + "logits/rejected": -1.0738779306411743, + "logps/chosen": -34.127254486083984, + "logps/rejected": -63.36848449707031, + "loss": 0.4707, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4255731105804443, + "rewards/margins": 0.20259642601013184, + "rewards/rejected": 2.2229766845703125, + "step": 4194 + }, + { + "epoch": 0.68, + "learning_rate": 7.678734484827428e-06, + "logits/chosen": -1.5383269786834717, + "logits/rejected": -1.4166889190673828, + "logps/chosen": -81.12956237792969, + "logps/rejected": -50.35936737060547, + "loss": 0.1741, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.4921464920043945, + "rewards/margins": 2.3076188564300537, + "rewards/rejected": 2.184527635574341, + "step": 4195 + }, + { + "epoch": 0.68, + "learning_rate": 7.677624664355308e-06, + "logits/chosen": -1.2745637893676758, + "logits/rejected": -1.2307868003845215, + "logps/chosen": -62.60968017578125, + "logps/rejected": -31.992809295654297, + "loss": 0.8899, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1426186561584473, + "rewards/margins": 0.9541752338409424, + "rewards/rejected": 1.1884434223175049, + "step": 4196 + }, + { + "epoch": 0.68, + "learning_rate": 7.676514658884661e-06, + "logits/chosen": -1.1370586156845093, + "logits/rejected": -1.2443100214004517, + "logps/chosen": -37.83922576904297, + "logps/rejected": -69.03475952148438, + "loss": 2.0981, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1591637134552, + "rewards/margins": -2.472377061843872, + "rewards/rejected": 4.631540775299072, + "step": 4197 + }, + { + "epoch": 0.68, + "learning_rate": 7.675404468492175e-06, + "logits/chosen": -0.8426483273506165, + "logits/rejected": -0.8962728381156921, + "logps/chosen": -1.580096960067749, + "logps/rejected": -48.59376907348633, + "loss": 0.8119, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37632492184638977, + "rewards/margins": 0.10848206281661987, + "rewards/rejected": 0.2678428590297699, + "step": 4198 + }, + { + "epoch": 0.68, + "learning_rate": 7.674294093254555e-06, + "logits/chosen": -0.931615948677063, + "logits/rejected": -0.9540539979934692, + "logps/chosen": -103.45220947265625, + "logps/rejected": -122.053955078125, + "loss": 1.146, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5390862226486206, + "rewards/margins": -2.1752753257751465, + "rewards/rejected": 3.7143616676330566, + "step": 4199 + }, + { + "epoch": 0.68, + "learning_rate": 7.673183533248521e-06, + "logits/chosen": -1.1963950395584106, + "logits/rejected": -1.2203717231750488, + "logps/chosen": -40.80916976928711, + "logps/rejected": -52.74845886230469, + "loss": 0.3508, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1363773345947266, + "rewards/margins": 0.4230532646179199, + "rewards/rejected": 2.7133240699768066, + "step": 4200 + }, + { + "epoch": 0.68, + "learning_rate": 7.672072788550795e-06, + "logits/chosen": -0.8638618588447571, + "logits/rejected": -0.7495986223220825, + "logps/chosen": -56.43944549560547, + "logps/rejected": -46.5863151550293, + "loss": 1.1366, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.187124729156494, + "rewards/margins": -0.05866742134094238, + "rewards/rejected": 2.2457921504974365, + "step": 4201 + }, + { + "epoch": 0.68, + "learning_rate": 7.670961859238124e-06, + "logits/chosen": -1.173505187034607, + "logits/rejected": -1.1656444072723389, + "logps/chosen": -94.45341491699219, + "logps/rejected": -112.6610107421875, + "loss": 0.311, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.361837863922119, + "rewards/margins": 0.8711457252502441, + "rewards/rejected": 3.490692138671875, + "step": 4202 + }, + { + "epoch": 0.68, + "learning_rate": 7.669850745387261e-06, + "logits/chosen": -1.4493650197982788, + "logits/rejected": -1.3883055448532104, + "logps/chosen": -108.24177551269531, + "logps/rejected": -63.22608184814453, + "loss": 0.1046, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.1924285888671875, + "rewards/margins": 1.7623100280761719, + "rewards/rejected": 4.430118560791016, + "step": 4203 + }, + { + "epoch": 0.68, + "learning_rate": 7.668739447074974e-06, + "logits/chosen": -1.18013334274292, + "logits/rejected": -1.2381528615951538, + "logps/chosen": -183.45608520507812, + "logps/rejected": -174.81085205078125, + "loss": 0.3128, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.179220676422119, + "rewards/margins": 0.2586240768432617, + "rewards/rejected": 6.920596599578857, + "step": 4204 + }, + { + "epoch": 0.68, + "learning_rate": 7.667627964378044e-06, + "logits/chosen": -0.808677077293396, + "logits/rejected": -0.790436327457428, + "logps/chosen": -30.713825225830078, + "logps/rejected": -17.470855712890625, + "loss": 0.7014, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.54482501745224, + "rewards/margins": -0.5287976861000061, + "rewards/rejected": 1.073622703552246, + "step": 4205 + }, + { + "epoch": 0.68, + "learning_rate": 7.666516297373262e-06, + "logits/chosen": -0.6866846680641174, + "logits/rejected": -0.6527461409568787, + "logps/chosen": -44.939083099365234, + "logps/rejected": -8.58704662322998, + "loss": 0.5465, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.047834038734436, + "rewards/margins": 0.42297643423080444, + "rewards/rejected": 0.6248576045036316, + "step": 4206 + }, + { + "epoch": 0.68, + "learning_rate": 7.665404446137434e-06, + "logits/chosen": -0.9584026336669922, + "logits/rejected": -1.0459929704666138, + "logps/chosen": -40.105499267578125, + "logps/rejected": -44.661895751953125, + "loss": 1.4107, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1576381921768188, + "rewards/margins": -0.8088763952255249, + "rewards/rejected": 1.9665145874023438, + "step": 4207 + }, + { + "epoch": 0.68, + "learning_rate": 7.664292410747381e-06, + "logits/chosen": -1.075608253479004, + "logits/rejected": -1.0888913869857788, + "logps/chosen": -53.58265686035156, + "logps/rejected": -66.71443939208984, + "loss": 1.219, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.309175968170166, + "rewards/margins": -0.8942046165466309, + "rewards/rejected": 4.203380584716797, + "step": 4208 + }, + { + "epoch": 0.68, + "learning_rate": 7.663180191279931e-06, + "logits/chosen": -0.6879609823226929, + "logits/rejected": -0.7093052864074707, + "logps/chosen": -41.51420211791992, + "logps/rejected": -104.2313003540039, + "loss": 1.6914, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1248319149017334, + "rewards/margins": -3.222053289413452, + "rewards/rejected": 5.3468852043151855, + "step": 4209 + }, + { + "epoch": 0.68, + "learning_rate": 7.66206778781193e-06, + "logits/chosen": -1.4443670511245728, + "logits/rejected": -1.4192790985107422, + "logps/chosen": -94.90348052978516, + "logps/rejected": -75.4638671875, + "loss": 0.3708, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3762567043304443, + "rewards/margins": -0.0790557861328125, + "rewards/rejected": 3.455312490463257, + "step": 4210 + }, + { + "epoch": 0.68, + "learning_rate": 7.660955200420232e-06, + "logits/chosen": -1.4335910081863403, + "logits/rejected": -1.4097236394882202, + "logps/chosen": -101.61763000488281, + "logps/rejected": -30.374521255493164, + "loss": 0.2713, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2549270391464233, + "rewards/margins": 0.42430514097213745, + "rewards/rejected": 0.8306218981742859, + "step": 4211 + }, + { + "epoch": 0.68, + "learning_rate": 7.659842429181708e-06, + "logits/chosen": -0.8358656167984009, + "logits/rejected": -0.8581246733665466, + "logps/chosen": -64.91661071777344, + "logps/rejected": -60.67936706542969, + "loss": 0.4662, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8216629028320312, + "rewards/margins": -0.37226176261901855, + "rewards/rejected": 3.19392466545105, + "step": 4212 + }, + { + "epoch": 0.68, + "learning_rate": 7.658729474173241e-06, + "logits/chosen": -1.1368409395217896, + "logits/rejected": -1.050109624862671, + "logps/chosen": -62.636627197265625, + "logps/rejected": -37.210655212402344, + "loss": 0.5663, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.398176670074463, + "rewards/margins": -0.4076101779937744, + "rewards/rejected": 2.8057868480682373, + "step": 4213 + }, + { + "epoch": 0.68, + "learning_rate": 7.657616335471723e-06, + "logits/chosen": -1.0213595628738403, + "logits/rejected": -1.0053834915161133, + "logps/chosen": -61.214691162109375, + "logps/rejected": -96.43140411376953, + "loss": 0.9136, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.103846788406372, + "rewards/margins": 0.3444000482559204, + "rewards/rejected": 1.7594467401504517, + "step": 4214 + }, + { + "epoch": 0.68, + "learning_rate": 7.656503013154064e-06, + "logits/chosen": -1.117781639099121, + "logits/rejected": -1.203455924987793, + "logps/chosen": -65.02017211914062, + "logps/rejected": -92.74563598632812, + "loss": 1.4035, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6505935192108154, + "rewards/margins": -2.584040880203247, + "rewards/rejected": 5.2346343994140625, + "step": 4215 + }, + { + "epoch": 0.68, + "learning_rate": 7.655389507297181e-06, + "logits/chosen": -1.1316850185394287, + "logits/rejected": -1.138594150543213, + "logps/chosen": -31.35687255859375, + "logps/rejected": -53.77579116821289, + "loss": 0.4878, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4396625757217407, + "rewards/margins": -0.49216771125793457, + "rewards/rejected": 1.9318302869796753, + "step": 4216 + }, + { + "epoch": 0.68, + "learning_rate": 7.65427581797801e-06, + "logits/chosen": -1.0692055225372314, + "logits/rejected": -0.9418155550956726, + "logps/chosen": -49.040042877197266, + "logps/rejected": -25.79814338684082, + "loss": 0.618, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5898823738098145, + "rewards/margins": 3.418262481689453, + "rewards/rejected": 0.17161999642848969, + "step": 4217 + }, + { + "epoch": 0.68, + "learning_rate": 7.653161945273497e-06, + "logits/chosen": -0.8381332159042358, + "logits/rejected": -0.8707780838012695, + "logps/chosen": -50.41973114013672, + "logps/rejected": -76.80392456054688, + "loss": 0.4733, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.770220160484314, + "rewards/margins": 0.23790204524993896, + "rewards/rejected": 1.532318115234375, + "step": 4218 + }, + { + "epoch": 0.68, + "learning_rate": 7.652047889260595e-06, + "logits/chosen": -1.08717679977417, + "logits/rejected": -1.08717679977417, + "logps/chosen": -36.547889709472656, + "logps/rejected": -36.547889709472656, + "loss": 0.5264, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9328876733779907, + "rewards/margins": 0.0, + "rewards/rejected": 1.9328876733779907, + "step": 4219 + }, + { + "epoch": 0.68, + "learning_rate": 7.650933650016279e-06, + "logits/chosen": -0.3927542269229889, + "logits/rejected": -0.36333656311035156, + "logps/chosen": -6.547746658325195, + "logps/rejected": -9.546327590942383, + "loss": 0.6561, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5438709259033203, + "rewards/margins": -0.4525865912437439, + "rewards/rejected": 0.9964575171470642, + "step": 4220 + }, + { + "epoch": 0.69, + "learning_rate": 7.64981922761753e-06, + "logits/chosen": -0.5858179330825806, + "logits/rejected": -0.5858179330825806, + "logps/chosen": -56.46416091918945, + "logps/rejected": -56.46416091918945, + "loss": 0.3665, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.242918848991394, + "rewards/margins": 0.0, + "rewards/rejected": 1.242918848991394, + "step": 4221 + }, + { + "epoch": 0.69, + "learning_rate": 7.648704622141347e-06, + "logits/chosen": -1.5114586353302002, + "logits/rejected": -1.5166572332382202, + "logps/chosen": -80.78547668457031, + "logps/rejected": -63.264869689941406, + "loss": 0.9409, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.764862060546875, + "rewards/margins": -1.5507957935333252, + "rewards/rejected": 2.3156578540802, + "step": 4222 + }, + { + "epoch": 0.69, + "learning_rate": 7.647589833664737e-06, + "logits/chosen": -0.6793234348297119, + "logits/rejected": -0.6558361053466797, + "logps/chosen": -67.16238403320312, + "logps/rejected": -70.86150360107422, + "loss": 0.588, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7627884149551392, + "rewards/margins": -0.23367995023727417, + "rewards/rejected": 0.9964683651924133, + "step": 4223 + }, + { + "epoch": 0.69, + "learning_rate": 7.64647486226472e-06, + "logits/chosen": -0.811522901058197, + "logits/rejected": -0.7736455202102661, + "logps/chosen": -14.627368927001953, + "logps/rejected": -5.212683200836182, + "loss": 0.4527, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6135717630386353, + "rewards/margins": -0.2655191421508789, + "rewards/rejected": 0.8790909051895142, + "step": 4224 + }, + { + "epoch": 0.69, + "learning_rate": 7.645359708018331e-06, + "logits/chosen": -0.682731568813324, + "logits/rejected": -0.7847209572792053, + "logps/chosen": -90.05660247802734, + "logps/rejected": -93.59382629394531, + "loss": 3.1408, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8045570254325867, + "rewards/margins": -2.8684945106506348, + "rewards/rejected": 3.673051595687866, + "step": 4225 + }, + { + "epoch": 0.69, + "learning_rate": 7.644244371002619e-06, + "logits/chosen": -1.0336424112319946, + "logits/rejected": -0.9693331718444824, + "logps/chosen": -78.46981811523438, + "logps/rejected": -59.59528350830078, + "loss": 2.0207, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6134033203125, + "rewards/margins": 0.8659484386444092, + "rewards/rejected": 1.7474548816680908, + "step": 4226 + }, + { + "epoch": 0.69, + "learning_rate": 7.643128851294637e-06, + "logits/chosen": -0.6981802582740784, + "logits/rejected": -0.7092235088348389, + "logps/chosen": -64.26412200927734, + "logps/rejected": -132.49713134765625, + "loss": 0.7281, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8966178894042969, + "rewards/margins": 1.0825247764587402, + "rewards/rejected": 0.8140930533409119, + "step": 4227 + }, + { + "epoch": 0.69, + "learning_rate": 7.642013148971465e-06, + "logits/chosen": -0.914535403251648, + "logits/rejected": -0.9184622168540955, + "logps/chosen": -2.0039350986480713, + "logps/rejected": -1.9593249559402466, + "loss": 1.0213, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.289561003446579, + "rewards/margins": 0.10096538066864014, + "rewards/rejected": 0.18859562277793884, + "step": 4228 + }, + { + "epoch": 0.69, + "learning_rate": 7.64089726411018e-06, + "logits/chosen": -1.4022454023361206, + "logits/rejected": -1.4588539600372314, + "logps/chosen": -212.56060791015625, + "logps/rejected": -40.05838394165039, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.328686714172363, + "rewards/margins": 7.062218189239502, + "rewards/rejected": 0.26646843552589417, + "step": 4229 + }, + { + "epoch": 0.69, + "learning_rate": 7.639781196787886e-06, + "logits/chosen": -1.2288674116134644, + "logits/rejected": -1.1065760850906372, + "logps/chosen": -147.62930297851562, + "logps/rejected": -57.352046966552734, + "loss": 0.1082, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7054765224456787, + "rewards/margins": 1.5110273361206055, + "rewards/rejected": 2.1944491863250732, + "step": 4230 + }, + { + "epoch": 0.69, + "learning_rate": 7.638664947081687e-06, + "logits/chosen": -0.8648074269294739, + "logits/rejected": -0.8633161187171936, + "logps/chosen": -46.33599853515625, + "logps/rejected": -66.89151763916016, + "loss": 0.8625, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7035430669784546, + "rewards/margins": -1.4697593450546265, + "rewards/rejected": 3.173302412033081, + "step": 4231 + }, + { + "epoch": 0.69, + "learning_rate": 7.637548515068706e-06, + "logits/chosen": -1.1130000352859497, + "logits/rejected": -1.1095647811889648, + "logps/chosen": -66.39218139648438, + "logps/rejected": -103.20835876464844, + "loss": 0.4799, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2534186840057373, + "rewards/margins": 1.079032063484192, + "rewards/rejected": 1.1743866205215454, + "step": 4232 + }, + { + "epoch": 0.69, + "learning_rate": 7.636431900826082e-06, + "logits/chosen": -0.7506889700889587, + "logits/rejected": -0.8029608130455017, + "logps/chosen": -72.19380187988281, + "logps/rejected": -132.11375427246094, + "loss": 0.635, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.430493950843811, + "rewards/margins": 0.6314186453819275, + "rewards/rejected": 0.7990753054618835, + "step": 4233 + }, + { + "epoch": 0.69, + "learning_rate": 7.635315104430959e-06, + "logits/chosen": -1.135087013244629, + "logits/rejected": -1.2164164781570435, + "logps/chosen": -76.73918151855469, + "logps/rejected": -128.43756103515625, + "loss": 2.8932, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.641286611557007, + "rewards/margins": -5.261402130126953, + "rewards/rejected": 7.902688503265381, + "step": 4234 + }, + { + "epoch": 0.69, + "learning_rate": 7.634198125960498e-06, + "logits/chosen": -0.7232565879821777, + "logits/rejected": -0.6990861296653748, + "logps/chosen": -88.35824584960938, + "logps/rejected": -57.579593658447266, + "loss": 0.3823, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6899971961975098, + "rewards/margins": 1.5181057453155518, + "rewards/rejected": 1.171891450881958, + "step": 4235 + }, + { + "epoch": 0.69, + "learning_rate": 7.63308096549187e-06, + "logits/chosen": -1.0780751705169678, + "logits/rejected": -1.1779696941375732, + "logps/chosen": -107.90239715576172, + "logps/rejected": -91.86036682128906, + "loss": 0.5969, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3425514698028564, + "rewards/margins": 0.03521728515625, + "rewards/rejected": 3.3073341846466064, + "step": 4236 + }, + { + "epoch": 0.69, + "learning_rate": 7.631963623102264e-06, + "logits/chosen": -1.237074375152588, + "logits/rejected": -1.2641605138778687, + "logps/chosen": -120.38570404052734, + "logps/rejected": -107.47308349609375, + "loss": 0.0485, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.418926239013672, + "rewards/margins": 2.4573638439178467, + "rewards/rejected": 0.9615623354911804, + "step": 4237 + }, + { + "epoch": 0.69, + "learning_rate": 7.630846098868875e-06, + "logits/chosen": -0.5964647531509399, + "logits/rejected": -0.5964647531509399, + "logps/chosen": -1.173404335975647, + "logps/rejected": -1.173404335975647, + "loss": 0.5224, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4783463180065155, + "rewards/margins": 0.0, + "rewards/rejected": 0.4783463180065155, + "step": 4238 + }, + { + "epoch": 0.69, + "learning_rate": 7.629728392868913e-06, + "logits/chosen": -1.0589662790298462, + "logits/rejected": -1.1842238903045654, + "logps/chosen": -38.207069396972656, + "logps/rejected": -138.706787109375, + "loss": 2.7896, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7919613122940063, + "rewards/margins": -3.8740010261535645, + "rewards/rejected": 5.665962219238281, + "step": 4239 + }, + { + "epoch": 0.69, + "learning_rate": 7.628610505179602e-06, + "logits/chosen": -1.3319766521453857, + "logits/rejected": -1.2553621530532837, + "logps/chosen": -97.24848175048828, + "logps/rejected": -75.4365005493164, + "loss": 0.1233, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.757443904876709, + "rewards/margins": 2.527639627456665, + "rewards/rejected": 3.229804277420044, + "step": 4240 + }, + { + "epoch": 0.69, + "learning_rate": 7.627492435878177e-06, + "logits/chosen": -0.8654321432113647, + "logits/rejected": -0.8563678860664368, + "logps/chosen": -15.851853370666504, + "logps/rejected": -5.408362865447998, + "loss": 0.9936, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4223078787326813, + "rewards/margins": 0.14603513479232788, + "rewards/rejected": 0.2762727439403534, + "step": 4241 + }, + { + "epoch": 0.69, + "learning_rate": 7.626374185041887e-06, + "logits/chosen": -1.4592212438583374, + "logits/rejected": -1.292488932609558, + "logps/chosen": -112.39842987060547, + "logps/rejected": -30.654640197753906, + "loss": 0.1626, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4974380731582642, + "rewards/margins": 1.076642632484436, + "rewards/rejected": 0.4207954406738281, + "step": 4242 + }, + { + "epoch": 0.69, + "learning_rate": 7.625255752747991e-06, + "logits/chosen": -0.8283042907714844, + "logits/rejected": -0.6507970690727234, + "logps/chosen": -55.756988525390625, + "logps/rejected": -59.59889602661133, + "loss": 0.0468, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.500500679016113, + "rewards/margins": 2.3544530868530273, + "rewards/rejected": 3.146047592163086, + "step": 4243 + }, + { + "epoch": 0.69, + "learning_rate": 7.624137139073762e-06, + "logits/chosen": -0.7263708114624023, + "logits/rejected": -0.7263708114624023, + "logps/chosen": -53.059120178222656, + "logps/rejected": -53.059120178222656, + "loss": 0.5259, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7076714038848877, + "rewards/margins": 0.0, + "rewards/rejected": 2.7076714038848877, + "step": 4244 + }, + { + "epoch": 0.69, + "learning_rate": 7.623018344096489e-06, + "logits/chosen": -1.3096288442611694, + "logits/rejected": -1.3630414009094238, + "logps/chosen": -73.56137084960938, + "logps/rejected": -119.96450805664062, + "loss": 0.8094, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.750163555145264, + "rewards/margins": -1.390411376953125, + "rewards/rejected": 7.140574932098389, + "step": 4245 + }, + { + "epoch": 0.69, + "learning_rate": 7.621899367893466e-06, + "logits/chosen": -1.2772197723388672, + "logits/rejected": -1.2071572542190552, + "logps/chosen": -61.430389404296875, + "logps/rejected": -14.873356819152832, + "loss": 0.0488, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.9457011222839355, + "rewards/margins": 4.160367965698242, + "rewards/rejected": 0.7853333353996277, + "step": 4246 + }, + { + "epoch": 0.69, + "learning_rate": 7.6207802105420045e-06, + "logits/chosen": -1.0303691625595093, + "logits/rejected": -0.9269859194755554, + "logps/chosen": -72.0726089477539, + "logps/rejected": -66.38545989990234, + "loss": 0.5095, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5122780799865723, + "rewards/margins": -0.3672003746032715, + "rewards/rejected": 2.8794784545898438, + "step": 4247 + }, + { + "epoch": 0.69, + "learning_rate": 7.6196608721194295e-06, + "logits/chosen": -1.3662042617797852, + "logits/rejected": -1.3804471492767334, + "logps/chosen": -71.76875305175781, + "logps/rejected": -91.72737884521484, + "loss": 0.3602, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.691827297210693, + "rewards/margins": -0.046451568603515625, + "rewards/rejected": 4.738278865814209, + "step": 4248 + }, + { + "epoch": 0.69, + "learning_rate": 7.618541352703077e-06, + "logits/chosen": -1.1038392782211304, + "logits/rejected": -1.0969361066818237, + "logps/chosen": -69.20516967773438, + "logps/rejected": -67.18844604492188, + "loss": 0.4837, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3660943508148193, + "rewards/margins": -0.24291372299194336, + "rewards/rejected": 3.6090080738067627, + "step": 4249 + }, + { + "epoch": 0.69, + "learning_rate": 7.617421652370293e-06, + "logits/chosen": -0.8809769749641418, + "logits/rejected": -0.9143452644348145, + "logps/chosen": -56.64450454711914, + "logps/rejected": -95.54108428955078, + "loss": 1.2479, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2711819410324097, + "rewards/margins": -2.3867244720458984, + "rewards/rejected": 3.6579062938690186, + "step": 4250 + }, + { + "epoch": 0.69, + "learning_rate": 7.616301771198438e-06, + "logits/chosen": -1.0843425989151, + "logits/rejected": -0.8751185536384583, + "logps/chosen": -61.22642517089844, + "logps/rejected": -153.63270568847656, + "loss": 0.668, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8744447231292725, + "rewards/margins": -0.5592391490936279, + "rewards/rejected": 3.4336838722229004, + "step": 4251 + }, + { + "epoch": 0.69, + "learning_rate": 7.6151817092648875e-06, + "logits/chosen": -0.6133414506912231, + "logits/rejected": -0.6133414506912231, + "logps/chosen": -59.8100471496582, + "logps/rejected": -59.8100471496582, + "loss": 0.8844, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7020763158798218, + "rewards/margins": 0.0, + "rewards/rejected": 1.7020763158798218, + "step": 4252 + }, + { + "epoch": 0.69, + "learning_rate": 7.614061466647026e-06, + "logits/chosen": -1.0434820652008057, + "logits/rejected": -1.0557875633239746, + "logps/chosen": -45.48969650268555, + "logps/rejected": -51.915626525878906, + "loss": 0.6971, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8869190216064453, + "rewards/margins": 0.5129672288894653, + "rewards/rejected": 1.37395179271698, + "step": 4253 + }, + { + "epoch": 0.69, + "learning_rate": 7.6129410434222505e-06, + "logits/chosen": -1.0395948886871338, + "logits/rejected": -0.9823881983757019, + "logps/chosen": -40.099266052246094, + "logps/rejected": -47.917510986328125, + "loss": 0.329, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7158958911895752, + "rewards/margins": 0.07629239559173584, + "rewards/rejected": 1.6396034955978394, + "step": 4254 + }, + { + "epoch": 0.69, + "learning_rate": 7.611820439667974e-06, + "logits/chosen": -0.811971127986908, + "logits/rejected": -0.826930820941925, + "logps/chosen": -59.99303436279297, + "logps/rejected": -42.55042266845703, + "loss": 0.5339, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4483085870742798, + "rewards/margins": -0.6396740674972534, + "rewards/rejected": 2.087982654571533, + "step": 4255 + }, + { + "epoch": 0.69, + "learning_rate": 7.610699655461618e-06, + "logits/chosen": -0.9582726955413818, + "logits/rejected": -1.02973210811615, + "logps/chosen": -61.64506912231445, + "logps/rejected": -86.72835540771484, + "loss": 1.3106, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9487224817276, + "rewards/margins": -2.524508476257324, + "rewards/rejected": 4.473230838775635, + "step": 4256 + }, + { + "epoch": 0.69, + "learning_rate": 7.609578690880619e-06, + "logits/chosen": -0.8628509640693665, + "logits/rejected": -0.93921959400177, + "logps/chosen": -31.861385345458984, + "logps/rejected": -52.72511291503906, + "loss": 2.3131, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7209007740020752, + "rewards/margins": -3.282113790512085, + "rewards/rejected": 5.00301456451416, + "step": 4257 + }, + { + "epoch": 0.69, + "learning_rate": 7.608457546002423e-06, + "logits/chosen": -0.7612985372543335, + "logits/rejected": -0.7626156806945801, + "logps/chosen": -85.74434661865234, + "logps/rejected": -87.47709655761719, + "loss": 0.3121, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.280668020248413, + "rewards/margins": 0.20611119270324707, + "rewards/rejected": 2.074556827545166, + "step": 4258 + }, + { + "epoch": 0.69, + "learning_rate": 7.607336220904493e-06, + "logits/chosen": -0.6753203272819519, + "logits/rejected": -0.6794503927230835, + "logps/chosen": -3.951781749725342, + "logps/rejected": -0.5518850684165955, + "loss": 0.6221, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1757655143737793, + "rewards/margins": -0.05689159035682678, + "rewards/rejected": 0.23265710473060608, + "step": 4259 + }, + { + "epoch": 0.69, + "learning_rate": 7.606214715664302e-06, + "logits/chosen": -0.6930784583091736, + "logits/rejected": -0.6930784583091736, + "logps/chosen": -18.452592849731445, + "logps/rejected": -18.452592849731445, + "loss": 0.3683, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.05127773433923721, + "rewards/margins": 0.0, + "rewards/rejected": -0.05127773433923721, + "step": 4260 + }, + { + "epoch": 0.69, + "learning_rate": 7.605093030359333e-06, + "logits/chosen": -1.0047270059585571, + "logits/rejected": -0.8215038180351257, + "logps/chosen": -94.3132553100586, + "logps/rejected": -15.501548767089844, + "loss": 0.0798, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.502779483795166, + "rewards/margins": 2.2394304275512695, + "rewards/rejected": 0.2633489668369293, + "step": 4261 + }, + { + "epoch": 0.69, + "learning_rate": 7.603971165067086e-06, + "logits/chosen": -1.2681605815887451, + "logits/rejected": -1.167485237121582, + "logps/chosen": -181.46551513671875, + "logps/rejected": -8.870511054992676, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.19191312789917, + "rewards/margins": 6.172832489013672, + "rewards/rejected": 1.019080638885498, + "step": 4262 + }, + { + "epoch": 0.69, + "learning_rate": 7.60284911986507e-06, + "logits/chosen": -0.9777287244796753, + "logits/rejected": -0.9516900777816772, + "logps/chosen": -194.30587768554688, + "logps/rejected": -107.73258209228516, + "loss": 1.6348, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7220458984375, + "rewards/margins": -3.2215371131896973, + "rewards/rejected": 5.943583011627197, + "step": 4263 + }, + { + "epoch": 0.69, + "learning_rate": 7.601726894830808e-06, + "logits/chosen": -0.8859713077545166, + "logits/rejected": -0.9525075554847717, + "logps/chosen": -96.52149963378906, + "logps/rejected": -85.48489379882812, + "loss": 0.2641, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3860650062561035, + "rewards/margins": 0.36440277099609375, + "rewards/rejected": 2.0216622352600098, + "step": 4264 + }, + { + "epoch": 0.69, + "learning_rate": 7.6006044900418355e-06, + "logits/chosen": -1.3124759197235107, + "logits/rejected": -1.3274991512298584, + "logps/chosen": -84.38421630859375, + "logps/rejected": -82.24771881103516, + "loss": 0.34, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.297308325767517, + "rewards/margins": 0.38399118185043335, + "rewards/rejected": 0.9133171439170837, + "step": 4265 + }, + { + "epoch": 0.69, + "learning_rate": 7.599481905575699e-06, + "logits/chosen": -1.1777842044830322, + "logits/rejected": -1.0765049457550049, + "logps/chosen": -66.1854248046875, + "logps/rejected": -35.194976806640625, + "loss": 0.9154, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.086337566375732, + "rewards/margins": 0.05462837219238281, + "rewards/rejected": 4.03170919418335, + "step": 4266 + }, + { + "epoch": 0.69, + "learning_rate": 7.598359141509961e-06, + "logits/chosen": -1.288350224494934, + "logits/rejected": -1.2569984197616577, + "logps/chosen": -76.07906341552734, + "logps/rejected": -88.20510864257812, + "loss": 0.9936, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9538825750350952, + "rewards/margins": -1.774224877357483, + "rewards/rejected": 3.728107452392578, + "step": 4267 + }, + { + "epoch": 0.69, + "learning_rate": 7.597236197922191e-06, + "logits/chosen": -0.9054418206214905, + "logits/rejected": -0.8766204714775085, + "logps/chosen": -20.480192184448242, + "logps/rejected": -6.6649041175842285, + "loss": 1.0512, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1329762935638428, + "rewards/margins": 1.4294507503509521, + "rewards/rejected": 0.7035254836082458, + "step": 4268 + }, + { + "epoch": 0.69, + "learning_rate": 7.596113074889976e-06, + "logits/chosen": -0.7140533924102783, + "logits/rejected": -0.7682337760925293, + "logps/chosen": -14.651447296142578, + "logps/rejected": -27.582962036132812, + "loss": 1.3874, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8669161200523376, + "rewards/margins": -0.1920667290687561, + "rewards/rejected": 1.0589828491210938, + "step": 4269 + }, + { + "epoch": 0.69, + "learning_rate": 7.594989772490911e-06, + "logits/chosen": -1.0767717361450195, + "logits/rejected": -0.8995426893234253, + "logps/chosen": -59.66118621826172, + "logps/rejected": -30.54408836364746, + "loss": 0.1153, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.587613821029663, + "rewards/margins": 1.4227004051208496, + "rewards/rejected": 2.1649134159088135, + "step": 4270 + }, + { + "epoch": 0.69, + "learning_rate": 7.593866290802608e-06, + "logits/chosen": -1.1148039102554321, + "logits/rejected": -1.151801586151123, + "logps/chosen": -168.1429443359375, + "logps/rejected": -73.73624420166016, + "loss": 1.398, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.163890361785889, + "rewards/margins": 0.46395063400268555, + "rewards/rejected": 5.699939727783203, + "step": 4271 + }, + { + "epoch": 0.69, + "learning_rate": 7.592742629902688e-06, + "logits/chosen": -0.8795556426048279, + "logits/rejected": -0.8797140717506409, + "logps/chosen": -41.96864318847656, + "logps/rejected": -49.60868453979492, + "loss": 1.3035, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3993237018585205, + "rewards/margins": 0.2864716053009033, + "rewards/rejected": 1.1128520965576172, + "step": 4272 + }, + { + "epoch": 0.69, + "learning_rate": 7.5916187898687845e-06, + "logits/chosen": -1.148630142211914, + "logits/rejected": -0.920193612575531, + "logps/chosen": -60.889671325683594, + "logps/rejected": -13.839957237243652, + "loss": 2.3714, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.121784210205078, + "rewards/margins": 2.3412959575653076, + "rewards/rejected": 0.7804883122444153, + "step": 4273 + }, + { + "epoch": 0.69, + "learning_rate": 7.5904947707785434e-06, + "logits/chosen": -0.9023545384407043, + "logits/rejected": -0.9023545384407043, + "logps/chosen": -32.345462799072266, + "logps/rejected": -32.345462799072266, + "loss": 0.3469, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0869014263153076, + "rewards/margins": 0.0, + "rewards/rejected": 2.0869014263153076, + "step": 4274 + }, + { + "epoch": 0.69, + "learning_rate": 7.5893705727096265e-06, + "logits/chosen": -1.0557304620742798, + "logits/rejected": -1.1930972337722778, + "logps/chosen": -133.47467041015625, + "logps/rejected": -144.8691864013672, + "loss": 1.4574, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8676834106445312, + "rewards/margins": -2.622854709625244, + "rewards/rejected": 5.490538120269775, + "step": 4275 + }, + { + "epoch": 0.69, + "learning_rate": 7.588246195739703e-06, + "logits/chosen": -1.284062385559082, + "logits/rejected": -1.2657465934753418, + "logps/chosen": -42.398746490478516, + "logps/rejected": -75.63304138183594, + "loss": 0.2591, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.288003921508789, + "rewards/margins": 0.5627200603485107, + "rewards/rejected": 2.7252838611602783, + "step": 4276 + }, + { + "epoch": 0.69, + "learning_rate": 7.5871216399464585e-06, + "logits/chosen": -0.8193684816360474, + "logits/rejected": -0.6684210300445557, + "logps/chosen": -40.34541702270508, + "logps/rejected": -17.64181137084961, + "loss": 0.2589, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.273811101913452, + "rewards/margins": 3.473066568374634, + "rewards/rejected": -0.19925557076931, + "step": 4277 + }, + { + "epoch": 0.69, + "learning_rate": 7.585996905407586e-06, + "logits/chosen": -0.9127013087272644, + "logits/rejected": -0.8096671104431152, + "logps/chosen": -133.6650848388672, + "logps/rejected": -67.8989028930664, + "loss": 1.2245, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7686173915863037, + "rewards/margins": 1.4337610006332397, + "rewards/rejected": 1.334856390953064, + "step": 4278 + }, + { + "epoch": 0.69, + "learning_rate": 7.584871992200799e-06, + "logits/chosen": -0.8887621164321899, + "logits/rejected": -0.8383712768554688, + "logps/chosen": -70.0302734375, + "logps/rejected": -72.14086151123047, + "loss": 0.5629, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4786971807479858, + "rewards/margins": -0.6928833723068237, + "rewards/rejected": 2.1715805530548096, + "step": 4279 + }, + { + "epoch": 0.69, + "learning_rate": 7.583746900403815e-06, + "logits/chosen": -1.4179060459136963, + "logits/rejected": -1.4678239822387695, + "logps/chosen": -248.6177978515625, + "logps/rejected": -139.71902465820312, + "loss": 0.7002, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.389895915985107, + "rewards/margins": 1.5063843727111816, + "rewards/rejected": 5.883511543273926, + "step": 4280 + }, + { + "epoch": 0.69, + "learning_rate": 7.582621630094368e-06, + "logits/chosen": -1.1572664976119995, + "logits/rejected": -0.8849239945411682, + "logps/chosen": -166.41075134277344, + "logps/rejected": -110.34334564208984, + "loss": 1.6423, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.836317539215088, + "rewards/margins": 3.2935996055603027, + "rewards/rejected": 4.542717933654785, + "step": 4281 + }, + { + "epoch": 0.7, + "learning_rate": 7.581496181350203e-06, + "logits/chosen": -0.7872434258460999, + "logits/rejected": -0.9135515093803406, + "logps/chosen": -20.84357452392578, + "logps/rejected": -56.354801177978516, + "loss": 2.05, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6992267966270447, + "rewards/margins": -3.8575453758239746, + "rewards/rejected": 4.556772232055664, + "step": 4282 + }, + { + "epoch": 0.7, + "learning_rate": 7.580370554249077e-06, + "logits/chosen": -1.147071123123169, + "logits/rejected": -1.1354385614395142, + "logps/chosen": -112.96050262451172, + "logps/rejected": -51.9600830078125, + "loss": 0.2725, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1200387477874756, + "rewards/margins": 0.6362999677658081, + "rewards/rejected": 1.4837387800216675, + "step": 4283 + }, + { + "epoch": 0.7, + "learning_rate": 7.579244748868764e-06, + "logits/chosen": -1.1873888969421387, + "logits/rejected": -1.1105170249938965, + "logps/chosen": -92.46896362304688, + "logps/rejected": -71.20855712890625, + "loss": 0.1722, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.325402021408081, + "rewards/margins": 1.0869964361190796, + "rewards/rejected": 1.2384055852890015, + "step": 4284 + }, + { + "epoch": 0.7, + "learning_rate": 7.578118765287041e-06, + "logits/chosen": -1.3771127462387085, + "logits/rejected": -1.3723431825637817, + "logps/chosen": -67.51094818115234, + "logps/rejected": -51.863380432128906, + "loss": 0.0455, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2689201831817627, + "rewards/margins": 2.6169865131378174, + "rewards/rejected": -0.3480663299560547, + "step": 4285 + }, + { + "epoch": 0.7, + "learning_rate": 7.5769926035817075e-06, + "logits/chosen": -0.8567197322845459, + "logits/rejected": -0.8639463782310486, + "logps/chosen": -101.61589050292969, + "logps/rejected": -136.25872802734375, + "loss": 0.4213, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9711227416992188, + "rewards/margins": 0.8790113925933838, + "rewards/rejected": 2.092111349105835, + "step": 4286 + }, + { + "epoch": 0.7, + "learning_rate": 7.575866263830569e-06, + "logits/chosen": -1.0072025060653687, + "logits/rejected": -0.9396783113479614, + "logps/chosen": -87.14283752441406, + "logps/rejected": -55.591224670410156, + "loss": 2.6828, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.159689426422119, + "rewards/margins": -0.3680410385131836, + "rewards/rejected": 4.527730464935303, + "step": 4287 + }, + { + "epoch": 0.7, + "learning_rate": 7.574739746111444e-06, + "logits/chosen": -1.0129884481430054, + "logits/rejected": -1.0749374628067017, + "logps/chosen": -126.85323333740234, + "logps/rejected": -121.36790466308594, + "loss": 0.1148, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.436559200286865, + "rewards/margins": 1.602628231048584, + "rewards/rejected": 5.833930969238281, + "step": 4288 + }, + { + "epoch": 0.7, + "learning_rate": 7.5736130505021655e-06, + "logits/chosen": -1.165415644645691, + "logits/rejected": -0.9108714461326599, + "logps/chosen": -100.17007446289062, + "logps/rejected": -61.428794860839844, + "loss": 0.2536, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.283067226409912, + "rewards/margins": 0.46079087257385254, + "rewards/rejected": 3.8222763538360596, + "step": 4289 + }, + { + "epoch": 0.7, + "learning_rate": 7.572486177080576e-06, + "logits/chosen": -1.1445144414901733, + "logits/rejected": -1.1006841659545898, + "logps/chosen": -62.65723419189453, + "logps/rejected": -66.93029022216797, + "loss": 0.3914, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.674048662185669, + "rewards/margins": 0.07395637035369873, + "rewards/rejected": 1.6000922918319702, + "step": 4290 + }, + { + "epoch": 0.7, + "learning_rate": 7.571359125924533e-06, + "logits/chosen": -1.272143006324768, + "logits/rejected": -1.2962700128555298, + "logps/chosen": -136.83584594726562, + "logps/rejected": -100.74386596679688, + "loss": 1.3503, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.130856513977051, + "rewards/margins": -0.8886137008666992, + "rewards/rejected": 8.01947021484375, + "step": 4291 + }, + { + "epoch": 0.7, + "learning_rate": 7.570231897111907e-06, + "logits/chosen": -1.146157145500183, + "logits/rejected": -0.9002304673194885, + "logps/chosen": -66.09381866455078, + "logps/rejected": -14.48731803894043, + "loss": 0.054, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.970265865325928, + "rewards/margins": 3.140293598175049, + "rewards/rejected": 1.829972267150879, + "step": 4292 + }, + { + "epoch": 0.7, + "learning_rate": 7.569104490720574e-06, + "logits/chosen": -1.204386591911316, + "logits/rejected": -0.9062519669532776, + "logps/chosen": -161.28857421875, + "logps/rejected": -72.53156280517578, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.750988960266113, + "rewards/margins": 5.033638000488281, + "rewards/rejected": 4.717350959777832, + "step": 4293 + }, + { + "epoch": 0.7, + "learning_rate": 7.567976906828431e-06, + "logits/chosen": -0.9074811935424805, + "logits/rejected": -0.8953065276145935, + "logps/chosen": -74.30831909179688, + "logps/rejected": -64.74978637695312, + "loss": 0.7014, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6764869689941406, + "rewards/margins": 0.022022247314453125, + "rewards/rejected": 1.6544647216796875, + "step": 4294 + }, + { + "epoch": 0.7, + "learning_rate": 7.566849145513382e-06, + "logits/chosen": -1.0434379577636719, + "logits/rejected": -1.0234456062316895, + "logps/chosen": -119.09750366210938, + "logps/rejected": -59.9731559753418, + "loss": 0.8641, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.843833923339844, + "rewards/margins": 0.2788872718811035, + "rewards/rejected": 5.56494665145874, + "step": 4295 + }, + { + "epoch": 0.7, + "learning_rate": 7.5657212068533445e-06, + "logits/chosen": -1.3598414659500122, + "logits/rejected": -1.2251535654067993, + "logps/chosen": -127.83734130859375, + "logps/rejected": -29.02825164794922, + "loss": 0.1387, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.942410469055176, + "rewards/margins": 2.8543312549591064, + "rewards/rejected": 3.0880792140960693, + "step": 4296 + }, + { + "epoch": 0.7, + "learning_rate": 7.564593090926249e-06, + "logits/chosen": -1.2543569803237915, + "logits/rejected": -1.358251929283142, + "logps/chosen": -78.03800964355469, + "logps/rejected": -148.92489624023438, + "loss": 2.5184, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.056391954421997, + "rewards/margins": -3.937025308609009, + "rewards/rejected": 4.993417263031006, + "step": 4297 + }, + { + "epoch": 0.7, + "learning_rate": 7.563464797810038e-06, + "logits/chosen": -0.6330429911613464, + "logits/rejected": -0.722891628742218, + "logps/chosen": -74.41688537597656, + "logps/rejected": -98.69691467285156, + "loss": 1.9148, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0230660438537598, + "rewards/margins": -3.7360787391662598, + "rewards/rejected": 5.7591447830200195, + "step": 4298 + }, + { + "epoch": 0.7, + "learning_rate": 7.562336327582664e-06, + "logits/chosen": -1.4436579942703247, + "logits/rejected": -1.4375675916671753, + "logps/chosen": -158.72610473632812, + "logps/rejected": -55.55780792236328, + "loss": 0.8168, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.903241157531738, + "rewards/margins": 3.986283302307129, + "rewards/rejected": 2.9169578552246094, + "step": 4299 + }, + { + "epoch": 0.7, + "learning_rate": 7.561207680322096e-06, + "logits/chosen": -0.6954014301300049, + "logits/rejected": -0.6946279406547546, + "logps/chosen": -51.313053131103516, + "logps/rejected": -76.43153381347656, + "loss": 0.8606, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.11285438388586044, + "rewards/margins": -1.2944058179855347, + "rewards/rejected": 1.4072601795196533, + "step": 4300 + }, + { + "epoch": 0.7, + "learning_rate": 7.56007885610631e-06, + "logits/chosen": -1.1195971965789795, + "logits/rejected": -1.0505502223968506, + "logps/chosen": -88.38134002685547, + "logps/rejected": -33.47477722167969, + "loss": 0.3902, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.5721259117126465, + "rewards/margins": 2.8383777141571045, + "rewards/rejected": 2.733748197555542, + "step": 4301 + }, + { + "epoch": 0.7, + "learning_rate": 7.5589498550133e-06, + "logits/chosen": -1.0476391315460205, + "logits/rejected": -0.9604671597480774, + "logps/chosen": -82.53703308105469, + "logps/rejected": -71.03772735595703, + "loss": 0.4527, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.409773349761963, + "rewards/margins": 2.192671775817871, + "rewards/rejected": 4.217101573944092, + "step": 4302 + }, + { + "epoch": 0.7, + "learning_rate": 7.5578206771210675e-06, + "logits/chosen": -1.0041719675064087, + "logits/rejected": -0.9805108904838562, + "logps/chosen": -54.749019622802734, + "logps/rejected": -66.45294189453125, + "loss": 0.5132, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0591305494308472, + "rewards/margins": -0.5027561187744141, + "rewards/rejected": 1.5618866682052612, + "step": 4303 + }, + { + "epoch": 0.7, + "learning_rate": 7.556691322507627e-06, + "logits/chosen": -1.3599073886871338, + "logits/rejected": -1.358681321144104, + "logps/chosen": -67.09310913085938, + "logps/rejected": -74.1680908203125, + "loss": 2.8032, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7714080810546875, + "rewards/margins": -1.4965128898620605, + "rewards/rejected": 4.267920970916748, + "step": 4304 + }, + { + "epoch": 0.7, + "learning_rate": 7.555561791251011e-06, + "logits/chosen": -0.796884298324585, + "logits/rejected": -0.8366526365280151, + "logps/chosen": -49.713043212890625, + "logps/rejected": -69.27850341796875, + "loss": 0.2462, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.887933373451233, + "rewards/margins": 0.6125396490097046, + "rewards/rejected": 1.2753937244415283, + "step": 4305 + }, + { + "epoch": 0.7, + "learning_rate": 7.554432083429253e-06, + "logits/chosen": -0.8632510900497437, + "logits/rejected": -0.9631519913673401, + "logps/chosen": -94.04849243164062, + "logps/rejected": -124.88727569580078, + "loss": 0.7409, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.8658905029296875, + "rewards/margins": -0.8139228820800781, + "rewards/rejected": 4.679813385009766, + "step": 4306 + }, + { + "epoch": 0.7, + "learning_rate": 7.553302199120409e-06, + "logits/chosen": -1.2191436290740967, + "logits/rejected": -1.1864053010940552, + "logps/chosen": -42.00611114501953, + "logps/rejected": -45.07206726074219, + "loss": 0.1435, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.495058536529541, + "rewards/margins": 1.1131397485733032, + "rewards/rejected": 1.3819187879562378, + "step": 4307 + }, + { + "epoch": 0.7, + "learning_rate": 7.552172138402545e-06, + "logits/chosen": -1.0214534997940063, + "logits/rejected": -0.9450980424880981, + "logps/chosen": -66.9538345336914, + "logps/rejected": -39.62617492675781, + "loss": 0.3577, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5947701930999756, + "rewards/margins": 0.1926441192626953, + "rewards/rejected": 2.4021260738372803, + "step": 4308 + }, + { + "epoch": 0.7, + "learning_rate": 7.5510419013537325e-06, + "logits/chosen": -0.9466806054115295, + "logits/rejected": -0.9466806054115295, + "logps/chosen": -52.717796325683594, + "logps/rejected": -52.717796325683594, + "loss": 1.2773, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5420150756835938, + "rewards/margins": 0.0, + "rewards/rejected": 1.5420150756835938, + "step": 4309 + }, + { + "epoch": 0.7, + "learning_rate": 7.549911488052064e-06, + "logits/chosen": -0.9773104190826416, + "logits/rejected": -0.9764395356178284, + "logps/chosen": -59.550575256347656, + "logps/rejected": -160.1633758544922, + "loss": 1.4112, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.351314544677734, + "rewards/margins": -1.9950156211853027, + "rewards/rejected": 6.346330165863037, + "step": 4310 + }, + { + "epoch": 0.7, + "learning_rate": 7.54878089857564e-06, + "logits/chosen": -1.020459771156311, + "logits/rejected": -1.0490683317184448, + "logps/chosen": -76.80360412597656, + "logps/rejected": -159.38201904296875, + "loss": 0.8172, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.089665174484253, + "rewards/margins": 0.2900512218475342, + "rewards/rejected": 1.7996139526367188, + "step": 4311 + }, + { + "epoch": 0.7, + "learning_rate": 7.547650133002571e-06, + "logits/chosen": -0.9742486476898193, + "logits/rejected": -0.8837748169898987, + "logps/chosen": -55.15801239013672, + "logps/rejected": -53.33345031738281, + "loss": 0.4522, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2864692211151123, + "rewards/margins": 1.4469795227050781, + "rewards/rejected": 0.839489758014679, + "step": 4312 + }, + { + "epoch": 0.7, + "learning_rate": 7.546519191410985e-06, + "logits/chosen": -0.8683869242668152, + "logits/rejected": -0.8683869242668152, + "logps/chosen": -66.28463745117188, + "logps/rejected": -66.28463745117188, + "loss": 1.49, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.575648546218872, + "rewards/margins": 0.0, + "rewards/rejected": 3.575648546218872, + "step": 4313 + }, + { + "epoch": 0.7, + "learning_rate": 7.545388073879018e-06, + "logits/chosen": -1.2356783151626587, + "logits/rejected": -1.1957019567489624, + "logps/chosen": -77.13127136230469, + "logps/rejected": -73.31950378417969, + "loss": 1.3916, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9192100763320923, + "rewards/margins": -0.7720619440078735, + "rewards/rejected": 2.691272020339966, + "step": 4314 + }, + { + "epoch": 0.7, + "learning_rate": 7.54425678048482e-06, + "logits/chosen": -0.9998934864997864, + "logits/rejected": -1.0923742055892944, + "logps/chosen": -43.33342742919922, + "logps/rejected": -102.49418640136719, + "loss": 2.0446, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.132404327392578, + "rewards/margins": -4.054001808166504, + "rewards/rejected": 6.186406135559082, + "step": 4315 + }, + { + "epoch": 0.7, + "learning_rate": 7.543125311306552e-06, + "logits/chosen": -1.1241744756698608, + "logits/rejected": -1.0652587413787842, + "logps/chosen": -43.69550323486328, + "logps/rejected": -32.10185241699219, + "loss": 0.4417, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2430877685546875, + "rewards/margins": -0.2491910457611084, + "rewards/rejected": 2.492278814315796, + "step": 4316 + }, + { + "epoch": 0.7, + "learning_rate": 7.541993666422388e-06, + "logits/chosen": -1.160063624382019, + "logits/rejected": -0.97536700963974, + "logps/chosen": -119.45051574707031, + "logps/rejected": -11.922662734985352, + "loss": 1.0187, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.641027927398682, + "rewards/margins": 4.984818458557129, + "rewards/rejected": 0.656209409236908, + "step": 4317 + }, + { + "epoch": 0.7, + "learning_rate": 7.540861845910514e-06, + "logits/chosen": -1.036710500717163, + "logits/rejected": -1.0720094442367554, + "logps/chosen": -180.71307373046875, + "logps/rejected": -71.39418029785156, + "loss": 1.2715, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.353668212890625, + "rewards/margins": 2.072202205657959, + "rewards/rejected": 2.281466007232666, + "step": 4318 + }, + { + "epoch": 0.7, + "learning_rate": 7.539729849849129e-06, + "logits/chosen": -1.146490454673767, + "logits/rejected": -1.0773216485977173, + "logps/chosen": -58.870967864990234, + "logps/rejected": -64.40467071533203, + "loss": 0.5146, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.390503406524658, + "rewards/margins": -0.5359225273132324, + "rewards/rejected": 2.9264259338378906, + "step": 4319 + }, + { + "epoch": 0.7, + "learning_rate": 7.5385976783164426e-06, + "logits/chosen": -1.1953099966049194, + "logits/rejected": -1.2020272016525269, + "logps/chosen": -152.71414184570312, + "logps/rejected": -106.40826416015625, + "loss": 0.24, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.380453586578369, + "rewards/margins": 0.5611252784729004, + "rewards/rejected": 1.8193283081054688, + "step": 4320 + }, + { + "epoch": 0.7, + "learning_rate": 7.537465331390676e-06, + "logits/chosen": -1.0476173162460327, + "logits/rejected": -1.1657123565673828, + "logps/chosen": -99.6001205444336, + "logps/rejected": -74.05563354492188, + "loss": 4.1322, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5995262265205383, + "rewards/margins": -2.7731094360351562, + "rewards/rejected": 3.37263560295105, + "step": 4321 + }, + { + "epoch": 0.7, + "learning_rate": 7.536332809150066e-06, + "logits/chosen": -0.6766168475151062, + "logits/rejected": -0.6837208867073059, + "logps/chosen": -5.62360954284668, + "logps/rejected": -1.9308135509490967, + "loss": 1.0947, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3436656892299652, + "rewards/margins": -0.026389628648757935, + "rewards/rejected": 0.37005531787872314, + "step": 4322 + }, + { + "epoch": 0.7, + "learning_rate": 7.535200111672858e-06, + "logits/chosen": -0.9997658729553223, + "logits/rejected": -1.0668212175369263, + "logps/chosen": -84.24156188964844, + "logps/rejected": -91.36192321777344, + "loss": 0.7791, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.821589708328247, + "rewards/margins": -0.4534316062927246, + "rewards/rejected": 3.2750213146209717, + "step": 4323 + }, + { + "epoch": 0.7, + "learning_rate": 7.534067239037311e-06, + "logits/chosen": -0.9168442487716675, + "logits/rejected": -0.8804215788841248, + "logps/chosen": -61.008148193359375, + "logps/rejected": -66.2655258178711, + "loss": 1.1884, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2966537475585938, + "rewards/margins": -0.3687340021133423, + "rewards/rejected": 1.665387749671936, + "step": 4324 + }, + { + "epoch": 0.7, + "learning_rate": 7.532934191321693e-06, + "logits/chosen": -0.6973574757575989, + "logits/rejected": -0.6973574757575989, + "logps/chosen": -63.255821228027344, + "logps/rejected": -63.255821228027344, + "loss": 0.6382, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.623329997062683, + "rewards/margins": 0.0, + "rewards/rejected": 1.623329997062683, + "step": 4325 + }, + { + "epoch": 0.7, + "learning_rate": 7.531800968604292e-06, + "logits/chosen": -1.2935222387313843, + "logits/rejected": -1.1801023483276367, + "logps/chosen": -111.49609375, + "logps/rejected": -89.97470092773438, + "loss": 0.0862, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.9204912185668945, + "rewards/margins": 1.6741859912872314, + "rewards/rejected": 3.246305227279663, + "step": 4326 + }, + { + "epoch": 0.7, + "learning_rate": 7.530667570963398e-06, + "logits/chosen": -0.9933344125747681, + "logits/rejected": -1.0323760509490967, + "logps/chosen": -70.40750885009766, + "logps/rejected": -44.15047073364258, + "loss": 0.7898, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.778913140296936, + "rewards/margins": 0.05427360534667969, + "rewards/rejected": 1.7246395349502563, + "step": 4327 + }, + { + "epoch": 0.7, + "learning_rate": 7.529533998477321e-06, + "logits/chosen": -0.8644538521766663, + "logits/rejected": -0.7373619675636292, + "logps/chosen": -73.29768371582031, + "logps/rejected": -49.7869873046875, + "loss": 0.4488, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3635414838790894, + "rewards/margins": 0.053618669509887695, + "rewards/rejected": 1.3099228143692017, + "step": 4328 + }, + { + "epoch": 0.7, + "learning_rate": 7.528400251224379e-06, + "logits/chosen": -1.0310200452804565, + "logits/rejected": -0.9983519315719604, + "logps/chosen": -99.12026977539062, + "logps/rejected": -123.96039581298828, + "loss": 1.1743, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.739202976226807, + "rewards/margins": -0.6061882972717285, + "rewards/rejected": 6.345391273498535, + "step": 4329 + }, + { + "epoch": 0.7, + "learning_rate": 7.527266329282905e-06, + "logits/chosen": -1.039849877357483, + "logits/rejected": -0.9828201532363892, + "logps/chosen": -71.51799011230469, + "logps/rejected": -38.427852630615234, + "loss": 1.1766, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.725914716720581, + "rewards/margins": 0.5324809551239014, + "rewards/rejected": 2.1934337615966797, + "step": 4330 + }, + { + "epoch": 0.7, + "learning_rate": 7.52613223273124e-06, + "logits/chosen": -1.2520416975021362, + "logits/rejected": -1.272314190864563, + "logps/chosen": -53.891845703125, + "logps/rejected": -77.86474609375, + "loss": 0.8586, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1714065074920654, + "rewards/margins": 0.9831817150115967, + "rewards/rejected": 2.1882247924804688, + "step": 4331 + }, + { + "epoch": 0.7, + "learning_rate": 7.52499796164774e-06, + "logits/chosen": -0.7988656163215637, + "logits/rejected": -0.7578577995300293, + "logps/chosen": -108.87078857421875, + "logps/rejected": -70.62641906738281, + "loss": 1.2881, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6675659418106079, + "rewards/margins": -0.36833345890045166, + "rewards/rejected": 1.0358994007110596, + "step": 4332 + }, + { + "epoch": 0.7, + "learning_rate": 7.523863516110772e-06, + "logits/chosen": -0.8817204236984253, + "logits/rejected": -0.8807383179664612, + "logps/chosen": -5.787557601928711, + "logps/rejected": -2.707082748413086, + "loss": 1.022, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.09680294990539551, + "rewards/margins": -0.09502001106739044, + "rewards/rejected": 0.19182296097278595, + "step": 4333 + }, + { + "epoch": 0.7, + "learning_rate": 7.522728896198718e-06, + "logits/chosen": -0.916253924369812, + "logits/rejected": -1.2738760709762573, + "logps/chosen": -64.82794189453125, + "logps/rejected": -33.08189392089844, + "loss": 0.0302, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.069686412811279, + "rewards/margins": 3.589813232421875, + "rewards/rejected": 0.47987326979637146, + "step": 4334 + }, + { + "epoch": 0.7, + "learning_rate": 7.521594101989966e-06, + "logits/chosen": -0.6819652915000916, + "logits/rejected": -0.4687979221343994, + "logps/chosen": -55.54700469970703, + "logps/rejected": -4.090599536895752, + "loss": 0.2467, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9383186101913452, + "rewards/margins": 1.1538043022155762, + "rewards/rejected": 0.7845143675804138, + "step": 4335 + }, + { + "epoch": 0.7, + "learning_rate": 7.5204591335629204e-06, + "logits/chosen": -1.3729121685028076, + "logits/rejected": -1.4108238220214844, + "logps/chosen": -213.71324157714844, + "logps/rejected": -123.41000366210938, + "loss": 0.4566, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.503355503082275, + "rewards/margins": -0.37949228286743164, + "rewards/rejected": 5.882847785949707, + "step": 4336 + }, + { + "epoch": 0.7, + "learning_rate": 7.519323990995999e-06, + "logits/chosen": -1.0021759271621704, + "logits/rejected": -1.0041433572769165, + "logps/chosen": -114.21736907958984, + "logps/rejected": -108.29883575439453, + "loss": 0.0896, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.077542781829834, + "rewards/margins": 1.8837385177612305, + "rewards/rejected": 5.1938042640686035, + "step": 4337 + }, + { + "epoch": 0.7, + "learning_rate": 7.518188674367628e-06, + "logits/chosen": -1.006274700164795, + "logits/rejected": -1.011078953742981, + "logps/chosen": -75.00926208496094, + "logps/rejected": -93.70760345458984, + "loss": 2.1351, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7507805824279785, + "rewards/margins": -4.115959167480469, + "rewards/rejected": 6.866739749908447, + "step": 4338 + }, + { + "epoch": 0.7, + "learning_rate": 7.517053183756247e-06, + "logits/chosen": -1.317368745803833, + "logits/rejected": -1.2615758180618286, + "logps/chosen": -66.30294036865234, + "logps/rejected": -90.13151550292969, + "loss": 2.5852, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.087810516357422, + "rewards/margins": -1.8334832191467285, + "rewards/rejected": 4.92129373550415, + "step": 4339 + }, + { + "epoch": 0.7, + "learning_rate": 7.515917519240306e-06, + "logits/chosen": -0.7136962413787842, + "logits/rejected": -0.7136962413787842, + "logps/chosen": -40.936439514160156, + "logps/rejected": -40.936439514160156, + "loss": 0.3472, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8301418423652649, + "rewards/margins": 0.0, + "rewards/rejected": 0.8301418423652649, + "step": 4340 + }, + { + "epoch": 0.7, + "learning_rate": 7.514781680898271e-06, + "logits/chosen": -1.1638497114181519, + "logits/rejected": -1.0138211250305176, + "logps/chosen": -75.1812744140625, + "logps/rejected": -40.93430709838867, + "loss": 0.0236, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.1138176918029785, + "rewards/margins": 3.160093307495117, + "rewards/rejected": 1.9537242650985718, + "step": 4341 + }, + { + "epoch": 0.7, + "learning_rate": 7.513645668808615e-06, + "logits/chosen": -0.6833085417747498, + "logits/rejected": -0.6278165578842163, + "logps/chosen": -52.70808029174805, + "logps/rejected": -36.452232360839844, + "loss": 0.5762, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5763225555419922, + "rewards/margins": 0.598598837852478, + "rewards/rejected": 0.9777237176895142, + "step": 4342 + }, + { + "epoch": 0.7, + "learning_rate": 7.51250948304983e-06, + "logits/chosen": -0.7945319414138794, + "logits/rejected": -0.8115755915641785, + "logps/chosen": -52.037109375, + "logps/rejected": -167.93106079101562, + "loss": 0.4421, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7260162830352783, + "rewards/margins": 0.4958587884902954, + "rewards/rejected": 1.230157494544983, + "step": 4343 + }, + { + "epoch": 0.71, + "learning_rate": 7.511373123700414e-06, + "logits/chosen": -1.226375699043274, + "logits/rejected": -1.216871976852417, + "logps/chosen": -146.00152587890625, + "logps/rejected": -140.42367553710938, + "loss": 0.1114, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.768877983093262, + "rewards/margins": 1.4519681930541992, + "rewards/rejected": 7.3169097900390625, + "step": 4344 + }, + { + "epoch": 0.71, + "learning_rate": 7.510236590838877e-06, + "logits/chosen": -1.1472970247268677, + "logits/rejected": -1.1472970247268677, + "logps/chosen": -45.53657913208008, + "logps/rejected": -45.53657913208008, + "loss": 0.4365, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8792858123779297, + "rewards/margins": 0.0, + "rewards/rejected": 2.8792858123779297, + "step": 4345 + }, + { + "epoch": 0.71, + "learning_rate": 7.509099884543745e-06, + "logits/chosen": -0.6879247426986694, + "logits/rejected": -0.6938320398330688, + "logps/chosen": -33.35381317138672, + "logps/rejected": -39.10996627807617, + "loss": 1.4406, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7299445867538452, + "rewards/margins": -0.5296097993850708, + "rewards/rejected": 2.259554386138916, + "step": 4346 + }, + { + "epoch": 0.71, + "learning_rate": 7.50796300489355e-06, + "logits/chosen": -0.962445080280304, + "logits/rejected": -0.9674817323684692, + "logps/chosen": -52.348941802978516, + "logps/rejected": -110.44123840332031, + "loss": 0.4036, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.110067367553711, + "rewards/margins": 1.1560816764831543, + "rewards/rejected": 0.9539856314659119, + "step": 4347 + }, + { + "epoch": 0.71, + "learning_rate": 7.506825951966843e-06, + "logits/chosen": -1.2817456722259521, + "logits/rejected": -1.2392165660858154, + "logps/chosen": -92.99349975585938, + "logps/rejected": -77.05239868164062, + "loss": 0.3817, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4725539684295654, + "rewards/margins": -0.12874698638916016, + "rewards/rejected": 2.6013009548187256, + "step": 4348 + }, + { + "epoch": 0.71, + "learning_rate": 7.5056887258421825e-06, + "logits/chosen": -0.9239291548728943, + "logits/rejected": -1.0087202787399292, + "logps/chosen": -85.64436340332031, + "logps/rejected": -149.71279907226562, + "loss": 3.1713, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.555987596511841, + "rewards/margins": -2.5415236949920654, + "rewards/rejected": 5.097511291503906, + "step": 4349 + }, + { + "epoch": 0.71, + "learning_rate": 7.50455132659814e-06, + "logits/chosen": -1.2675000429153442, + "logits/rejected": -1.2675000429153442, + "logps/chosen": -27.147289276123047, + "logps/rejected": -27.147289276123047, + "loss": 0.9442, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.070385694503784, + "rewards/margins": 0.0, + "rewards/rejected": 3.070385694503784, + "step": 4350 + }, + { + "epoch": 0.71, + "learning_rate": 7.5034137543133e-06, + "logits/chosen": -1.0613045692443848, + "logits/rejected": -1.0613045692443848, + "logps/chosen": -42.45954513549805, + "logps/rejected": -42.45954513549805, + "loss": 0.4582, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6417806148529053, + "rewards/margins": 0.0, + "rewards/rejected": 3.6417806148529053, + "step": 4351 + }, + { + "epoch": 0.71, + "learning_rate": 7.5022760090662565e-06, + "logits/chosen": -1.049929141998291, + "logits/rejected": -1.0924575328826904, + "logps/chosen": -56.973541259765625, + "logps/rejected": -52.6580696105957, + "loss": 0.684, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3018875122070312, + "rewards/margins": -0.9938549995422363, + "rewards/rejected": 3.2957425117492676, + "step": 4352 + }, + { + "epoch": 0.71, + "learning_rate": 7.501138090935616e-06, + "logits/chosen": -0.746794581413269, + "logits/rejected": -0.7304494380950928, + "logps/chosen": -72.00994873046875, + "logps/rejected": -74.20530700683594, + "loss": 0.7307, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3946106433868408, + "rewards/margins": 0.17817234992980957, + "rewards/rejected": 1.2164382934570312, + "step": 4353 + }, + { + "epoch": 0.71, + "learning_rate": 7.500000000000001e-06, + "logits/chosen": -1.1885020732879639, + "logits/rejected": -1.1246466636657715, + "logps/chosen": -90.50872039794922, + "logps/rejected": -105.09318542480469, + "loss": 1.0641, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2719650268554688, + "rewards/margins": -1.714167833328247, + "rewards/rejected": 3.986132860183716, + "step": 4354 + }, + { + "epoch": 0.71, + "learning_rate": 7.498861736338041e-06, + "logits/chosen": -0.9397258758544922, + "logits/rejected": -1.0310789346694946, + "logps/chosen": -116.78264617919922, + "logps/rejected": -112.60258483886719, + "loss": 0.7864, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1746652126312256, + "rewards/margins": 1.7332139015197754, + "rewards/rejected": 1.4414513111114502, + "step": 4355 + }, + { + "epoch": 0.71, + "learning_rate": 7.497723300028379e-06, + "logits/chosen": -1.2422531843185425, + "logits/rejected": -1.3715277910232544, + "logps/chosen": -30.512996673583984, + "logps/rejected": -131.71600341796875, + "loss": 1.7766, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.149972915649414, + "rewards/margins": -3.437110424041748, + "rewards/rejected": 5.587083339691162, + "step": 4356 + }, + { + "epoch": 0.71, + "learning_rate": 7.496584691149671e-06, + "logits/chosen": -1.2742254734039307, + "logits/rejected": -1.334430456161499, + "logps/chosen": -68.21485900878906, + "logps/rejected": -72.66372680664062, + "loss": 1.6105, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4977662563323975, + "rewards/margins": -0.4059615135192871, + "rewards/rejected": 2.9037277698516846, + "step": 4357 + }, + { + "epoch": 0.71, + "learning_rate": 7.495445909780584e-06, + "logits/chosen": -1.5931501388549805, + "logits/rejected": -1.5414416790008545, + "logps/chosen": -78.83479309082031, + "logps/rejected": -8.998199462890625, + "loss": 1.555, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.957605004310608, + "rewards/margins": 1.4522682428359985, + "rewards/rejected": 0.5053367614746094, + "step": 4358 + }, + { + "epoch": 0.71, + "learning_rate": 7.494306955999797e-06, + "logits/chosen": -0.944549560546875, + "logits/rejected": -0.736821711063385, + "logps/chosen": -126.29822540283203, + "logps/rejected": -13.289185523986816, + "loss": 0.8835, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.7237114906311035, + "rewards/margins": 4.753987789154053, + "rewards/rejected": 0.9697238206863403, + "step": 4359 + }, + { + "epoch": 0.71, + "learning_rate": 7.493167829886e-06, + "logits/chosen": -1.1203701496124268, + "logits/rejected": -0.7347626686096191, + "logps/chosen": -178.63247680664062, + "logps/rejected": -39.64065933227539, + "loss": 0.9541, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.059140205383301, + "rewards/margins": 4.523872375488281, + "rewards/rejected": 1.5352680683135986, + "step": 4360 + }, + { + "epoch": 0.71, + "learning_rate": 7.492028531517896e-06, + "logits/chosen": -1.3500266075134277, + "logits/rejected": -1.2902063131332397, + "logps/chosen": -182.3839111328125, + "logps/rejected": -32.322059631347656, + "loss": 0.2181, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.110887289047241, + "rewards/margins": 2.213465929031372, + "rewards/rejected": 0.8974213004112244, + "step": 4361 + }, + { + "epoch": 0.71, + "learning_rate": 7.490889060974202e-06, + "logits/chosen": -1.1041433811187744, + "logits/rejected": -1.1712238788604736, + "logps/chosen": -81.70849609375, + "logps/rejected": -87.46601867675781, + "loss": 1.5484, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.320260763168335, + "rewards/margins": -2.736537218093872, + "rewards/rejected": 5.056797981262207, + "step": 4362 + }, + { + "epoch": 0.71, + "learning_rate": 7.489749418333642e-06, + "logits/chosen": -0.7459812760353088, + "logits/rejected": -0.7005475163459778, + "logps/chosen": -13.553583145141602, + "logps/rejected": -5.738304138183594, + "loss": 0.2978, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9007986783981323, + "rewards/margins": 0.8593387603759766, + "rewards/rejected": 1.0414599180221558, + "step": 4363 + }, + { + "epoch": 0.71, + "learning_rate": 7.488609603674955e-06, + "logits/chosen": -1.4660850763320923, + "logits/rejected": -1.416758418083191, + "logps/chosen": -124.2698974609375, + "logps/rejected": -80.9090576171875, + "loss": 1.1851, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.195384502410889, + "rewards/margins": 1.6161766052246094, + "rewards/rejected": 4.579207897186279, + "step": 4364 + }, + { + "epoch": 0.71, + "learning_rate": 7.487469617076892e-06, + "logits/chosen": -0.9499734044075012, + "logits/rejected": -0.8990762829780579, + "logps/chosen": -51.275550842285156, + "logps/rejected": -65.05320739746094, + "loss": 0.458, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.329049825668335, + "rewards/margins": -0.16135096549987793, + "rewards/rejected": 2.490400791168213, + "step": 4365 + }, + { + "epoch": 0.71, + "learning_rate": 7.486329458618215e-06, + "logits/chosen": -1.034049153327942, + "logits/rejected": -1.034049153327942, + "logps/chosen": -25.909526824951172, + "logps/rejected": -25.909526824951172, + "loss": 0.9758, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9228405356407166, + "rewards/margins": 0.0, + "rewards/rejected": 0.9228405356407166, + "step": 4366 + }, + { + "epoch": 0.71, + "learning_rate": 7.485189128377699e-06, + "logits/chosen": -1.3454220294952393, + "logits/rejected": -1.0756388902664185, + "logps/chosen": -93.23773956298828, + "logps/rejected": -78.11032104492188, + "loss": 0.0889, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.68474817276001, + "rewards/margins": 2.917438507080078, + "rewards/rejected": 2.7673096656799316, + "step": 4367 + }, + { + "epoch": 0.71, + "learning_rate": 7.484048626434128e-06, + "logits/chosen": -1.0947133302688599, + "logits/rejected": -1.1086863279342651, + "logps/chosen": -72.82918548583984, + "logps/rejected": -66.0357437133789, + "loss": 0.1763, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.582972764968872, + "rewards/margins": 0.9196174740791321, + "rewards/rejected": 0.66335529088974, + "step": 4368 + }, + { + "epoch": 0.71, + "learning_rate": 7.482907952866303e-06, + "logits/chosen": -0.789871096611023, + "logits/rejected": -0.8760403394699097, + "logps/chosen": -108.24176025390625, + "logps/rejected": -151.56813049316406, + "loss": 1.8435, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6591957211494446, + "rewards/margins": -3.269674062728882, + "rewards/rejected": 3.9288697242736816, + "step": 4369 + }, + { + "epoch": 0.71, + "learning_rate": 7.4817671077530295e-06, + "logits/chosen": -1.291049838066101, + "logits/rejected": -1.097570776939392, + "logps/chosen": -82.30169677734375, + "logps/rejected": -37.547645568847656, + "loss": 0.6121, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.529745578765869, + "rewards/margins": 3.483684539794922, + "rewards/rejected": 2.0460610389709473, + "step": 4370 + }, + { + "epoch": 0.71, + "learning_rate": 7.480626091173133e-06, + "logits/chosen": -1.366727590560913, + "logits/rejected": -1.339495062828064, + "logps/chosen": -66.8976058959961, + "logps/rejected": -87.16552734375, + "loss": 0.333, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.499110460281372, + "rewards/margins": 1.6158233880996704, + "rewards/rejected": 0.8832870721817017, + "step": 4371 + }, + { + "epoch": 0.71, + "learning_rate": 7.479484903205445e-06, + "logits/chosen": -1.1418991088867188, + "logits/rejected": -1.1367757320404053, + "logps/chosen": -40.712677001953125, + "logps/rejected": -44.15676498413086, + "loss": 0.3709, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.375380754470825, + "rewards/margins": 0.2243342399597168, + "rewards/rejected": 2.1510465145111084, + "step": 4372 + }, + { + "epoch": 0.71, + "learning_rate": 7.478343543928812e-06, + "logits/chosen": -1.2510069608688354, + "logits/rejected": -0.9548189043998718, + "logps/chosen": -118.96870422363281, + "logps/rejected": -91.27824401855469, + "loss": 0.7405, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9260239601135254, + "rewards/margins": 0.34203648567199707, + "rewards/rejected": 2.5839874744415283, + "step": 4373 + }, + { + "epoch": 0.71, + "learning_rate": 7.47720201342209e-06, + "logits/chosen": -0.7724891901016235, + "logits/rejected": -0.7724891901016235, + "logps/chosen": -38.03974914550781, + "logps/rejected": -38.03974914550781, + "loss": 0.6348, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4361991882324219, + "rewards/margins": 0.0, + "rewards/rejected": 1.4361991882324219, + "step": 4374 + }, + { + "epoch": 0.71, + "learning_rate": 7.476060311764149e-06, + "logits/chosen": -0.910786509513855, + "logits/rejected": -0.8116174340248108, + "logps/chosen": -86.76029968261719, + "logps/rejected": -133.88812255859375, + "loss": 1.2931, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.904262065887451, + "rewards/margins": -0.4615797996520996, + "rewards/rejected": 5.365841865539551, + "step": 4375 + }, + { + "epoch": 0.71, + "learning_rate": 7.474918439033869e-06, + "logits/chosen": -0.9501950144767761, + "logits/rejected": -0.9206207990646362, + "logps/chosen": -38.911537170410156, + "logps/rejected": -38.570369720458984, + "loss": 0.2542, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5940635204315186, + "rewards/margins": 1.2420992851257324, + "rewards/rejected": 1.3519642353057861, + "step": 4376 + }, + { + "epoch": 0.71, + "learning_rate": 7.473776395310143e-06, + "logits/chosen": -1.0548053979873657, + "logits/rejected": -0.9102526307106018, + "logps/chosen": -109.62649536132812, + "logps/rejected": -22.605480194091797, + "loss": 0.3809, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3658676147460938, + "rewards/margins": 2.030787944793701, + "rewards/rejected": 0.33507975935935974, + "step": 4377 + }, + { + "epoch": 0.71, + "learning_rate": 7.4726341806718735e-06, + "logits/chosen": -1.1533085107803345, + "logits/rejected": -1.2357863187789917, + "logps/chosen": -106.77814483642578, + "logps/rejected": -101.57960510253906, + "loss": 2.0027, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.420814037322998, + "rewards/margins": -3.896054744720459, + "rewards/rejected": 8.316868782043457, + "step": 4378 + }, + { + "epoch": 0.71, + "learning_rate": 7.471491795197981e-06, + "logits/chosen": -1.1760200262069702, + "logits/rejected": -0.9801954030990601, + "logps/chosen": -100.18559265136719, + "logps/rejected": -48.15098571777344, + "loss": 0.0183, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.229142665863037, + "rewards/margins": 4.1936445236206055, + "rewards/rejected": 2.0354981422424316, + "step": 4379 + }, + { + "epoch": 0.71, + "learning_rate": 7.470349238967389e-06, + "logits/chosen": -1.4130370616912842, + "logits/rejected": -1.3638972043991089, + "logps/chosen": -71.53501892089844, + "logps/rejected": -103.17044830322266, + "loss": 1.0342, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.630422115325928, + "rewards/margins": -1.6438250541687012, + "rewards/rejected": 6.274247169494629, + "step": 4380 + }, + { + "epoch": 0.71, + "learning_rate": 7.469206512059039e-06, + "logits/chosen": -0.7414693236351013, + "logits/rejected": -0.7414693236351013, + "logps/chosen": -25.187284469604492, + "logps/rejected": -25.187284469604492, + "loss": 0.4177, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3924200534820557, + "rewards/margins": 0.0, + "rewards/rejected": 2.3924200534820557, + "step": 4381 + }, + { + "epoch": 0.71, + "learning_rate": 7.4680636145518835e-06, + "logits/chosen": -1.3602538108825684, + "logits/rejected": -1.356733798980713, + "logps/chosen": -164.65380859375, + "logps/rejected": -113.397216796875, + "loss": 0.2228, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.687597751617432, + "rewards/margins": 0.6145505905151367, + "rewards/rejected": 6.073047161102295, + "step": 4382 + }, + { + "epoch": 0.71, + "learning_rate": 7.466920546524886e-06, + "logits/chosen": -1.0653307437896729, + "logits/rejected": -1.0653307437896729, + "logps/chosen": -84.30426025390625, + "logps/rejected": -84.30426025390625, + "loss": 0.3476, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.936901807785034, + "rewards/margins": 0.0, + "rewards/rejected": 2.936901807785034, + "step": 4383 + }, + { + "epoch": 0.71, + "learning_rate": 7.4657773080570206e-06, + "logits/chosen": -0.976041853427887, + "logits/rejected": -0.9509998559951782, + "logps/chosen": -90.5816421508789, + "logps/rejected": -204.40643310546875, + "loss": 0.4951, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.379091739654541, + "rewards/margins": 0.5792839527130127, + "rewards/rejected": 1.7998077869415283, + "step": 4384 + }, + { + "epoch": 0.71, + "learning_rate": 7.464633899227274e-06, + "logits/chosen": -1.1083790063858032, + "logits/rejected": -1.1217070817947388, + "logps/chosen": -87.73712921142578, + "logps/rejected": -221.19882202148438, + "loss": 4.4009, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0588997602462769, + "rewards/margins": -8.740148544311523, + "rewards/rejected": 9.79904842376709, + "step": 4385 + }, + { + "epoch": 0.71, + "learning_rate": 7.463490320114646e-06, + "logits/chosen": -0.8258460760116577, + "logits/rejected": -0.7866429686546326, + "logps/chosen": -73.01236724853516, + "logps/rejected": -101.90457153320312, + "loss": 1.0909, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2619141340255737, + "rewards/margins": 0.9595261216163635, + "rewards/rejected": 0.3023880124092102, + "step": 4386 + }, + { + "epoch": 0.71, + "learning_rate": 7.462346570798147e-06, + "logits/chosen": -1.2398533821105957, + "logits/rejected": -1.2398533821105957, + "logps/chosen": -42.96892547607422, + "logps/rejected": -42.96892547607422, + "loss": 0.3714, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4062244892120361, + "rewards/margins": 0.0, + "rewards/rejected": 1.4062244892120361, + "step": 4387 + }, + { + "epoch": 0.71, + "learning_rate": 7.4612026513567985e-06, + "logits/chosen": -0.9462363123893738, + "logits/rejected": -0.916653573513031, + "logps/chosen": -55.31908416748047, + "logps/rejected": -37.563499450683594, + "loss": 0.4392, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1159675121307373, + "rewards/margins": 1.4184679985046387, + "rewards/rejected": 0.6974994540214539, + "step": 4388 + }, + { + "epoch": 0.71, + "learning_rate": 7.460058561869634e-06, + "logits/chosen": -1.0792652368545532, + "logits/rejected": -0.8368945717811584, + "logps/chosen": -158.34654235839844, + "logps/rejected": -27.54556655883789, + "loss": 0.0694, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.012373447418213, + "rewards/margins": 2.3505892753601074, + "rewards/rejected": 1.661784052848816, + "step": 4389 + }, + { + "epoch": 0.71, + "learning_rate": 7.458914302415702e-06, + "logits/chosen": -0.9540476202964783, + "logits/rejected": -0.9112969636917114, + "logps/chosen": -45.97007369995117, + "logps/rejected": -43.174407958984375, + "loss": 0.3576, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3894306421279907, + "rewards/margins": -0.027923941612243652, + "rewards/rejected": 1.4173545837402344, + "step": 4390 + }, + { + "epoch": 0.71, + "learning_rate": 7.457769873074056e-06, + "logits/chosen": -0.9444832801818848, + "logits/rejected": -0.948334276676178, + "logps/chosen": -7.868496894836426, + "logps/rejected": -3.120981454849243, + "loss": 0.4614, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4135182499885559, + "rewards/margins": -0.07110956311225891, + "rewards/rejected": 0.4846278131008148, + "step": 4391 + }, + { + "epoch": 0.71, + "learning_rate": 7.45662527392377e-06, + "logits/chosen": -1.24658203125, + "logits/rejected": -1.257201075553894, + "logps/chosen": -101.8095932006836, + "logps/rejected": -80.85356140136719, + "loss": 0.4888, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.637953996658325, + "rewards/margins": 0.5295860767364502, + "rewards/rejected": 2.108367919921875, + "step": 4392 + }, + { + "epoch": 0.71, + "learning_rate": 7.45548050504392e-06, + "logits/chosen": -0.903388261795044, + "logits/rejected": -0.903388261795044, + "logps/chosen": -76.66096496582031, + "logps/rejected": -76.66096496582031, + "loss": 0.3522, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3998031616210938, + "rewards/margins": 0.0, + "rewards/rejected": 1.3998031616210938, + "step": 4393 + }, + { + "epoch": 0.71, + "learning_rate": 7.454335566513603e-06, + "logits/chosen": -1.1847211122512817, + "logits/rejected": -1.1504665613174438, + "logps/chosen": -95.82862854003906, + "logps/rejected": -96.08686828613281, + "loss": 0.1705, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.975918769836426, + "rewards/margins": 1.4048752784729004, + "rewards/rejected": 3.5710434913635254, + "step": 4394 + }, + { + "epoch": 0.71, + "learning_rate": 7.4531904584119206e-06, + "logits/chosen": -1.0885889530181885, + "logits/rejected": -1.1206943988800049, + "logps/chosen": -63.908592224121094, + "logps/rejected": -98.49345397949219, + "loss": 1.1036, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.136312961578369, + "rewards/margins": -0.40835022926330566, + "rewards/rejected": 2.544663190841675, + "step": 4395 + }, + { + "epoch": 0.71, + "learning_rate": 7.45204518081799e-06, + "logits/chosen": -1.0347906351089478, + "logits/rejected": -0.9523099660873413, + "logps/chosen": -94.71529388427734, + "logps/rejected": -77.08209228515625, + "loss": 0.637, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.516310930252075, + "rewards/margins": -0.805335283279419, + "rewards/rejected": 3.321646213531494, + "step": 4396 + }, + { + "epoch": 0.71, + "learning_rate": 7.450899733810938e-06, + "logits/chosen": -1.3021690845489502, + "logits/rejected": -1.3534977436065674, + "logps/chosen": -164.16732788085938, + "logps/rejected": -60.17243194580078, + "loss": 0.3012, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.731869697570801, + "rewards/margins": 3.3899035453796387, + "rewards/rejected": 1.3419662714004517, + "step": 4397 + }, + { + "epoch": 0.71, + "learning_rate": 7.449754117469905e-06, + "logits/chosen": -1.2314550876617432, + "logits/rejected": -1.2200692892074585, + "logps/chosen": -88.37156677246094, + "logps/rejected": -48.96368408203125, + "loss": 0.3678, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.119387149810791, + "rewards/margins": 0.26879191398620605, + "rewards/rejected": 2.850595235824585, + "step": 4398 + }, + { + "epoch": 0.71, + "learning_rate": 7.448608331874043e-06, + "logits/chosen": -0.9126417636871338, + "logits/rejected": -0.8405904173851013, + "logps/chosen": -79.85218811035156, + "logps/rejected": -27.300241470336914, + "loss": 0.8599, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4881012439727783, + "rewards/margins": 2.953382968902588, + "rewards/rejected": -0.4652816951274872, + "step": 4399 + }, + { + "epoch": 0.71, + "learning_rate": 7.447462377102514e-06, + "logits/chosen": -1.3991949558258057, + "logits/rejected": -1.4515354633331299, + "logps/chosen": -201.32339477539062, + "logps/rejected": -95.70648956298828, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.878933906555176, + "rewards/margins": 4.778066635131836, + "rewards/rejected": 2.100867509841919, + "step": 4400 + }, + { + "epoch": 0.71, + "learning_rate": 7.446316253234493e-06, + "logits/chosen": -0.870635449886322, + "logits/rejected": -0.8818665146827698, + "logps/chosen": -50.299217224121094, + "logps/rejected": -65.5950698852539, + "loss": 0.5072, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5402908325195312, + "rewards/margins": -0.4422203302383423, + "rewards/rejected": 1.9825111627578735, + "step": 4401 + }, + { + "epoch": 0.71, + "learning_rate": 7.445169960349167e-06, + "logits/chosen": -0.81022709608078, + "logits/rejected": -0.8051672577857971, + "logps/chosen": -18.889446258544922, + "logps/rejected": -15.022187232971191, + "loss": 1.4094, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6607689261436462, + "rewards/margins": -0.1027684211730957, + "rewards/rejected": 0.7635373473167419, + "step": 4402 + }, + { + "epoch": 0.71, + "learning_rate": 7.444023498525732e-06, + "logits/chosen": -0.8197357058525085, + "logits/rejected": -0.6516880393028259, + "logps/chosen": -164.95059204101562, + "logps/rejected": -75.07818603515625, + "loss": 0.09, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.176300048828125, + "rewards/margins": 1.8130004405975342, + "rewards/rejected": 1.3632996082305908, + "step": 4403 + }, + { + "epoch": 0.71, + "learning_rate": 7.4428768678433996e-06, + "logits/chosen": -1.0967828035354614, + "logits/rejected": -1.1046675443649292, + "logps/chosen": -54.17605209350586, + "logps/rejected": -43.11838150024414, + "loss": 0.4369, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3881962299346924, + "rewards/margins": -0.3285987377166748, + "rewards/rejected": 2.716794967651367, + "step": 4404 + }, + { + "epoch": 0.71, + "learning_rate": 7.44173006838139e-06, + "logits/chosen": -0.9260168075561523, + "logits/rejected": -0.9402567148208618, + "logps/chosen": -23.739248275756836, + "logps/rejected": -15.14570426940918, + "loss": 1.8434, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.22388458251953125, + "rewards/margins": -0.4546886682510376, + "rewards/rejected": 0.6785732507705688, + "step": 4405 + }, + { + "epoch": 0.72, + "learning_rate": 7.4405831002189365e-06, + "logits/chosen": -1.265971302986145, + "logits/rejected": -1.113900065422058, + "logps/chosen": -99.68710327148438, + "logps/rejected": -114.29052734375, + "loss": 0.0947, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.64552640914917, + "rewards/margins": 2.017643928527832, + "rewards/rejected": 5.627882480621338, + "step": 4406 + }, + { + "epoch": 0.72, + "learning_rate": 7.439435963435284e-06, + "logits/chosen": -1.0658738613128662, + "logits/rejected": -1.101183295249939, + "logps/chosen": -52.82847595214844, + "logps/rejected": -52.77239990234375, + "loss": 1.2903, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1929237842559814, + "rewards/margins": -1.7130622863769531, + "rewards/rejected": 3.9059860706329346, + "step": 4407 + }, + { + "epoch": 0.72, + "learning_rate": 7.438288658109688e-06, + "logits/chosen": -0.7903972268104553, + "logits/rejected": -0.7903972268104553, + "logps/chosen": -36.85448455810547, + "logps/rejected": -36.85448455810547, + "loss": 1.1066, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.099612832069397, + "rewards/margins": 0.0, + "rewards/rejected": 1.099612832069397, + "step": 4408 + }, + { + "epoch": 0.72, + "learning_rate": 7.437141184321417e-06, + "logits/chosen": -1.1861546039581299, + "logits/rejected": -1.1650534868240356, + "logps/chosen": -33.48025894165039, + "logps/rejected": -20.724353790283203, + "loss": 0.3657, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.230184555053711, + "rewards/margins": -0.04607653617858887, + "rewards/rejected": 2.2762610912323, + "step": 4409 + }, + { + "epoch": 0.72, + "learning_rate": 7.435993542149751e-06, + "logits/chosen": -0.9435105323791504, + "logits/rejected": -0.8868604302406311, + "logps/chosen": -61.59873580932617, + "logps/rejected": -56.82658386230469, + "loss": 0.3275, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.603851795196533, + "rewards/margins": 2.057806968688965, + "rewards/rejected": 1.546044945716858, + "step": 4410 + }, + { + "epoch": 0.72, + "learning_rate": 7.434845731673981e-06, + "logits/chosen": -1.0422923564910889, + "logits/rejected": -0.568600058555603, + "logps/chosen": -45.10345458984375, + "logps/rejected": -97.69978332519531, + "loss": 0.2323, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9007339477539062, + "rewards/margins": 0.5275360345840454, + "rewards/rejected": 1.3731979131698608, + "step": 4411 + }, + { + "epoch": 0.72, + "learning_rate": 7.4336977529734085e-06, + "logits/chosen": -1.6092373132705688, + "logits/rejected": -1.543769359588623, + "logps/chosen": -104.18267822265625, + "logps/rejected": -108.66222381591797, + "loss": 0.8502, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2619316577911377, + "rewards/margins": 0.9127670526504517, + "rewards/rejected": 1.349164605140686, + "step": 4412 + }, + { + "epoch": 0.72, + "learning_rate": 7.432549606127351e-06, + "logits/chosen": -0.8618365526199341, + "logits/rejected": -0.8618365526199341, + "logps/chosen": -106.88652038574219, + "logps/rejected": -106.88652038574219, + "loss": 0.4141, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2955513000488281, + "rewards/margins": 0.0, + "rewards/rejected": 1.2955513000488281, + "step": 4413 + }, + { + "epoch": 0.72, + "learning_rate": 7.431401291215131e-06, + "logits/chosen": -1.118774175643921, + "logits/rejected": -1.084647297859192, + "logps/chosen": -105.1588363647461, + "logps/rejected": -28.81822395324707, + "loss": 0.1688, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2556374073028564, + "rewards/margins": 0.9548352956771851, + "rewards/rejected": 1.3008021116256714, + "step": 4414 + }, + { + "epoch": 0.72, + "learning_rate": 7.4302528083160896e-06, + "logits/chosen": -1.2914594411849976, + "logits/rejected": -1.2538808584213257, + "logps/chosen": -131.56463623046875, + "logps/rejected": -164.85281372070312, + "loss": 1.2368, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.344174385070801, + "rewards/margins": -2.0675048828125, + "rewards/rejected": 7.411679267883301, + "step": 4415 + }, + { + "epoch": 0.72, + "learning_rate": 7.429104157509574e-06, + "logits/chosen": -0.8428117632865906, + "logits/rejected": -0.8102978467941284, + "logps/chosen": -93.89989471435547, + "logps/rejected": -63.299041748046875, + "loss": 0.7294, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1168259382247925, + "rewards/margins": -1.191785454750061, + "rewards/rejected": 2.3086113929748535, + "step": 4416 + }, + { + "epoch": 0.72, + "learning_rate": 7.427955338874944e-06, + "logits/chosen": -0.9512845277786255, + "logits/rejected": -0.8379102945327759, + "logps/chosen": -67.49949645996094, + "logps/rejected": -61.99828338623047, + "loss": 0.2966, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.918145775794983, + "rewards/margins": 0.2686774730682373, + "rewards/rejected": 1.6494683027267456, + "step": 4417 + }, + { + "epoch": 0.72, + "learning_rate": 7.426806352491575e-06, + "logits/chosen": -1.4249744415283203, + "logits/rejected": -1.545108675956726, + "logps/chosen": -71.6881103515625, + "logps/rejected": -196.01539611816406, + "loss": 3.999, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.252887010574341, + "rewards/margins": -7.907658576965332, + "rewards/rejected": 10.160545349121094, + "step": 4418 + }, + { + "epoch": 0.72, + "learning_rate": 7.425657198438849e-06, + "logits/chosen": -0.9436843991279602, + "logits/rejected": -1.070920705795288, + "logps/chosen": -44.85126495361328, + "logps/rejected": -130.96749877929688, + "loss": 3.2469, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7038230895996094, + "rewards/margins": -3.169867992401123, + "rewards/rejected": 5.873691082000732, + "step": 4419 + }, + { + "epoch": 0.72, + "learning_rate": 7.4245078767961635e-06, + "logits/chosen": -1.0278561115264893, + "logits/rejected": -1.067889928817749, + "logps/chosen": -86.38056945800781, + "logps/rejected": -98.51136779785156, + "loss": 1.9785, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7513649463653564, + "rewards/margins": -0.682950496673584, + "rewards/rejected": 2.4343154430389404, + "step": 4420 + }, + { + "epoch": 0.72, + "learning_rate": 7.4233583876429224e-06, + "logits/chosen": -1.1336441040039062, + "logits/rejected": -1.060206651687622, + "logps/chosen": -79.38351440429688, + "logps/rejected": -24.175939559936523, + "loss": 0.7312, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5293426513671875, + "rewards/margins": 1.6600526571273804, + "rewards/rejected": 0.8692899942398071, + "step": 4421 + }, + { + "epoch": 0.72, + "learning_rate": 7.422208731058549e-06, + "logits/chosen": -1.0015345811843872, + "logits/rejected": -1.0041372776031494, + "logps/chosen": -12.754716873168945, + "logps/rejected": -2.077526330947876, + "loss": 2.1031, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1574636548757553, + "rewards/margins": -0.1710178107023239, + "rewards/rejected": 0.3284814655780792, + "step": 4422 + }, + { + "epoch": 0.72, + "learning_rate": 7.42105890712247e-06, + "logits/chosen": -0.9870411157608032, + "logits/rejected": -1.0300949811935425, + "logps/chosen": -57.77312469482422, + "logps/rejected": -55.218505859375, + "loss": 0.272, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5097107887268066, + "rewards/margins": 0.7122772932052612, + "rewards/rejected": 1.7974334955215454, + "step": 4423 + }, + { + "epoch": 0.72, + "learning_rate": 7.41990891591413e-06, + "logits/chosen": -1.0971356630325317, + "logits/rejected": -1.0253888368606567, + "logps/chosen": -59.17234420776367, + "logps/rejected": -90.24043273925781, + "loss": 0.8182, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.263573169708252, + "rewards/margins": -1.3987607955932617, + "rewards/rejected": 4.662333965301514, + "step": 4424 + }, + { + "epoch": 0.72, + "learning_rate": 7.41875875751298e-06, + "logits/chosen": -1.4857707023620605, + "logits/rejected": -1.6032787561416626, + "logps/chosen": -192.60284423828125, + "logps/rejected": -151.4193115234375, + "loss": 0.5276, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.364505290985107, + "rewards/margins": -0.5017180442810059, + "rewards/rejected": 7.866223335266113, + "step": 4425 + }, + { + "epoch": 0.72, + "learning_rate": 7.417608431998487e-06, + "logits/chosen": -1.001507043838501, + "logits/rejected": -1.0209028720855713, + "logps/chosen": -107.8370132446289, + "logps/rejected": -96.43272399902344, + "loss": 0.1584, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2754065990448, + "rewards/margins": 2.6050949096679688, + "rewards/rejected": 0.6703117489814758, + "step": 4426 + }, + { + "epoch": 0.72, + "learning_rate": 7.416457939450128e-06, + "logits/chosen": -1.3615742921829224, + "logits/rejected": -1.3076435327529907, + "logps/chosen": -68.40469360351562, + "logps/rejected": -15.119983673095703, + "loss": 0.1685, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9287452697753906, + "rewards/margins": 3.050398826599121, + "rewards/rejected": 0.8783464431762695, + "step": 4427 + }, + { + "epoch": 0.72, + "learning_rate": 7.415307279947389e-06, + "logits/chosen": -1.0740233659744263, + "logits/rejected": -0.9826996922492981, + "logps/chosen": -56.58575439453125, + "logps/rejected": -54.628440856933594, + "loss": 0.9442, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0844742059707642, + "rewards/margins": 0.36283302307128906, + "rewards/rejected": 0.7216411828994751, + "step": 4428 + }, + { + "epoch": 0.72, + "learning_rate": 7.414156453569771e-06, + "logits/chosen": -1.2299810647964478, + "logits/rejected": -1.2034778594970703, + "logps/chosen": -60.65924835205078, + "logps/rejected": -31.146873474121094, + "loss": 0.0774, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.0564751625061035, + "rewards/margins": 2.3449513912200928, + "rewards/rejected": 2.7115237712860107, + "step": 4429 + }, + { + "epoch": 0.72, + "learning_rate": 7.413005460396785e-06, + "logits/chosen": -1.1111605167388916, + "logits/rejected": -0.9882960319519043, + "logps/chosen": -106.11201477050781, + "logps/rejected": -60.31264114379883, + "loss": 0.373, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2010772228240967, + "rewards/margins": 0.6792467832565308, + "rewards/rejected": 1.521830439567566, + "step": 4430 + }, + { + "epoch": 0.72, + "learning_rate": 7.411854300507954e-06, + "logits/chosen": -0.9299795627593994, + "logits/rejected": -0.8455745577812195, + "logps/chosen": -59.3704833984375, + "logps/rejected": -53.07160186767578, + "loss": 0.4104, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7266663312911987, + "rewards/margins": -0.19127726554870605, + "rewards/rejected": 1.9179435968399048, + "step": 4431 + }, + { + "epoch": 0.72, + "learning_rate": 7.41070297398281e-06, + "logits/chosen": -1.382452130317688, + "logits/rejected": -1.0495723485946655, + "logps/chosen": -117.74906921386719, + "logps/rejected": -103.21688842773438, + "loss": 1.1721, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.879070997238159, + "rewards/margins": 2.1971144676208496, + "rewards/rejected": 1.6819565296173096, + "step": 4432 + }, + { + "epoch": 0.72, + "learning_rate": 7.409551480900903e-06, + "logits/chosen": -0.755847692489624, + "logits/rejected": -0.7619654536247253, + "logps/chosen": -5.347097396850586, + "logps/rejected": -2.673344850540161, + "loss": 0.4028, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.008468151092529297, + "rewards/margins": -0.20860432088375092, + "rewards/rejected": 0.20013616979122162, + "step": 4433 + }, + { + "epoch": 0.72, + "learning_rate": 7.408399821341787e-06, + "logits/chosen": -0.9636113047599792, + "logits/rejected": -0.9636113047599792, + "logps/chosen": -73.11636352539062, + "logps/rejected": -73.11636352539062, + "loss": 0.3486, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7178878784179688, + "rewards/margins": 0.0, + "rewards/rejected": 2.7178878784179688, + "step": 4434 + }, + { + "epoch": 0.72, + "learning_rate": 7.407247995385033e-06, + "logits/chosen": -1.2141274213790894, + "logits/rejected": -1.1895310878753662, + "logps/chosen": -130.30398559570312, + "logps/rejected": -77.46501922607422, + "loss": 0.7971, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.124639987945557, + "rewards/margins": 0.5443854331970215, + "rewards/rejected": 6.580254554748535, + "step": 4435 + }, + { + "epoch": 0.72, + "learning_rate": 7.4060960031102194e-06, + "logits/chosen": -1.2037798166275024, + "logits/rejected": -1.2639057636260986, + "logps/chosen": -104.39065551757812, + "logps/rejected": -182.76112365722656, + "loss": 1.4164, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.259347438812256, + "rewards/margins": -2.7490925788879395, + "rewards/rejected": 8.008440017700195, + "step": 4436 + }, + { + "epoch": 0.72, + "learning_rate": 7.404943844596939e-06, + "logits/chosen": -0.882865309715271, + "logits/rejected": -0.8968662619590759, + "logps/chosen": -7.388345718383789, + "logps/rejected": -23.22612762451172, + "loss": 0.5713, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3784540295600891, + "rewards/margins": -0.5630682110786438, + "rewards/rejected": 0.9415222406387329, + "step": 4437 + }, + { + "epoch": 0.72, + "learning_rate": 7.403791519924794e-06, + "logits/chosen": -1.1087323427200317, + "logits/rejected": -1.059328317642212, + "logps/chosen": -99.3670425415039, + "logps/rejected": -52.05771255493164, + "loss": 0.6989, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3010849952697754, + "rewards/margins": -1.0741617679595947, + "rewards/rejected": 3.37524676322937, + "step": 4438 + }, + { + "epoch": 0.72, + "learning_rate": 7.4026390291734004e-06, + "logits/chosen": -0.926679253578186, + "logits/rejected": -1.0792826414108276, + "logps/chosen": -77.98078918457031, + "logps/rejected": -73.86100006103516, + "loss": 1.681, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0231781005859375, + "rewards/margins": -3.2592859268188477, + "rewards/rejected": 5.282464027404785, + "step": 4439 + }, + { + "epoch": 0.72, + "learning_rate": 7.401486372422384e-06, + "logits/chosen": -1.0306075811386108, + "logits/rejected": -1.0432615280151367, + "logps/chosen": -59.73387145996094, + "logps/rejected": -50.110897064208984, + "loss": 0.6694, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.778919219970703, + "rewards/margins": -0.29589056968688965, + "rewards/rejected": 3.0748097896575928, + "step": 4440 + }, + { + "epoch": 0.72, + "learning_rate": 7.4003335497513815e-06, + "logits/chosen": -1.4052999019622803, + "logits/rejected": -1.274010419845581, + "logps/chosen": -107.41728973388672, + "logps/rejected": -90.7950439453125, + "loss": 2.8489, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.291439056396484, + "rewards/margins": 0.9955954551696777, + "rewards/rejected": 6.295843601226807, + "step": 4441 + }, + { + "epoch": 0.72, + "learning_rate": 7.399180561240044e-06, + "logits/chosen": -0.8251886963844299, + "logits/rejected": -0.8170022964477539, + "logps/chosen": -92.16456604003906, + "logps/rejected": -60.0677490234375, + "loss": 1.8151, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3619064092636108, + "rewards/margins": -2.0778756141662598, + "rewards/rejected": 3.439781904220581, + "step": 4442 + }, + { + "epoch": 0.72, + "learning_rate": 7.398027406968031e-06, + "logits/chosen": -0.9430221915245056, + "logits/rejected": -0.9465221762657166, + "logps/chosen": -5.811469078063965, + "logps/rejected": -1.1961500644683838, + "loss": 0.3306, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28689461946487427, + "rewards/margins": 0.0824660211801529, + "rewards/rejected": 0.20442859828472137, + "step": 4443 + }, + { + "epoch": 0.72, + "learning_rate": 7.396874087015014e-06, + "logits/chosen": -0.990280032157898, + "logits/rejected": -1.0004512071609497, + "logps/chosen": -40.740718841552734, + "logps/rejected": -60.7643928527832, + "loss": 0.6421, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4158187806606293, + "rewards/margins": 0.11924666166305542, + "rewards/rejected": 0.29657211899757385, + "step": 4444 + }, + { + "epoch": 0.72, + "learning_rate": 7.3957206014606765e-06, + "logits/chosen": -1.3728892803192139, + "logits/rejected": -1.4415132999420166, + "logps/chosen": -101.31739807128906, + "logps/rejected": -93.64759826660156, + "loss": 0.8811, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8024933338165283, + "rewards/margins": -0.21701574325561523, + "rewards/rejected": 2.0195090770721436, + "step": 4445 + }, + { + "epoch": 0.72, + "learning_rate": 7.394566950384715e-06, + "logits/chosen": -0.7006092667579651, + "logits/rejected": -0.7006092667579651, + "logps/chosen": -1.4430701732635498, + "logps/rejected": -1.4430701732635498, + "loss": 0.5877, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.22264955937862396, + "rewards/margins": 0.0, + "rewards/rejected": 0.22264955937862396, + "step": 4446 + }, + { + "epoch": 0.72, + "learning_rate": 7.3934131338668335e-06, + "logits/chosen": -1.2346837520599365, + "logits/rejected": -1.1813290119171143, + "logps/chosen": -71.49925231933594, + "logps/rejected": -3.9630532264709473, + "loss": 1.887, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.023090362548828, + "rewards/margins": 1.3765602111816406, + "rewards/rejected": 0.6465300917625427, + "step": 4447 + }, + { + "epoch": 0.72, + "learning_rate": 7.392259151986753e-06, + "logits/chosen": -0.6789625287055969, + "logits/rejected": -0.7108398079872131, + "logps/chosen": -16.86044692993164, + "logps/rejected": -30.05096435546875, + "loss": 2.0555, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2790952920913696, + "rewards/margins": -0.44027388095855713, + "rewards/rejected": 1.7193691730499268, + "step": 4448 + }, + { + "epoch": 0.72, + "learning_rate": 7.3911050048242e-06, + "logits/chosen": -0.5924581289291382, + "logits/rejected": -0.5955014824867249, + "logps/chosen": -2.2301809787750244, + "logps/rejected": -20.442501068115234, + "loss": 2.0782, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2881027162075043, + "rewards/margins": 0.1355968415737152, + "rewards/rejected": 0.15250587463378906, + "step": 4449 + }, + { + "epoch": 0.72, + "learning_rate": 7.389950692458916e-06, + "logits/chosen": -0.6829071640968323, + "logits/rejected": -0.6829071640968323, + "logps/chosen": -0.6182975769042969, + "logps/rejected": -0.6182975769042969, + "loss": 0.6367, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.21216332912445068, + "rewards/margins": 0.0, + "rewards/rejected": 0.21216332912445068, + "step": 4450 + }, + { + "epoch": 0.72, + "learning_rate": 7.388796214970653e-06, + "logits/chosen": -0.998375415802002, + "logits/rejected": -0.9394423365592957, + "logps/chosen": -65.65069580078125, + "logps/rejected": -81.13301086425781, + "loss": 0.7164, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.362027883529663, + "rewards/margins": -0.6490089893341064, + "rewards/rejected": 4.0110368728637695, + "step": 4451 + }, + { + "epoch": 0.72, + "learning_rate": 7.3876415724391745e-06, + "logits/chosen": -1.317570686340332, + "logits/rejected": -1.2598942518234253, + "logps/chosen": -86.07110595703125, + "logps/rejected": -84.25343322753906, + "loss": 0.4852, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.584963321685791, + "rewards/margins": 0.8847969770431519, + "rewards/rejected": 1.7001663446426392, + "step": 4452 + }, + { + "epoch": 0.72, + "learning_rate": 7.3864867649442564e-06, + "logits/chosen": -1.187605381011963, + "logits/rejected": -1.153617262840271, + "logps/chosen": -125.71981048583984, + "logps/rejected": -74.61331939697266, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.738529205322266, + "rewards/margins": 3.404127597808838, + "rewards/rejected": 1.3344017267227173, + "step": 4453 + }, + { + "epoch": 0.72, + "learning_rate": 7.385331792565682e-06, + "logits/chosen": -0.8458655476570129, + "logits/rejected": -0.8154367804527283, + "logps/chosen": -23.704818725585938, + "logps/rejected": -5.295336723327637, + "loss": 1.3387, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4934820234775543, + "rewards/margins": -0.2402377426624298, + "rewards/rejected": 0.7337197661399841, + "step": 4454 + }, + { + "epoch": 0.72, + "learning_rate": 7.384176655383253e-06, + "logits/chosen": -0.94263756275177, + "logits/rejected": -0.9706481695175171, + "logps/chosen": -68.0697250366211, + "logps/rejected": -90.52079010009766, + "loss": 2.4443, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8095359802246094, + "rewards/margins": -1.2341821193695068, + "rewards/rejected": 3.043718099594116, + "step": 4455 + }, + { + "epoch": 0.72, + "learning_rate": 7.383021353476775e-06, + "logits/chosen": -0.8076866269111633, + "logits/rejected": -0.7706487774848938, + "logps/chosen": -74.50141143798828, + "logps/rejected": -112.91062927246094, + "loss": 0.7342, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.348024845123291, + "rewards/margins": -0.4449303150177002, + "rewards/rejected": 3.792955160140991, + "step": 4456 + }, + { + "epoch": 0.72, + "learning_rate": 7.381865886926069e-06, + "logits/chosen": -1.1661887168884277, + "logits/rejected": -1.1736321449279785, + "logps/chosen": -132.38145446777344, + "logps/rejected": -47.089637756347656, + "loss": 0.0551, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.883604526519775, + "rewards/margins": 2.937885284423828, + "rewards/rejected": 2.9457192420959473, + "step": 4457 + }, + { + "epoch": 0.72, + "learning_rate": 7.38071025581097e-06, + "logits/chosen": -1.2652558088302612, + "logits/rejected": -1.3686047792434692, + "logps/chosen": -135.8892822265625, + "logps/rejected": -177.48080444335938, + "loss": 2.2588, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.2899169921875, + "rewards/margins": -4.439272880554199, + "rewards/rejected": 8.7291898727417, + "step": 4458 + }, + { + "epoch": 0.72, + "learning_rate": 7.379554460211318e-06, + "logits/chosen": -0.9670031666755676, + "logits/rejected": -0.9242235422134399, + "logps/chosen": -79.9405746459961, + "logps/rejected": -105.09532165527344, + "loss": 0.2015, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.171990156173706, + "rewards/margins": 1.5247352123260498, + "rewards/rejected": 0.6472549438476562, + "step": 4459 + }, + { + "epoch": 0.72, + "learning_rate": 7.378398500206967e-06, + "logits/chosen": -0.8133024573326111, + "logits/rejected": -0.8928736448287964, + "logps/chosen": -97.2618179321289, + "logps/rejected": -137.52328491210938, + "loss": 1.8357, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.548973798751831, + "rewards/margins": -2.802349328994751, + "rewards/rejected": 5.351323127746582, + "step": 4460 + }, + { + "epoch": 0.72, + "learning_rate": 7.377242375877785e-06, + "logits/chosen": -1.0614501237869263, + "logits/rejected": -1.0645065307617188, + "logps/chosen": -32.517581939697266, + "logps/rejected": -2.7883968353271484, + "loss": 0.938, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7638840079307556, + "rewards/margins": 0.19379979372024536, + "rewards/rejected": 0.5700842142105103, + "step": 4461 + }, + { + "epoch": 0.72, + "learning_rate": 7.376086087303649e-06, + "logits/chosen": -0.9812135696411133, + "logits/rejected": -0.9914979934692383, + "logps/chosen": -60.65764617919922, + "logps/rejected": -75.04113006591797, + "loss": 0.455, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4301764965057373, + "rewards/margins": 0.7016326189041138, + "rewards/rejected": 1.7285438776016235, + "step": 4462 + }, + { + "epoch": 0.72, + "learning_rate": 7.374929634564446e-06, + "logits/chosen": -1.1301711797714233, + "logits/rejected": -1.0458356142044067, + "logps/chosen": -66.172607421875, + "logps/rejected": -52.24577331542969, + "loss": 0.2549, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.126448154449463, + "rewards/margins": 1.3653016090393066, + "rewards/rejected": 2.7611465454101562, + "step": 4463 + }, + { + "epoch": 0.72, + "learning_rate": 7.3737730177400775e-06, + "logits/chosen": -1.2281721830368042, + "logits/rejected": -1.1057288646697998, + "logps/chosen": -153.11383056640625, + "logps/rejected": -42.68048095703125, + "loss": 0.0808, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.357327461242676, + "rewards/margins": 5.762205123901367, + "rewards/rejected": 2.5951225757598877, + "step": 4464 + }, + { + "epoch": 0.72, + "learning_rate": 7.372616236910456e-06, + "logits/chosen": -1.0974717140197754, + "logits/rejected": -1.0591275691986084, + "logps/chosen": -58.30499267578125, + "logps/rejected": -74.28617095947266, + "loss": 0.2891, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8972641229629517, + "rewards/margins": 0.3873603343963623, + "rewards/rejected": 1.5099037885665894, + "step": 4465 + }, + { + "epoch": 0.72, + "learning_rate": 7.371459292155501e-06, + "logits/chosen": -1.221353530883789, + "logits/rejected": -1.3603748083114624, + "logps/chosen": -196.43460083007812, + "logps/rejected": -144.39617919921875, + "loss": 1.318, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.252810955047607, + "rewards/margins": -1.3788237571716309, + "rewards/rejected": 6.631634712219238, + "step": 4466 + }, + { + "epoch": 0.73, + "learning_rate": 7.370302183555148e-06, + "logits/chosen": -1.0935750007629395, + "logits/rejected": -1.0900856256484985, + "logps/chosen": -61.871177673339844, + "logps/rejected": -51.71681213378906, + "loss": 0.7953, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.485621690750122, + "rewards/margins": 0.03020477294921875, + "rewards/rejected": 1.4554169178009033, + "step": 4467 + }, + { + "epoch": 0.73, + "learning_rate": 7.369144911189342e-06, + "logits/chosen": -1.1635668277740479, + "logits/rejected": -1.153137445449829, + "logps/chosen": -108.78292083740234, + "logps/rejected": -81.57428741455078, + "loss": 0.9777, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0903937816619873, + "rewards/margins": -0.4614992141723633, + "rewards/rejected": 3.5518929958343506, + "step": 4468 + }, + { + "epoch": 0.73, + "learning_rate": 7.36798747513804e-06, + "logits/chosen": -1.1616731882095337, + "logits/rejected": -1.1649558544158936, + "logps/chosen": -117.43992614746094, + "logps/rejected": -99.12554931640625, + "loss": 0.6435, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2896149158477783, + "rewards/margins": 0.02652895450592041, + "rewards/rejected": 1.263085961341858, + "step": 4469 + }, + { + "epoch": 0.73, + "learning_rate": 7.366829875481209e-06, + "logits/chosen": -1.0431114435195923, + "logits/rejected": -1.0353810787200928, + "logps/chosen": -113.14555358886719, + "logps/rejected": -103.1962890625, + "loss": 1.5345, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6461334228515625, + "rewards/margins": -1.830026388168335, + "rewards/rejected": 3.4761598110198975, + "step": 4470 + }, + { + "epoch": 0.73, + "learning_rate": 7.365672112298829e-06, + "logits/chosen": -0.9793500900268555, + "logits/rejected": -0.958732545375824, + "logps/chosen": -42.122257232666016, + "logps/rejected": -62.02729415893555, + "loss": 0.9191, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6746540069580078, + "rewards/margins": -0.8276498317718506, + "rewards/rejected": 2.5023038387298584, + "step": 4471 + }, + { + "epoch": 0.73, + "learning_rate": 7.36451418567089e-06, + "logits/chosen": -1.5243284702301025, + "logits/rejected": -1.3451461791992188, + "logps/chosen": -117.46726989746094, + "logps/rejected": -34.161766052246094, + "loss": 0.0239, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.422218322753906, + "rewards/margins": 5.0665998458862305, + "rewards/rejected": 0.35561829805374146, + "step": 4472 + }, + { + "epoch": 0.73, + "learning_rate": 7.363356095677395e-06, + "logits/chosen": -1.2526689767837524, + "logits/rejected": -1.027893304824829, + "logps/chosen": -142.84739685058594, + "logps/rejected": -78.36518096923828, + "loss": 0.6342, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.303321838378906, + "rewards/margins": 1.9370079040527344, + "rewards/rejected": 3.366313934326172, + "step": 4473 + }, + { + "epoch": 0.73, + "learning_rate": 7.362197842398355e-06, + "logits/chosen": -0.9963791966438293, + "logits/rejected": -0.8837172389030457, + "logps/chosen": -75.54682159423828, + "logps/rejected": -28.5377197265625, + "loss": 0.8082, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.59233558177948, + "rewards/margins": 0.8763348460197449, + "rewards/rejected": 0.7160007357597351, + "step": 4474 + }, + { + "epoch": 0.73, + "learning_rate": 7.361039425913797e-06, + "logits/chosen": -0.576636016368866, + "logits/rejected": -0.576636016368866, + "logps/chosen": -62.822296142578125, + "logps/rejected": -62.822296142578125, + "loss": 0.3657, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1097183227539062, + "rewards/margins": 0.0, + "rewards/rejected": 1.1097183227539062, + "step": 4475 + }, + { + "epoch": 0.73, + "learning_rate": 7.359880846303753e-06, + "logits/chosen": -1.246291995048523, + "logits/rejected": -1.132066249847412, + "logps/chosen": -59.400054931640625, + "logps/rejected": -19.746055603027344, + "loss": 0.8033, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.493658423423767, + "rewards/margins": 0.4357055425643921, + "rewards/rejected": 1.057952880859375, + "step": 4476 + }, + { + "epoch": 0.73, + "learning_rate": 7.358722103648274e-06, + "logits/chosen": -1.4663227796554565, + "logits/rejected": -1.4813649654388428, + "logps/chosen": -65.52081298828125, + "logps/rejected": -72.0123291015625, + "loss": 1.6076, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.043132781982422, + "rewards/margins": -0.3006737232208252, + "rewards/rejected": 2.343806505203247, + "step": 4477 + }, + { + "epoch": 0.73, + "learning_rate": 7.357563198027414e-06, + "logits/chosen": -0.8294779062271118, + "logits/rejected": -0.769978940486908, + "logps/chosen": -39.81224060058594, + "logps/rejected": -54.84925842285156, + "loss": 0.2044, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.677311658859253, + "rewards/margins": 1.3398345708847046, + "rewards/rejected": 1.3374770879745483, + "step": 4478 + }, + { + "epoch": 0.73, + "learning_rate": 7.356404129521246e-06, + "logits/chosen": -0.9003432989120483, + "logits/rejected": -0.9327372908592224, + "logps/chosen": -127.69731903076172, + "logps/rejected": -113.15986633300781, + "loss": 1.6292, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4481712579727173, + "rewards/margins": -1.855918049812317, + "rewards/rejected": 3.304089307785034, + "step": 4479 + }, + { + "epoch": 0.73, + "learning_rate": 7.355244898209848e-06, + "logits/chosen": -0.8617494106292725, + "logits/rejected": -0.9075683951377869, + "logps/chosen": -73.46270751953125, + "logps/rejected": -57.580360412597656, + "loss": 0.7258, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6931648254394531, + "rewards/margins": 0.31931984424591064, + "rewards/rejected": 1.3738449811935425, + "step": 4480 + }, + { + "epoch": 0.73, + "learning_rate": 7.3540855041733135e-06, + "logits/chosen": -1.348388910293579, + "logits/rejected": -1.2636696100234985, + "logps/chosen": -90.16278076171875, + "logps/rejected": -57.92431640625, + "loss": 0.8937, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6450088024139404, + "rewards/margins": -1.380213975906372, + "rewards/rejected": 4.0252227783203125, + "step": 4481 + }, + { + "epoch": 0.73, + "learning_rate": 7.3529259474917455e-06, + "logits/chosen": -0.9667494893074036, + "logits/rejected": -0.8860461115837097, + "logps/chosen": -60.79497528076172, + "logps/rejected": -105.1043701171875, + "loss": 0.432, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9935585260391235, + "rewards/margins": -0.2862664461135864, + "rewards/rejected": 2.27982497215271, + "step": 4482 + }, + { + "epoch": 0.73, + "learning_rate": 7.351766228245259e-06, + "logits/chosen": -1.0737435817718506, + "logits/rejected": -0.9819753170013428, + "logps/chosen": -116.20962524414062, + "logps/rejected": -117.78404998779297, + "loss": 0.1606, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.682217597961426, + "rewards/margins": 1.0223703384399414, + "rewards/rejected": 3.6598472595214844, + "step": 4483 + }, + { + "epoch": 0.73, + "learning_rate": 7.350606346513977e-06, + "logits/chosen": -1.217878818511963, + "logits/rejected": -1.2400072813034058, + "logps/chosen": -176.14334106445312, + "logps/rejected": -105.07398223876953, + "loss": 0.0869, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.486728191375732, + "rewards/margins": 2.799463987350464, + "rewards/rejected": 2.6872642040252686, + "step": 4484 + }, + { + "epoch": 0.73, + "learning_rate": 7.349446302378039e-06, + "logits/chosen": -1.3803240060806274, + "logits/rejected": -1.3574974536895752, + "logps/chosen": -60.868900299072266, + "logps/rejected": -91.87217712402344, + "loss": 0.6207, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6848599910736084, + "rewards/margins": -0.8820962905883789, + "rewards/rejected": 3.5669562816619873, + "step": 4485 + }, + { + "epoch": 0.73, + "learning_rate": 7.348286095917591e-06, + "logits/chosen": -0.895262598991394, + "logits/rejected": -0.836593508720398, + "logps/chosen": -61.1068229675293, + "logps/rejected": -50.54672622680664, + "loss": 0.9015, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0439609289169312, + "rewards/margins": -1.2947906255722046, + "rewards/rejected": 2.3387515544891357, + "step": 4486 + }, + { + "epoch": 0.73, + "learning_rate": 7.347125727212796e-06, + "logits/chosen": -1.1293878555297852, + "logits/rejected": -1.1418038606643677, + "logps/chosen": -98.11148071289062, + "logps/rejected": -117.22966003417969, + "loss": 0.5181, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4827148914337158, + "rewards/margins": -0.33989107608795166, + "rewards/rejected": 1.8226059675216675, + "step": 4487 + }, + { + "epoch": 0.73, + "learning_rate": 7.345965196343821e-06, + "logits/chosen": -1.1003409624099731, + "logits/rejected": -0.9824909567832947, + "logps/chosen": -80.27874755859375, + "logps/rejected": -35.69740295410156, + "loss": 0.0615, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0691933631896973, + "rewards/margins": 2.0684289932250977, + "rewards/rejected": 1.0007644891738892, + "step": 4488 + }, + { + "epoch": 0.73, + "learning_rate": 7.3448045033908476e-06, + "logits/chosen": -0.5899011492729187, + "logits/rejected": -0.6069542765617371, + "logps/chosen": -8.791805267333984, + "logps/rejected": -23.131013870239258, + "loss": 1.8389, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.23117895424365997, + "rewards/margins": -0.9254897236824036, + "rewards/rejected": 1.1566686630249023, + "step": 4489 + }, + { + "epoch": 0.73, + "learning_rate": 7.34364364843407e-06, + "logits/chosen": -0.9677128791809082, + "logits/rejected": -0.9815183877944946, + "logps/chosen": -86.84712219238281, + "logps/rejected": -32.653499603271484, + "loss": 1.3679, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.717611074447632, + "rewards/margins": -1.3026707172393799, + "rewards/rejected": 4.020281791687012, + "step": 4490 + }, + { + "epoch": 0.73, + "learning_rate": 7.3424826315536925e-06, + "logits/chosen": -0.9093325734138489, + "logits/rejected": -0.9443758130073547, + "logps/chosen": -88.11270141601562, + "logps/rejected": -106.74918365478516, + "loss": 1.1079, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6925002932548523, + "rewards/margins": -2.024463653564453, + "rewards/rejected": 2.71696400642395, + "step": 4491 + }, + { + "epoch": 0.73, + "learning_rate": 7.3413214528299295e-06, + "logits/chosen": -1.2524287700653076, + "logits/rejected": -1.0764211416244507, + "logps/chosen": -98.83476257324219, + "logps/rejected": -52.717220306396484, + "loss": 0.3474, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.053544521331787, + "rewards/margins": 4.054527282714844, + "rewards/rejected": 1.9990170001983643, + "step": 4492 + }, + { + "epoch": 0.73, + "learning_rate": 7.3401601123430075e-06, + "logits/chosen": -1.3629729747772217, + "logits/rejected": -1.320873498916626, + "logps/chosen": -148.65866088867188, + "logps/rejected": -69.03109741210938, + "loss": 1.3767, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.756848335266113, + "rewards/margins": 1.848508596420288, + "rewards/rejected": 2.908339738845825, + "step": 4493 + }, + { + "epoch": 0.73, + "learning_rate": 7.338998610173166e-06, + "logits/chosen": -1.0072139501571655, + "logits/rejected": -0.9996488690376282, + "logps/chosen": -22.443483352661133, + "logps/rejected": -29.463258743286133, + "loss": 0.7495, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3939821422100067, + "rewards/margins": -0.4100894629955292, + "rewards/rejected": 0.8040716052055359, + "step": 4494 + }, + { + "epoch": 0.73, + "learning_rate": 7.337836946400652e-06, + "logits/chosen": -1.213310956954956, + "logits/rejected": -1.1368931531906128, + "logps/chosen": -93.26188659667969, + "logps/rejected": -101.275146484375, + "loss": 1.8189, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4529473781585693, + "rewards/margins": 0.38211822509765625, + "rewards/rejected": 2.070829153060913, + "step": 4495 + }, + { + "epoch": 0.73, + "learning_rate": 7.336675121105725e-06, + "logits/chosen": -1.1279274225234985, + "logits/rejected": -1.1279274225234985, + "logps/chosen": -71.23355102539062, + "logps/rejected": -71.23355102539062, + "loss": 0.3528, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.278472900390625, + "rewards/margins": 0.0, + "rewards/rejected": 3.278472900390625, + "step": 4496 + }, + { + "epoch": 0.73, + "learning_rate": 7.335513134368656e-06, + "logits/chosen": -0.7160957455635071, + "logits/rejected": -0.7690942883491516, + "logps/chosen": -72.32072448730469, + "logps/rejected": -68.66233825683594, + "loss": 0.6011, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.878509521484375, + "rewards/margins": -0.48116767406463623, + "rewards/rejected": 1.3596771955490112, + "step": 4497 + }, + { + "epoch": 0.73, + "learning_rate": 7.3343509862697295e-06, + "logits/chosen": -1.160035490989685, + "logits/rejected": -1.262477159500122, + "logps/chosen": -67.98277282714844, + "logps/rejected": -85.71124267578125, + "loss": 0.7882, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5887161493301392, + "rewards/margins": -1.297634243965149, + "rewards/rejected": 2.886350393295288, + "step": 4498 + }, + { + "epoch": 0.73, + "learning_rate": 7.333188676889238e-06, + "logits/chosen": -1.0996004343032837, + "logits/rejected": -1.0870180130004883, + "logps/chosen": -57.8416633605957, + "logps/rejected": -59.149085998535156, + "loss": 1.2595, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4197232723236084, + "rewards/margins": -1.6502087116241455, + "rewards/rejected": 4.069931983947754, + "step": 4499 + }, + { + "epoch": 0.73, + "learning_rate": 7.3320262063074855e-06, + "logits/chosen": -1.214530348777771, + "logits/rejected": -1.1051476001739502, + "logps/chosen": -103.39404296875, + "logps/rejected": -123.49819946289062, + "loss": 0.7355, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.736423015594482, + "rewards/margins": -0.8492856025695801, + "rewards/rejected": 7.5857086181640625, + "step": 4500 + }, + { + "epoch": 0.73, + "learning_rate": 7.330863574604787e-06, + "logits/chosen": -0.8023880124092102, + "logits/rejected": -0.8178669810295105, + "logps/chosen": -46.71404266357422, + "logps/rejected": -66.10453033447266, + "loss": 0.6403, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5682716369628906, + "rewards/margins": 0.5474143028259277, + "rewards/rejected": 2.020857334136963, + "step": 4501 + }, + { + "epoch": 0.73, + "learning_rate": 7.329700781861472e-06, + "logits/chosen": -1.047160029411316, + "logits/rejected": -1.1275091171264648, + "logps/chosen": -57.13261032104492, + "logps/rejected": -73.63029479980469, + "loss": 1.2302, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5605884790420532, + "rewards/margins": -2.3642635345458984, + "rewards/rejected": 3.924852132797241, + "step": 4502 + }, + { + "epoch": 0.73, + "learning_rate": 7.328537828157876e-06, + "logits/chosen": -1.186996340751648, + "logits/rejected": -1.2237893342971802, + "logps/chosen": -73.00601196289062, + "logps/rejected": -65.31666564941406, + "loss": 1.3528, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.203472852706909, + "rewards/margins": 0.10101151466369629, + "rewards/rejected": 2.102461338043213, + "step": 4503 + }, + { + "epoch": 0.73, + "learning_rate": 7.327374713574349e-06, + "logits/chosen": -1.2886064052581787, + "logits/rejected": -1.288262128829956, + "logps/chosen": -68.20718383789062, + "logps/rejected": -40.38604736328125, + "loss": 0.3064, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6636825799942017, + "rewards/margins": 0.1775909662246704, + "rewards/rejected": 1.4860916137695312, + "step": 4504 + }, + { + "epoch": 0.73, + "learning_rate": 7.326211438191251e-06, + "logits/chosen": -0.6419256925582886, + "logits/rejected": -0.6393586993217468, + "logps/chosen": -2.327805519104004, + "logps/rejected": -7.608016490936279, + "loss": 0.375, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.339404433965683, + "rewards/margins": 0.058547377586364746, + "rewards/rejected": 0.28085705637931824, + "step": 4505 + }, + { + "epoch": 0.73, + "learning_rate": 7.325048002088955e-06, + "logits/chosen": -1.066763162612915, + "logits/rejected": -1.1164655685424805, + "logps/chosen": -56.56668472290039, + "logps/rejected": -72.56144714355469, + "loss": 0.242, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7124996185302734, + "rewards/margins": 0.6073360443115234, + "rewards/rejected": 2.10516357421875, + "step": 4506 + }, + { + "epoch": 0.73, + "learning_rate": 7.323884405347841e-06, + "logits/chosen": -1.118831992149353, + "logits/rejected": -1.1151734590530396, + "logps/chosen": -37.83186340332031, + "logps/rejected": -63.4039421081543, + "loss": 0.4809, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1007766723632812, + "rewards/margins": 0.2894725799560547, + "rewards/rejected": 2.8113040924072266, + "step": 4507 + }, + { + "epoch": 0.73, + "learning_rate": 7.322720648048303e-06, + "logits/chosen": -1.2073047161102295, + "logits/rejected": -0.9840716123580933, + "logps/chosen": -125.46173095703125, + "logps/rejected": -54.68760299682617, + "loss": 0.3383, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.9811553955078125, + "rewards/margins": 3.0505244731903076, + "rewards/rejected": 1.9306309223175049, + "step": 4508 + }, + { + "epoch": 0.73, + "learning_rate": 7.321556730270745e-06, + "logits/chosen": -1.3037136793136597, + "logits/rejected": -1.3369770050048828, + "logps/chosen": -218.64305114746094, + "logps/rejected": -164.75259399414062, + "loss": 0.0627, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.908708095550537, + "rewards/margins": 3.906245231628418, + "rewards/rejected": 2.002462863922119, + "step": 4509 + }, + { + "epoch": 0.73, + "learning_rate": 7.3203926520955846e-06, + "logits/chosen": -0.8001412153244019, + "logits/rejected": -0.8076909780502319, + "logps/chosen": -69.17611694335938, + "logps/rejected": -66.83145141601562, + "loss": 0.4544, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5710738897323608, + "rewards/margins": 0.20304405689239502, + "rewards/rejected": 1.3680298328399658, + "step": 4510 + }, + { + "epoch": 0.73, + "learning_rate": 7.319228413603247e-06, + "logits/chosen": -0.9186557531356812, + "logits/rejected": -0.9036352038383484, + "logps/chosen": -1.5561884641647339, + "logps/rejected": -13.368916511535645, + "loss": 0.8027, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3812393844127655, + "rewards/margins": -0.39879652857780457, + "rewards/rejected": 0.7800359129905701, + "step": 4511 + }, + { + "epoch": 0.73, + "learning_rate": 7.318064014874172e-06, + "logits/chosen": -0.8849425911903381, + "logits/rejected": -1.0358691215515137, + "logps/chosen": -122.13936614990234, + "logps/rejected": -139.0333251953125, + "loss": 1.9707, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6058143973350525, + "rewards/margins": -2.4347190856933594, + "rewards/rejected": 3.0405335426330566, + "step": 4512 + }, + { + "epoch": 0.73, + "learning_rate": 7.316899455988806e-06, + "logits/chosen": -1.039725661277771, + "logits/rejected": -0.9950140118598938, + "logps/chosen": -41.48188400268555, + "logps/rejected": -54.49524688720703, + "loss": 0.3957, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2042043209075928, + "rewards/margins": 0.0033795833587646484, + "rewards/rejected": 2.200824737548828, + "step": 4513 + }, + { + "epoch": 0.73, + "learning_rate": 7.315734737027612e-06, + "logits/chosen": -1.0594441890716553, + "logits/rejected": -0.9842268228530884, + "logps/chosen": -93.16409301757812, + "logps/rejected": -15.353769302368164, + "loss": 0.7554, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8410614132881165, + "rewards/margins": -0.029382705688476562, + "rewards/rejected": 0.870444118976593, + "step": 4514 + }, + { + "epoch": 0.73, + "learning_rate": 7.3145698580710575e-06, + "logits/chosen": -0.7348765134811401, + "logits/rejected": -0.7913209795951843, + "logps/chosen": -32.146461486816406, + "logps/rejected": -109.77540588378906, + "loss": 0.3871, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1025383472442627, + "rewards/margins": 0.17762070894241333, + "rewards/rejected": 0.9249176383018494, + "step": 4515 + }, + { + "epoch": 0.73, + "learning_rate": 7.313404819199628e-06, + "logits/chosen": -1.2273956537246704, + "logits/rejected": -1.0607740879058838, + "logps/chosen": -51.02861404418945, + "logps/rejected": -83.57972717285156, + "loss": 1.2698, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.66752815246582, + "rewards/margins": -0.8191189765930176, + "rewards/rejected": 6.486647129058838, + "step": 4516 + }, + { + "epoch": 0.73, + "learning_rate": 7.312239620493815e-06, + "logits/chosen": -1.0998027324676514, + "logits/rejected": -1.0998027324676514, + "logps/chosen": -61.196651458740234, + "logps/rejected": -61.196651458740234, + "loss": 0.5126, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.652355670928955, + "rewards/margins": 0.0, + "rewards/rejected": 3.652355670928955, + "step": 4517 + }, + { + "epoch": 0.73, + "learning_rate": 7.311074262034121e-06, + "logits/chosen": -0.9916449785232544, + "logits/rejected": -0.9265441298484802, + "logps/chosen": -61.44941711425781, + "logps/rejected": -85.8285140991211, + "loss": 0.6383, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1293076276779175, + "rewards/margins": -0.3556426763534546, + "rewards/rejected": 1.484950304031372, + "step": 4518 + }, + { + "epoch": 0.73, + "learning_rate": 7.309908743901065e-06, + "logits/chosen": -1.1621689796447754, + "logits/rejected": -1.2027204036712646, + "logps/chosen": -80.9092025756836, + "logps/rejected": -92.5026626586914, + "loss": 0.7056, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.730336904525757, + "rewards/margins": 1.127042531967163, + "rewards/rejected": 1.6032943725585938, + "step": 4519 + }, + { + "epoch": 0.73, + "learning_rate": 7.308743066175172e-06, + "logits/chosen": -0.6833528876304626, + "logits/rejected": -0.7570302486419678, + "logps/chosen": -66.19389343261719, + "logps/rejected": -62.43857192993164, + "loss": 0.4268, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.008544921875, + "rewards/margins": -0.0061872005462646484, + "rewards/rejected": 2.0147321224212646, + "step": 4520 + }, + { + "epoch": 0.73, + "learning_rate": 7.307577228936976e-06, + "logits/chosen": -0.6583237648010254, + "logits/rejected": -0.5947725772857666, + "logps/chosen": -69.70956420898438, + "logps/rejected": -74.28179931640625, + "loss": 0.3213, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1894891262054443, + "rewards/margins": 0.20820856094360352, + "rewards/rejected": 1.9812805652618408, + "step": 4521 + }, + { + "epoch": 0.73, + "learning_rate": 7.30641123226703e-06, + "logits/chosen": -1.1312271356582642, + "logits/rejected": -1.1312271356582642, + "logps/chosen": -54.87626266479492, + "logps/rejected": -54.87626266479492, + "loss": 0.3643, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8044209480285645, + "rewards/margins": 0.0, + "rewards/rejected": 2.8044209480285645, + "step": 4522 + }, + { + "epoch": 0.73, + "learning_rate": 7.305245076245891e-06, + "logits/chosen": -1.2833417654037476, + "logits/rejected": -1.2595804929733276, + "logps/chosen": -90.31072235107422, + "logps/rejected": -67.40206909179688, + "loss": 0.2861, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9872230291366577, + "rewards/margins": 1.0270607471466064, + "rewards/rejected": 0.960162341594696, + "step": 4523 + }, + { + "epoch": 0.73, + "learning_rate": 7.3040787609541285e-06, + "logits/chosen": -1.3593089580535889, + "logits/rejected": -1.310219168663025, + "logps/chosen": -117.52778625488281, + "logps/rejected": -82.2646255493164, + "loss": 0.1395, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.087022542953491, + "rewards/margins": 1.8560646772384644, + "rewards/rejected": 1.2309578657150269, + "step": 4524 + }, + { + "epoch": 0.73, + "learning_rate": 7.302912286472326e-06, + "logits/chosen": -1.3960649967193604, + "logits/rejected": -1.2703713178634644, + "logps/chosen": -135.9697265625, + "logps/rejected": -82.5721206665039, + "loss": 0.0543, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.010076999664307, + "rewards/margins": 3.6916115283966064, + "rewards/rejected": 1.3184654712677002, + "step": 4525 + }, + { + "epoch": 0.73, + "learning_rate": 7.301745652881073e-06, + "logits/chosen": -0.6794248223304749, + "logits/rejected": -0.5793164968490601, + "logps/chosen": -65.5699234008789, + "logps/rejected": -54.67262268066406, + "loss": 1.3122, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1578896045684814, + "rewards/margins": 1.8523674011230469, + "rewards/rejected": 1.3055222034454346, + "step": 4526 + }, + { + "epoch": 0.73, + "learning_rate": 7.300578860260978e-06, + "logits/chosen": -1.1143848896026611, + "logits/rejected": -1.0691511631011963, + "logps/chosen": -93.3685531616211, + "logps/rejected": -63.19145202636719, + "loss": 0.6878, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.506964921951294, + "rewards/margins": -0.13956522941589355, + "rewards/rejected": 2.6465301513671875, + "step": 4527 + }, + { + "epoch": 0.73, + "learning_rate": 7.29941190869265e-06, + "logits/chosen": -1.068341612815857, + "logits/rejected": -1.0004407167434692, + "logps/chosen": -91.18446350097656, + "logps/rejected": -60.20686721801758, + "loss": 1.0623, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3533082008361816, + "rewards/margins": -0.1600658893585205, + "rewards/rejected": 2.513374090194702, + "step": 4528 + }, + { + "epoch": 0.74, + "learning_rate": 7.298244798256717e-06, + "logits/chosen": -0.7495597004890442, + "logits/rejected": -0.7495597004890442, + "logps/chosen": -22.866207122802734, + "logps/rejected": -22.866207122802734, + "loss": 0.3631, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6638439297676086, + "rewards/margins": 0.0, + "rewards/rejected": 0.6638439297676086, + "step": 4529 + }, + { + "epoch": 0.74, + "learning_rate": 7.297077529033814e-06, + "logits/chosen": -1.4759474992752075, + "logits/rejected": -1.469240665435791, + "logps/chosen": -130.68997192382812, + "logps/rejected": -55.77894592285156, + "loss": 2.2872, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.744636535644531, + "rewards/margins": 1.9993255138397217, + "rewards/rejected": 3.7453110218048096, + "step": 4530 + }, + { + "epoch": 0.74, + "learning_rate": 7.29591010110459e-06, + "logits/chosen": -1.0682239532470703, + "logits/rejected": -1.0598156452178955, + "logps/chosen": -58.12535095214844, + "logps/rejected": -7.680813789367676, + "loss": 0.3633, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6950401663780212, + "rewards/margins": 0.0374874472618103, + "rewards/rejected": 0.6575527191162109, + "step": 4531 + }, + { + "epoch": 0.74, + "learning_rate": 7.294742514549701e-06, + "logits/chosen": -1.0286482572555542, + "logits/rejected": -1.0290745496749878, + "logps/chosen": -2.0896406173706055, + "logps/rejected": -1.6615790128707886, + "loss": 0.7547, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3811890780925751, + "rewards/margins": 0.07492110133171082, + "rewards/rejected": 0.30626797676086426, + "step": 4532 + }, + { + "epoch": 0.74, + "learning_rate": 7.293574769449818e-06, + "logits/chosen": -1.0638489723205566, + "logits/rejected": -1.0939946174621582, + "logps/chosen": -69.81452941894531, + "logps/rejected": -195.88836669921875, + "loss": 0.4605, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7844529151916504, + "rewards/margins": 1.9731736183166504, + "rewards/rejected": 0.811279296875, + "step": 4533 + }, + { + "epoch": 0.74, + "learning_rate": 7.292406865885619e-06, + "logits/chosen": -1.2229539155960083, + "logits/rejected": -1.2290621995925903, + "logps/chosen": -29.71670913696289, + "logps/rejected": -67.98893737792969, + "loss": 1.1841, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.242858648300171, + "rewards/margins": -1.340040922164917, + "rewards/rejected": 3.582899570465088, + "step": 4534 + }, + { + "epoch": 0.74, + "learning_rate": 7.291238803937799e-06, + "logits/chosen": -1.0072529315948486, + "logits/rejected": -0.9946090579032898, + "logps/chosen": -52.449310302734375, + "logps/rejected": -61.558631896972656, + "loss": 0.7802, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9574952125549316, + "rewards/margins": -0.6975104808807373, + "rewards/rejected": 3.655005693435669, + "step": 4535 + }, + { + "epoch": 0.74, + "learning_rate": 7.290070583687057e-06, + "logits/chosen": -1.1859149932861328, + "logits/rejected": -0.987525999546051, + "logps/chosen": -83.04997253417969, + "logps/rejected": -48.39024353027344, + "loss": 0.1421, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.814735412597656, + "rewards/margins": 3.8769690990448, + "rewards/rejected": 0.9377662539482117, + "step": 4536 + }, + { + "epoch": 0.74, + "learning_rate": 7.2889022052141046e-06, + "logits/chosen": -0.9512184262275696, + "logits/rejected": -0.835951030254364, + "logps/chosen": -45.905723571777344, + "logps/rejected": -32.96799850463867, + "loss": 0.3391, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.245256185531616, + "rewards/margins": 0.8836926221847534, + "rewards/rejected": 1.3615635633468628, + "step": 4537 + }, + { + "epoch": 0.74, + "learning_rate": 7.287733668599669e-06, + "logits/chosen": -1.1179414987564087, + "logits/rejected": -0.9765899181365967, + "logps/chosen": -64.24994659423828, + "logps/rejected": -37.982269287109375, + "loss": 0.3652, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.875817060470581, + "rewards/margins": -0.03594970703125, + "rewards/rejected": 2.911766767501831, + "step": 4538 + }, + { + "epoch": 0.74, + "learning_rate": 7.286564973924484e-06, + "logits/chosen": -1.0753791332244873, + "logits/rejected": -0.9991151690483093, + "logps/chosen": -60.84056854248047, + "logps/rejected": -54.18960189819336, + "loss": 1.6358, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.995668888092041, + "rewards/margins": -1.301276683807373, + "rewards/rejected": 4.296945571899414, + "step": 4539 + }, + { + "epoch": 0.74, + "learning_rate": 7.285396121269293e-06, + "logits/chosen": -1.0486934185028076, + "logits/rejected": -1.0637364387512207, + "logps/chosen": -96.55252838134766, + "logps/rejected": -92.08411407470703, + "loss": 0.3988, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6858673095703125, + "rewards/margins": 0.14310526847839355, + "rewards/rejected": 1.542762041091919, + "step": 4540 + }, + { + "epoch": 0.74, + "learning_rate": 7.284227110714857e-06, + "logits/chosen": -0.9879822134971619, + "logits/rejected": -0.9297524690628052, + "logps/chosen": -98.59154510498047, + "logps/rejected": -97.14845275878906, + "loss": 1.7242, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.048093557357788, + "rewards/margins": -2.8935463428497314, + "rewards/rejected": 5.9416399002075195, + "step": 4541 + }, + { + "epoch": 0.74, + "learning_rate": 7.28305794234194e-06, + "logits/chosen": -1.0901155471801758, + "logits/rejected": -1.0277286767959595, + "logps/chosen": -46.33224868774414, + "logps/rejected": -65.1333236694336, + "loss": 0.3085, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.340778112411499, + "rewards/margins": 0.1845684051513672, + "rewards/rejected": 2.156209707260132, + "step": 4542 + }, + { + "epoch": 0.74, + "learning_rate": 7.281888616231323e-06, + "logits/chosen": -1.2149134874343872, + "logits/rejected": -1.202731966972351, + "logps/chosen": -65.52287292480469, + "logps/rejected": -78.30270385742188, + "loss": 0.7735, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7279961109161377, + "rewards/margins": -1.1199562549591064, + "rewards/rejected": 2.847952365875244, + "step": 4543 + }, + { + "epoch": 0.74, + "learning_rate": 7.280719132463793e-06, + "logits/chosen": -0.7042579650878906, + "logits/rejected": -0.6181289553642273, + "logps/chosen": -33.9012565612793, + "logps/rejected": -50.72847366333008, + "loss": 1.4221, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.430763602256775, + "rewards/margins": -2.2290701866149902, + "rewards/rejected": 3.6598339080810547, + "step": 4544 + }, + { + "epoch": 0.74, + "learning_rate": 7.27954949112015e-06, + "logits/chosen": -1.3804954290390015, + "logits/rejected": -1.283963680267334, + "logps/chosen": -91.31795501708984, + "logps/rejected": -87.92625427246094, + "loss": 0.2882, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5342750549316406, + "rewards/margins": 0.4712791442871094, + "rewards/rejected": 2.0629959106445312, + "step": 4545 + }, + { + "epoch": 0.74, + "learning_rate": 7.278379692281209e-06, + "logits/chosen": -0.9456243515014648, + "logits/rejected": -0.945221483707428, + "logps/chosen": -7.175380706787109, + "logps/rejected": -6.512770175933838, + "loss": 0.354, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4156615436077118, + "rewards/margins": 0.032471537590026855, + "rewards/rejected": 0.38319000601768494, + "step": 4546 + }, + { + "epoch": 0.74, + "learning_rate": 7.277209736027788e-06, + "logits/chosen": -1.214707612991333, + "logits/rejected": -1.1520627737045288, + "logps/chosen": -112.54710388183594, + "logps/rejected": -78.08406066894531, + "loss": 0.3561, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1168441772460938, + "rewards/margins": -0.011491537094116211, + "rewards/rejected": 2.12833571434021, + "step": 4547 + }, + { + "epoch": 0.74, + "learning_rate": 7.276039622440722e-06, + "logits/chosen": -1.302783727645874, + "logits/rejected": -1.2805503606796265, + "logps/chosen": -189.39633178710938, + "logps/rejected": -101.67778015136719, + "loss": 1.7676, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.778146266937256, + "rewards/margins": 4.063982963562012, + "rewards/rejected": 2.714163303375244, + "step": 4548 + }, + { + "epoch": 0.74, + "learning_rate": 7.274869351600853e-06, + "logits/chosen": -1.2198940515518188, + "logits/rejected": -1.2563470602035522, + "logps/chosen": -240.45147705078125, + "logps/rejected": -130.395263671875, + "loss": 0.1701, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.644055366516113, + "rewards/margins": 0.9253115653991699, + "rewards/rejected": 7.718743801116943, + "step": 4549 + }, + { + "epoch": 0.74, + "learning_rate": 7.273698923589038e-06, + "logits/chosen": -1.2327567338943481, + "logits/rejected": -1.1833139657974243, + "logps/chosen": -66.7497787475586, + "logps/rejected": -50.813812255859375, + "loss": 1.1813, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3983147144317627, + "rewards/margins": 0.7841103076934814, + "rewards/rejected": 1.6142044067382812, + "step": 4550 + }, + { + "epoch": 0.74, + "learning_rate": 7.272528338486141e-06, + "logits/chosen": -1.3351722955703735, + "logits/rejected": -1.278450608253479, + "logps/chosen": -131.15347290039062, + "logps/rejected": -76.05724334716797, + "loss": 0.1476, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.1533203125, + "rewards/margins": 1.1687750816345215, + "rewards/rejected": 2.9845452308654785, + "step": 4551 + }, + { + "epoch": 0.74, + "learning_rate": 7.271357596373039e-06, + "logits/chosen": -0.8022750616073608, + "logits/rejected": -0.9105742573738098, + "logps/chosen": -262.9432678222656, + "logps/rejected": -107.2043685913086, + "loss": 0.0416, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.819577217102051, + "rewards/margins": 2.5043387413024902, + "rewards/rejected": 4.3152384757995605, + "step": 4552 + }, + { + "epoch": 0.74, + "learning_rate": 7.270186697330618e-06, + "logits/chosen": -0.9184826016426086, + "logits/rejected": -0.8974273800849915, + "logps/chosen": -45.52177429199219, + "logps/rejected": -47.444122314453125, + "loss": 0.8718, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9868552684783936, + "rewards/margins": -1.5431244373321533, + "rewards/rejected": 4.529979705810547, + "step": 4553 + }, + { + "epoch": 0.74, + "learning_rate": 7.2690156414397775e-06, + "logits/chosen": -0.6804640889167786, + "logits/rejected": -0.6804640889167786, + "logps/chosen": -18.398372650146484, + "logps/rejected": -18.398372650146484, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.49720287322998047, + "rewards/margins": 0.0, + "rewards/rejected": 0.49720287322998047, + "step": 4554 + }, + { + "epoch": 0.74, + "learning_rate": 7.267844428781425e-06, + "logits/chosen": -0.7915049195289612, + "logits/rejected": -0.7915049195289612, + "logps/chosen": -25.347932815551758, + "logps/rejected": -25.347932815551758, + "loss": 0.3803, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3820924758911133, + "rewards/margins": 0.0, + "rewards/rejected": 0.3820924758911133, + "step": 4555 + }, + { + "epoch": 0.74, + "learning_rate": 7.266673059436482e-06, + "logits/chosen": -1.158975601196289, + "logits/rejected": -1.1383839845657349, + "logps/chosen": -77.1110610961914, + "logps/rejected": -61.6538200378418, + "loss": 1.7401, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5366623401641846, + "rewards/margins": -0.0484241247177124, + "rewards/rejected": 1.585086464881897, + "step": 4556 + }, + { + "epoch": 0.74, + "learning_rate": 7.265501533485879e-06, + "logits/chosen": -1.452292561531067, + "logits/rejected": -1.4954302310943604, + "logps/chosen": -122.87339782714844, + "logps/rejected": -128.00022888183594, + "loss": 1.4936, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1328811645507812, + "rewards/margins": -2.9022536277770996, + "rewards/rejected": 6.035134792327881, + "step": 4557 + }, + { + "epoch": 0.74, + "learning_rate": 7.264329851010554e-06, + "logits/chosen": -1.132216453552246, + "logits/rejected": -1.0911319255828857, + "logps/chosen": -75.6567153930664, + "logps/rejected": -74.12246704101562, + "loss": 2.5626, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9143669605255127, + "rewards/margins": -4.346521377563477, + "rewards/rejected": 6.260888576507568, + "step": 4558 + }, + { + "epoch": 0.74, + "learning_rate": 7.263158012091463e-06, + "logits/chosen": -0.6371063590049744, + "logits/rejected": -0.6444323658943176, + "logps/chosen": -1.365614891052246, + "logps/rejected": -2.7447779178619385, + "loss": 0.9207, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21488559246063232, + "rewards/margins": 0.01596979796886444, + "rewards/rejected": 0.19891579449176788, + "step": 4559 + }, + { + "epoch": 0.74, + "learning_rate": 7.261986016809568e-06, + "logits/chosen": -1.1552996635437012, + "logits/rejected": -1.1325602531433105, + "logps/chosen": -72.85582733154297, + "logps/rejected": -71.51417541503906, + "loss": 1.2294, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.423058271408081, + "rewards/margins": -2.3542983531951904, + "rewards/rejected": 4.7773566246032715, + "step": 4560 + }, + { + "epoch": 0.74, + "learning_rate": 7.260813865245842e-06, + "logits/chosen": -1.0551774501800537, + "logits/rejected": -1.007705807685852, + "logps/chosen": -69.8311767578125, + "logps/rejected": -91.15817260742188, + "loss": 0.582, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.664854526519775, + "rewards/margins": -0.6866316795349121, + "rewards/rejected": 7.3514862060546875, + "step": 4561 + }, + { + "epoch": 0.74, + "learning_rate": 7.2596415574812695e-06, + "logits/chosen": -1.4467124938964844, + "logits/rejected": -1.4573490619659424, + "logps/chosen": -187.15118408203125, + "logps/rejected": -119.39234924316406, + "loss": 0.0418, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.327670574188232, + "rewards/margins": 2.6459169387817383, + "rewards/rejected": 3.681753635406494, + "step": 4562 + }, + { + "epoch": 0.74, + "learning_rate": 7.258469093596846e-06, + "logits/chosen": -1.1737726926803589, + "logits/rejected": -1.151687741279602, + "logps/chosen": -21.794570922851562, + "logps/rejected": -2.147198438644409, + "loss": 0.6337, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9987573623657227, + "rewards/margins": 0.421755313873291, + "rewards/rejected": 0.5770020484924316, + "step": 4563 + }, + { + "epoch": 0.74, + "learning_rate": 7.2572964736735786e-06, + "logits/chosen": -0.5195397138595581, + "logits/rejected": -0.5670664310455322, + "logps/chosen": -99.811279296875, + "logps/rejected": -78.82861328125, + "loss": 0.8187, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.687020182609558, + "rewards/margins": -1.3184469938278198, + "rewards/rejected": 3.005467176437378, + "step": 4564 + }, + { + "epoch": 0.74, + "learning_rate": 7.256123697792483e-06, + "logits/chosen": -1.109175443649292, + "logits/rejected": -1.0425455570220947, + "logps/chosen": -44.94915008544922, + "logps/rejected": -64.8672866821289, + "loss": 0.4137, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4611268043518066, + "rewards/margins": -0.11671280860900879, + "rewards/rejected": 2.5778396129608154, + "step": 4565 + }, + { + "epoch": 0.74, + "learning_rate": 7.254950766034589e-06, + "logits/chosen": -1.2434940338134766, + "logits/rejected": -1.1441993713378906, + "logps/chosen": -109.4710922241211, + "logps/rejected": -16.320167541503906, + "loss": 0.0483, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4864296913146973, + "rewards/margins": 2.3121585845947266, + "rewards/rejected": 0.17427101731300354, + "step": 4566 + }, + { + "epoch": 0.74, + "learning_rate": 7.253777678480932e-06, + "logits/chosen": -1.0552862882614136, + "logits/rejected": -0.925003170967102, + "logps/chosen": -167.04815673828125, + "logps/rejected": -126.00505065917969, + "loss": 0.1745, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.528338432312012, + "rewards/margins": 0.8803143501281738, + "rewards/rejected": 7.648024082183838, + "step": 4567 + }, + { + "epoch": 0.74, + "learning_rate": 7.252604435212564e-06, + "logits/chosen": -1.1498732566833496, + "logits/rejected": -1.083959698677063, + "logps/chosen": -74.96359252929688, + "logps/rejected": -31.98151969909668, + "loss": 0.1956, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7351654767990112, + "rewards/margins": 1.0498418807983398, + "rewards/rejected": 0.6853235363960266, + "step": 4568 + }, + { + "epoch": 0.74, + "learning_rate": 7.2514310363105435e-06, + "logits/chosen": -0.9771916270256042, + "logits/rejected": -0.8578182458877563, + "logps/chosen": -28.21247100830078, + "logps/rejected": -22.551328659057617, + "loss": 0.9998, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7168102264404297, + "rewards/margins": 1.7780593633651733, + "rewards/rejected": 0.9387508630752563, + "step": 4569 + }, + { + "epoch": 0.74, + "learning_rate": 7.250257481855941e-06, + "logits/chosen": -0.8328349590301514, + "logits/rejected": -0.8354790806770325, + "logps/chosen": -2.825299024581909, + "logps/rejected": -1.7706822156906128, + "loss": 1.2973, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.28484615683555603, + "rewards/margins": -0.09938442707061768, + "rewards/rejected": 0.3842305839061737, + "step": 4570 + }, + { + "epoch": 0.74, + "learning_rate": 7.249083771929839e-06, + "logits/chosen": -1.0114420652389526, + "logits/rejected": -1.1040033102035522, + "logps/chosen": -55.261383056640625, + "logps/rejected": -91.41012573242188, + "loss": 1.3154, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5995171070098877, + "rewards/margins": -2.248685598373413, + "rewards/rejected": 4.848202705383301, + "step": 4571 + }, + { + "epoch": 0.74, + "learning_rate": 7.24790990661333e-06, + "logits/chosen": -0.903104305267334, + "logits/rejected": -0.8947146534919739, + "logps/chosen": -35.922142028808594, + "logps/rejected": -43.571319580078125, + "loss": 0.3606, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.596874952316284, + "rewards/margins": 0.012775421142578125, + "rewards/rejected": 2.584099531173706, + "step": 4572 + }, + { + "epoch": 0.74, + "learning_rate": 7.246735885987515e-06, + "logits/chosen": -1.1403621435165405, + "logits/rejected": -1.214084267616272, + "logps/chosen": -50.2901611328125, + "logps/rejected": -110.43733978271484, + "loss": 1.1354, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.646005392074585, + "rewards/margins": -2.025348424911499, + "rewards/rejected": 4.671353816986084, + "step": 4573 + }, + { + "epoch": 0.74, + "learning_rate": 7.245561710133511e-06, + "logits/chosen": -1.3109006881713867, + "logits/rejected": -1.299869179725647, + "logps/chosen": -30.24867820739746, + "logps/rejected": -40.351768493652344, + "loss": 0.7598, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.257293224334717, + "rewards/margins": -1.23909592628479, + "rewards/rejected": 3.496389150619507, + "step": 4574 + }, + { + "epoch": 0.74, + "learning_rate": 7.244387379132438e-06, + "logits/chosen": -0.7760980129241943, + "logits/rejected": -0.8286079168319702, + "logps/chosen": -26.125896453857422, + "logps/rejected": -33.08730697631836, + "loss": 0.726, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3579158782958984, + "rewards/margins": -0.7386758327484131, + "rewards/rejected": 3.0965917110443115, + "step": 4575 + }, + { + "epoch": 0.74, + "learning_rate": 7.2432128930654354e-06, + "logits/chosen": -1.2294777631759644, + "logits/rejected": -1.204393744468689, + "logps/chosen": -45.313575744628906, + "logps/rejected": -34.569759368896484, + "loss": 0.3776, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.771101474761963, + "rewards/margins": 1.3798553943634033, + "rewards/rejected": 2.3912460803985596, + "step": 4576 + }, + { + "epoch": 0.74, + "learning_rate": 7.242038252013648e-06, + "logits/chosen": -1.3995015621185303, + "logits/rejected": -1.4004113674163818, + "logps/chosen": -43.93145751953125, + "logps/rejected": -38.97450256347656, + "loss": 1.131, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8218865394592285, + "rewards/margins": -1.360811710357666, + "rewards/rejected": 4.1826982498168945, + "step": 4577 + }, + { + "epoch": 0.74, + "learning_rate": 7.24086345605823e-06, + "logits/chosen": -1.0516690015792847, + "logits/rejected": -1.0685662031173706, + "logps/chosen": -57.47289276123047, + "logps/rejected": -129.7274169921875, + "loss": 1.4055, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3916420042514801, + "rewards/margins": -1.8532525300979614, + "rewards/rejected": 2.244894504547119, + "step": 4578 + }, + { + "epoch": 0.74, + "learning_rate": 7.239688505280351e-06, + "logits/chosen": -0.7483472228050232, + "logits/rejected": -0.7714723944664001, + "logps/chosen": -40.34184646606445, + "logps/rejected": -79.37373352050781, + "loss": 0.2572, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3500133752822876, + "rewards/margins": 0.4358936548233032, + "rewards/rejected": 0.9141197204589844, + "step": 4579 + }, + { + "epoch": 0.74, + "learning_rate": 7.238513399761189e-06, + "logits/chosen": -0.9768766164779663, + "logits/rejected": -0.9768766164779663, + "logps/chosen": -17.618793487548828, + "logps/rejected": -17.618793487548828, + "loss": 0.5145, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2906988263130188, + "rewards/margins": 0.0, + "rewards/rejected": 0.2906988263130188, + "step": 4580 + }, + { + "epoch": 0.74, + "learning_rate": 7.237338139581932e-06, + "logits/chosen": -0.9698712825775146, + "logits/rejected": -0.9735144972801208, + "logps/chosen": -3.1230990886688232, + "logps/rejected": -13.111818313598633, + "loss": 0.5025, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.32568123936653137, + "rewards/margins": -0.4389781057834625, + "rewards/rejected": 0.7646593451499939, + "step": 4581 + }, + { + "epoch": 0.74, + "learning_rate": 7.2361627248237795e-06, + "logits/chosen": -1.2531551122665405, + "logits/rejected": -1.2964504957199097, + "logps/chosen": -62.662879943847656, + "logps/rejected": -94.59262084960938, + "loss": 1.4742, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1718850135803223, + "rewards/margins": -2.6929407119750977, + "rewards/rejected": 5.86482572555542, + "step": 4582 + }, + { + "epoch": 0.74, + "learning_rate": 7.234987155567941e-06, + "logits/chosen": -0.9116156697273254, + "logits/rejected": -0.9116156697273254, + "logps/chosen": -30.536481857299805, + "logps/rejected": -30.536481857299805, + "loss": 0.5136, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7579416632652283, + "rewards/margins": 0.0, + "rewards/rejected": 0.7579416632652283, + "step": 4583 + }, + { + "epoch": 0.74, + "learning_rate": 7.233811431895639e-06, + "logits/chosen": -1.3107795715332031, + "logits/rejected": -1.3436287641525269, + "logps/chosen": -141.0684814453125, + "logps/rejected": -160.11093139648438, + "loss": 1.4045, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.360250949859619, + "rewards/margins": -2.734363079071045, + "rewards/rejected": 8.094614028930664, + "step": 4584 + }, + { + "epoch": 0.74, + "learning_rate": 7.232635553888101e-06, + "logits/chosen": -1.1239004135131836, + "logits/rejected": -1.2738441228866577, + "logps/chosen": -65.57940673828125, + "logps/rejected": -120.11927032470703, + "loss": 0.9297, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5185532569885254, + "rewards/margins": -1.2657265663146973, + "rewards/rejected": 4.784279823303223, + "step": 4585 + }, + { + "epoch": 0.74, + "learning_rate": 7.231459521626574e-06, + "logits/chosen": -0.8039199709892273, + "logits/rejected": -0.8039199709892273, + "logps/chosen": -26.90561866760254, + "logps/rejected": -26.90561866760254, + "loss": 1.3507, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0612542629241943, + "rewards/margins": 0.0, + "rewards/rejected": 2.0612542629241943, + "step": 4586 + }, + { + "epoch": 0.74, + "learning_rate": 7.230283335192307e-06, + "logits/chosen": -0.5336661338806152, + "logits/rejected": -0.4686485826969147, + "logps/chosen": -44.35509490966797, + "logps/rejected": -62.395286560058594, + "loss": 0.4686, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.713387370109558, + "rewards/margins": -0.36427605152130127, + "rewards/rejected": 2.0776634216308594, + "step": 4587 + }, + { + "epoch": 0.74, + "learning_rate": 7.229106994666564e-06, + "logits/chosen": -0.7923916578292847, + "logits/rejected": -0.8017273545265198, + "logps/chosen": -55.273441314697266, + "logps/rejected": -78.79735565185547, + "loss": 0.749, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4695758819580078, + "rewards/margins": -1.0363101959228516, + "rewards/rejected": 2.5058860778808594, + "step": 4588 + }, + { + "epoch": 0.74, + "learning_rate": 7.227930500130621e-06, + "logits/chosen": -0.9508748054504395, + "logits/rejected": -0.9062403440475464, + "logps/chosen": -50.0221061706543, + "logps/rejected": -23.42013931274414, + "loss": 0.3908, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.520077109336853, + "rewards/margins": 1.3389625549316406, + "rewards/rejected": 0.1811145842075348, + "step": 4589 + }, + { + "epoch": 0.75, + "learning_rate": 7.226753851665761e-06, + "logits/chosen": -1.2547976970672607, + "logits/rejected": -1.3098398447036743, + "logps/chosen": -166.8768310546875, + "logps/rejected": -150.27975463867188, + "loss": 0.2934, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.62078857421875, + "rewards/margins": 0.3118925094604492, + "rewards/rejected": 6.308896064758301, + "step": 4590 + }, + { + "epoch": 0.75, + "learning_rate": 7.22557704935328e-06, + "logits/chosen": -0.8940562009811401, + "logits/rejected": -0.7824411392211914, + "logps/chosen": -58.22935104370117, + "logps/rejected": -22.70069122314453, + "loss": 0.319, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.266589879989624, + "rewards/margins": 2.103682041168213, + "rewards/rejected": 0.16290779411792755, + "step": 4591 + }, + { + "epoch": 0.75, + "learning_rate": 7.224400093274483e-06, + "logits/chosen": -1.0309778451919556, + "logits/rejected": -0.9926586151123047, + "logps/chosen": -31.45254135131836, + "logps/rejected": -45.062774658203125, + "loss": 1.5003, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.624277114868164, + "rewards/margins": -1.4363696575164795, + "rewards/rejected": 3.0606467723846436, + "step": 4592 + }, + { + "epoch": 0.75, + "learning_rate": 7.223222983510687e-06, + "logits/chosen": -1.1503677368164062, + "logits/rejected": -0.9106374979019165, + "logps/chosen": -84.05067443847656, + "logps/rejected": -27.356250762939453, + "loss": 0.0635, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4642302989959717, + "rewards/margins": 2.6194918155670166, + "rewards/rejected": 0.8447384238243103, + "step": 4593 + }, + { + "epoch": 0.75, + "learning_rate": 7.22204572014322e-06, + "logits/chosen": -0.8939138650894165, + "logits/rejected": -0.8537652492523193, + "logps/chosen": -92.07756042480469, + "logps/rejected": -74.21928405761719, + "loss": 0.296, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7308220863342285, + "rewards/margins": 1.4473869800567627, + "rewards/rejected": 1.2834351062774658, + "step": 4594 + }, + { + "epoch": 0.75, + "learning_rate": 7.2208683032534175e-06, + "logits/chosen": -1.1398316621780396, + "logits/rejected": -0.9256826639175415, + "logps/chosen": -62.612728118896484, + "logps/rejected": -86.48377990722656, + "loss": 0.016, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.922081470489502, + "rewards/margins": 3.6876683235168457, + "rewards/rejected": 2.2344131469726562, + "step": 4595 + }, + { + "epoch": 0.75, + "learning_rate": 7.219690732922631e-06, + "logits/chosen": -1.0051084756851196, + "logits/rejected": -1.0051084756851196, + "logps/chosen": -103.54926300048828, + "logps/rejected": -103.54926300048828, + "loss": 1.4604, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3267784118652344, + "rewards/margins": 0.0, + "rewards/rejected": 1.3267784118652344, + "step": 4596 + }, + { + "epoch": 0.75, + "learning_rate": 7.218513009232216e-06, + "logits/chosen": -1.1983002424240112, + "logits/rejected": -1.1847859621047974, + "logps/chosen": -51.90006637573242, + "logps/rejected": -66.08114624023438, + "loss": 0.5319, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2946643829345703, + "rewards/margins": -0.5686931610107422, + "rewards/rejected": 1.8633575439453125, + "step": 4597 + }, + { + "epoch": 0.75, + "learning_rate": 7.217335132263545e-06, + "logits/chosen": -0.8272575736045837, + "logits/rejected": -0.8372445106506348, + "logps/chosen": -27.485761642456055, + "logps/rejected": -68.42879486083984, + "loss": 0.5838, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.30784663558006287, + "rewards/margins": -0.14783117175102234, + "rewards/rejected": 0.4556778073310852, + "step": 4598 + }, + { + "epoch": 0.75, + "learning_rate": 7.216157102097997e-06, + "logits/chosen": -0.7683327198028564, + "logits/rejected": -0.830672562122345, + "logps/chosen": -58.38478088378906, + "logps/rejected": -52.56507873535156, + "loss": 0.9651, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.512427568435669, + "rewards/margins": -0.7009284496307373, + "rewards/rejected": 4.213356018066406, + "step": 4599 + }, + { + "epoch": 0.75, + "learning_rate": 7.214978918816962e-06, + "logits/chosen": -0.9510802030563354, + "logits/rejected": -0.9328389167785645, + "logps/chosen": -46.92605209350586, + "logps/rejected": -127.03541564941406, + "loss": 0.4571, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.743703842163086, + "rewards/margins": 1.397141695022583, + "rewards/rejected": 0.3465622067451477, + "step": 4600 + }, + { + "epoch": 0.75, + "learning_rate": 7.213800582501843e-06, + "logits/chosen": -1.4024691581726074, + "logits/rejected": -1.294716715812683, + "logps/chosen": -78.35636138916016, + "logps/rejected": -52.86285400390625, + "loss": 0.5832, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.842352390289307, + "rewards/margins": 1.8989410400390625, + "rewards/rejected": 2.943411350250244, + "step": 4601 + }, + { + "epoch": 0.75, + "learning_rate": 7.212622093234049e-06, + "logits/chosen": -1.1738300323486328, + "logits/rejected": -1.3356181383132935, + "logps/chosen": -59.61628723144531, + "logps/rejected": -114.39285278320312, + "loss": 1.7167, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.518247365951538, + "rewards/margins": -1.4400322437286377, + "rewards/rejected": 4.958279609680176, + "step": 4602 + }, + { + "epoch": 0.75, + "learning_rate": 7.211443451095007e-06, + "logits/chosen": -1.2019240856170654, + "logits/rejected": -1.276423454284668, + "logps/chosen": -81.66504669189453, + "logps/rejected": -97.58037567138672, + "loss": 0.5961, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2600791454315186, + "rewards/margins": -0.8293321132659912, + "rewards/rejected": 4.08941125869751, + "step": 4603 + }, + { + "epoch": 0.75, + "learning_rate": 7.210264656166146e-06, + "logits/chosen": -1.1934581995010376, + "logits/rejected": -1.165022611618042, + "logps/chosen": -60.7960090637207, + "logps/rejected": -70.96539306640625, + "loss": 0.4755, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9138011932373047, + "rewards/margins": 0.1938793659210205, + "rewards/rejected": 2.719921827316284, + "step": 4604 + }, + { + "epoch": 0.75, + "learning_rate": 7.2090857085289115e-06, + "logits/chosen": -0.4612613022327423, + "logits/rejected": -0.4890420734882355, + "logps/chosen": -11.241912841796875, + "logps/rejected": -48.0767707824707, + "loss": 0.564, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.20524311065673828, + "rewards/margins": -0.7227270007133484, + "rewards/rejected": 0.9279701113700867, + "step": 4605 + }, + { + "epoch": 0.75, + "learning_rate": 7.2079066082647566e-06, + "logits/chosen": -1.2283549308776855, + "logits/rejected": -1.2051832675933838, + "logps/chosen": -61.15425109863281, + "logps/rejected": -62.11039352416992, + "loss": 0.8612, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8249748945236206, + "rewards/margins": 0.17829322814941406, + "rewards/rejected": 1.6466816663742065, + "step": 4606 + }, + { + "epoch": 0.75, + "learning_rate": 7.206727355455147e-06, + "logits/chosen": -1.238621711730957, + "logits/rejected": -1.2061936855316162, + "logps/chosen": -37.8948974609375, + "logps/rejected": -42.15777587890625, + "loss": 0.5863, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.735427141189575, + "rewards/margins": 0.5788702964782715, + "rewards/rejected": 2.1565568447113037, + "step": 4607 + }, + { + "epoch": 0.75, + "learning_rate": 7.205547950181556e-06, + "logits/chosen": -1.3801183700561523, + "logits/rejected": -1.2821217775344849, + "logps/chosen": -80.3992919921875, + "logps/rejected": -37.947898864746094, + "loss": 0.161, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.664203643798828, + "rewards/margins": 1.2288222312927246, + "rewards/rejected": 3.4353814125061035, + "step": 4608 + }, + { + "epoch": 0.75, + "learning_rate": 7.204368392525472e-06, + "logits/chosen": -0.9588225483894348, + "logits/rejected": -0.8888917565345764, + "logps/chosen": -99.92050170898438, + "logps/rejected": -49.11582946777344, + "loss": 0.4052, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.095532417297363, + "rewards/margins": -0.009005546569824219, + "rewards/rejected": 5.1045379638671875, + "step": 4609 + }, + { + "epoch": 0.75, + "learning_rate": 7.20318868256839e-06, + "logits/chosen": -1.1806998252868652, + "logits/rejected": -1.2291353940963745, + "logps/chosen": -53.32191848754883, + "logps/rejected": -40.96345901489258, + "loss": 0.7156, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.611502468585968, + "rewards/margins": -1.0154869556427002, + "rewards/rejected": 1.6269893646240234, + "step": 4610 + }, + { + "epoch": 0.75, + "learning_rate": 7.202008820391817e-06, + "logits/chosen": -1.0902769565582275, + "logits/rejected": -1.0654289722442627, + "logps/chosen": -33.939308166503906, + "logps/rejected": -13.889053344726562, + "loss": 0.3035, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1583389043807983, + "rewards/margins": 0.18530195951461792, + "rewards/rejected": 0.9730369448661804, + "step": 4611 + }, + { + "epoch": 0.75, + "learning_rate": 7.20082880607727e-06, + "logits/chosen": -1.3514900207519531, + "logits/rejected": -1.39883553981781, + "logps/chosen": -73.29806518554688, + "logps/rejected": -71.16918182373047, + "loss": 0.6686, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0856903791427612, + "rewards/margins": -0.9561728239059448, + "rewards/rejected": 2.041863203048706, + "step": 4612 + }, + { + "epoch": 0.75, + "learning_rate": 7.199648639706276e-06, + "logits/chosen": -1.0493117570877075, + "logits/rejected": -1.0680019855499268, + "logps/chosen": -78.76087951660156, + "logps/rejected": -67.75056457519531, + "loss": 0.4514, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.274676561355591, + "rewards/margins": -0.3669724464416504, + "rewards/rejected": 2.641649007797241, + "step": 4613 + }, + { + "epoch": 0.75, + "learning_rate": 7.198468321360376e-06, + "logits/chosen": -1.5318363904953003, + "logits/rejected": -1.2805595397949219, + "logps/chosen": -82.15792846679688, + "logps/rejected": -57.43323516845703, + "loss": 0.8874, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.856771945953369, + "rewards/margins": 2.995318651199341, + "rewards/rejected": 1.8614532947540283, + "step": 4614 + }, + { + "epoch": 0.75, + "learning_rate": 7.197287851121115e-06, + "logits/chosen": -1.0557754039764404, + "logits/rejected": -0.8894128203392029, + "logps/chosen": -111.24028015136719, + "logps/rejected": -32.74522399902344, + "loss": 0.1321, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.798442363739014, + "rewards/margins": 2.3806300163269043, + "rewards/rejected": 2.4178123474121094, + "step": 4615 + }, + { + "epoch": 0.75, + "learning_rate": 7.196107229070055e-06, + "logits/chosen": -1.1663823127746582, + "logits/rejected": -1.1179414987564087, + "logps/chosen": -34.72677230834961, + "logps/rejected": -92.00618743896484, + "loss": 1.0603, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3469581604003906, + "rewards/margins": 0.6175400018692017, + "rewards/rejected": 1.729418158531189, + "step": 4616 + }, + { + "epoch": 0.75, + "learning_rate": 7.194926455288766e-06, + "logits/chosen": -0.9158944487571716, + "logits/rejected": -0.9647402167320251, + "logps/chosen": -49.276248931884766, + "logps/rejected": -87.60086822509766, + "loss": 0.6648, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7320408821105957, + "rewards/margins": 0.15227103233337402, + "rewards/rejected": 2.5797698497772217, + "step": 4617 + }, + { + "epoch": 0.75, + "learning_rate": 7.193745529858827e-06, + "logits/chosen": -0.7442492246627808, + "logits/rejected": -0.6356205344200134, + "logps/chosen": -30.78396987915039, + "logps/rejected": -15.285441398620605, + "loss": 1.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.107407331466675, + "rewards/margins": 0.27007555961608887, + "rewards/rejected": 1.837331771850586, + "step": 4618 + }, + { + "epoch": 0.75, + "learning_rate": 7.192564452861829e-06, + "logits/chosen": -1.4147586822509766, + "logits/rejected": -1.30230712890625, + "logps/chosen": -175.4984130859375, + "logps/rejected": -43.48006820678711, + "loss": 0.4767, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.775482177734375, + "rewards/margins": 1.5259959697723389, + "rewards/rejected": 3.249486207962036, + "step": 4619 + }, + { + "epoch": 0.75, + "learning_rate": 7.191383224379374e-06, + "logits/chosen": -1.2004915475845337, + "logits/rejected": -1.1688953638076782, + "logps/chosen": -93.59347534179688, + "logps/rejected": -78.6715087890625, + "loss": 1.3689, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0557236671447754, + "rewards/margins": -2.1911802291870117, + "rewards/rejected": 4.246903896331787, + "step": 4620 + }, + { + "epoch": 0.75, + "learning_rate": 7.190201844493073e-06, + "logits/chosen": -1.1530500650405884, + "logits/rejected": -1.1219406127929688, + "logps/chosen": -45.00489807128906, + "logps/rejected": -77.63921356201172, + "loss": 0.7234, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.164876699447632, + "rewards/margins": -1.1776397228240967, + "rewards/rejected": 4.3425164222717285, + "step": 4621 + }, + { + "epoch": 0.75, + "learning_rate": 7.189020313284549e-06, + "logits/chosen": -0.9904966354370117, + "logits/rejected": -0.9333704113960266, + "logps/chosen": -77.81547546386719, + "logps/rejected": -26.041542053222656, + "loss": 1.6146, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7792938947677612, + "rewards/margins": 1.307230830192566, + "rewards/rejected": 0.4720630645751953, + "step": 4622 + }, + { + "epoch": 0.75, + "learning_rate": 7.1878386308354334e-06, + "logits/chosen": -1.091690182685852, + "logits/rejected": -1.1167997121810913, + "logps/chosen": -59.778053283691406, + "logps/rejected": -148.5334930419922, + "loss": 0.6428, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4918373823165894, + "rewards/margins": -0.42575907707214355, + "rewards/rejected": 1.917596459388733, + "step": 4623 + }, + { + "epoch": 0.75, + "learning_rate": 7.186656797227371e-06, + "logits/chosen": -1.1246416568756104, + "logits/rejected": -1.1246416568756104, + "logps/chosen": -44.322425842285156, + "logps/rejected": -44.322425842285156, + "loss": 0.3628, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.454205274581909, + "rewards/margins": 0.0, + "rewards/rejected": 2.454205274581909, + "step": 4624 + }, + { + "epoch": 0.75, + "learning_rate": 7.185474812542013e-06, + "logits/chosen": -0.63698410987854, + "logits/rejected": -0.615287721157074, + "logps/chosen": -41.00211715698242, + "logps/rejected": -78.26615905761719, + "loss": 2.6337, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.698219656944275, + "rewards/margins": 0.24922513961791992, + "rewards/rejected": 1.448994517326355, + "step": 4625 + }, + { + "epoch": 0.75, + "learning_rate": 7.184292676861024e-06, + "logits/chosen": -0.6703947186470032, + "logits/rejected": -0.6665577292442322, + "logps/chosen": -41.795528411865234, + "logps/rejected": -38.823570251464844, + "loss": 0.7857, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.392599105834961, + "rewards/margins": -0.5643215179443359, + "rewards/rejected": 2.956920623779297, + "step": 4626 + }, + { + "epoch": 0.75, + "learning_rate": 7.183110390266081e-06, + "logits/chosen": -0.8572003245353699, + "logits/rejected": -0.8419802188873291, + "logps/chosen": -32.4081916809082, + "logps/rejected": -22.537302017211914, + "loss": 0.7965, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1894271820783615, + "rewards/margins": -0.36621588468551636, + "rewards/rejected": 0.5556430816650391, + "step": 4627 + }, + { + "epoch": 0.75, + "learning_rate": 7.181927952838865e-06, + "logits/chosen": -1.3471323251724243, + "logits/rejected": -1.3617886304855347, + "logps/chosen": -78.53295135498047, + "logps/rejected": -68.98011779785156, + "loss": 0.6832, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8347610235214233, + "rewards/margins": -0.6983047723770142, + "rewards/rejected": 2.5330657958984375, + "step": 4628 + }, + { + "epoch": 0.75, + "learning_rate": 7.180745364661075e-06, + "logits/chosen": -0.881374716758728, + "logits/rejected": -0.8866407871246338, + "logps/chosen": -10.60384750366211, + "logps/rejected": -2.0679562091827393, + "loss": 0.4162, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.09317226707935333, + "rewards/margins": -0.26040083169937134, + "rewards/rejected": 0.35357311367988586, + "step": 4629 + }, + { + "epoch": 0.75, + "learning_rate": 7.179562625814414e-06, + "logits/chosen": -0.7352977395057678, + "logits/rejected": -0.7352977395057678, + "logps/chosen": -26.407930374145508, + "logps/rejected": -26.407930374145508, + "loss": 0.3495, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.445285677909851, + "rewards/margins": 0.0, + "rewards/rejected": 1.445285677909851, + "step": 4630 + }, + { + "epoch": 0.75, + "learning_rate": 7.178379736380597e-06, + "logits/chosen": -1.060476541519165, + "logits/rejected": -1.2736990451812744, + "logps/chosen": -45.92436218261719, + "logps/rejected": -106.571533203125, + "loss": 3.1032, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6106979846954346, + "rewards/margins": -5.050447463989258, + "rewards/rejected": 7.661145210266113, + "step": 4631 + }, + { + "epoch": 0.75, + "learning_rate": 7.1771966964413545e-06, + "logits/chosen": -1.1122759580612183, + "logits/rejected": -1.0930598974227905, + "logps/chosen": -91.91615295410156, + "logps/rejected": -90.10945892333984, + "loss": 1.9, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2304054498672485, + "rewards/margins": -2.9182848930358887, + "rewards/rejected": 4.148690223693848, + "step": 4632 + }, + { + "epoch": 0.75, + "learning_rate": 7.17601350607842e-06, + "logits/chosen": -1.0667527914047241, + "logits/rejected": -1.0656096935272217, + "logps/chosen": -49.65674591064453, + "logps/rejected": -101.39347839355469, + "loss": 1.0495, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0142173767089844, + "rewards/margins": 0.27491605281829834, + "rewards/rejected": 1.739301323890686, + "step": 4633 + }, + { + "epoch": 0.75, + "learning_rate": 7.174830165373542e-06, + "logits/chosen": -1.0656373500823975, + "logits/rejected": -1.0653094053268433, + "logps/chosen": -22.26972007751465, + "logps/rejected": -71.39320373535156, + "loss": 0.3502, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5668855905532837, + "rewards/margins": 0.3378725051879883, + "rewards/rejected": 1.2290130853652954, + "step": 4634 + }, + { + "epoch": 0.75, + "learning_rate": 7.173646674408479e-06, + "logits/chosen": -1.3115365505218506, + "logits/rejected": -0.9931527376174927, + "logps/chosen": -85.40313720703125, + "logps/rejected": -78.76075744628906, + "loss": 1.7701, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.458264112472534, + "rewards/margins": -1.9621851444244385, + "rewards/rejected": 4.420449256896973, + "step": 4635 + }, + { + "epoch": 0.75, + "learning_rate": 7.172463033264997e-06, + "logits/chosen": -0.9804337620735168, + "logits/rejected": -1.0734952688217163, + "logps/chosen": -164.3924560546875, + "logps/rejected": -56.550323486328125, + "loss": 0.0823, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.567822456359863, + "rewards/margins": 1.9579203128814697, + "rewards/rejected": 2.6099021434783936, + "step": 4636 + }, + { + "epoch": 0.75, + "learning_rate": 7.171279242024876e-06, + "logits/chosen": -0.8692879676818848, + "logits/rejected": -0.8905484676361084, + "logps/chosen": -132.8440399169922, + "logps/rejected": -75.62650299072266, + "loss": 1.4518, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7683396339416504, + "rewards/margins": -0.27159571647644043, + "rewards/rejected": 3.039935350418091, + "step": 4637 + }, + { + "epoch": 0.75, + "learning_rate": 7.1700953007699035e-06, + "logits/chosen": -1.0094468593597412, + "logits/rejected": -1.0076121091842651, + "logps/chosen": -75.65087890625, + "logps/rejected": -76.83795166015625, + "loss": 0.9099, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5047378540039062, + "rewards/margins": -1.6422209739685059, + "rewards/rejected": 4.146958827972412, + "step": 4638 + }, + { + "epoch": 0.75, + "learning_rate": 7.168911209581879e-06, + "logits/chosen": -1.1676952838897705, + "logits/rejected": -1.1795663833618164, + "logps/chosen": -105.53892517089844, + "logps/rejected": -48.86300277709961, + "loss": 1.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3471176624298096, + "rewards/margins": 1.2418360710144043, + "rewards/rejected": 2.1052815914154053, + "step": 4639 + }, + { + "epoch": 0.75, + "learning_rate": 7.167726968542613e-06, + "logits/chosen": -0.3846064805984497, + "logits/rejected": -0.3846064805984497, + "logps/chosen": -37.11005783081055, + "logps/rejected": -37.11005783081055, + "loss": 0.5103, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6895519495010376, + "rewards/margins": 0.0, + "rewards/rejected": 0.6895519495010376, + "step": 4640 + }, + { + "epoch": 0.75, + "learning_rate": 7.166542577733925e-06, + "logits/chosen": -1.0938022136688232, + "logits/rejected": -1.0729738473892212, + "logps/chosen": -51.0267448425293, + "logps/rejected": -31.130563735961914, + "loss": 3.0783, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8863842487335205, + "rewards/margins": -0.857508659362793, + "rewards/rejected": 2.7438929080963135, + "step": 4641 + }, + { + "epoch": 0.75, + "learning_rate": 7.165358037237644e-06, + "logits/chosen": -1.3525183200836182, + "logits/rejected": -1.3696870803833008, + "logps/chosen": -13.783000946044922, + "logps/rejected": -17.972373962402344, + "loss": 0.7195, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5667735934257507, + "rewards/margins": -1.1553668975830078, + "rewards/rejected": 1.7221405506134033, + "step": 4642 + }, + { + "epoch": 0.75, + "learning_rate": 7.1641733471356124e-06, + "logits/chosen": -0.8242707848548889, + "logits/rejected": -0.8167340159416199, + "logps/chosen": -21.514209747314453, + "logps/rejected": -3.850649118423462, + "loss": 1.4288, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46701622009277344, + "rewards/margins": 0.15150004625320435, + "rewards/rejected": 0.3155161738395691, + "step": 4643 + }, + { + "epoch": 0.75, + "learning_rate": 7.162988507509681e-06, + "logits/chosen": -0.866155743598938, + "logits/rejected": -0.8489328026771545, + "logps/chosen": -39.09849548339844, + "logps/rejected": -40.68779373168945, + "loss": 0.8829, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3009560108184814, + "rewards/margins": 1.0079517364501953, + "rewards/rejected": 1.2930042743682861, + "step": 4644 + }, + { + "epoch": 0.75, + "learning_rate": 7.161803518441708e-06, + "logits/chosen": -1.4073764085769653, + "logits/rejected": -1.3436652421951294, + "logps/chosen": -81.29475402832031, + "logps/rejected": -32.565608978271484, + "loss": 0.2147, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8986961841583252, + "rewards/margins": 0.7237430810928345, + "rewards/rejected": 1.1749531030654907, + "step": 4645 + }, + { + "epoch": 0.75, + "learning_rate": 7.160618380013568e-06, + "logits/chosen": -1.3893041610717773, + "logits/rejected": -1.2635270357131958, + "logps/chosen": -93.64008331298828, + "logps/rejected": -51.13318634033203, + "loss": 0.2384, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0205161571502686, + "rewards/margins": 0.5563613176345825, + "rewards/rejected": 1.464154839515686, + "step": 4646 + }, + { + "epoch": 0.75, + "learning_rate": 7.159433092307142e-06, + "logits/chosen": -1.2085862159729004, + "logits/rejected": -0.9957360625267029, + "logps/chosen": -126.34601593017578, + "logps/rejected": -63.952701568603516, + "loss": 0.3628, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.686077117919922, + "rewards/margins": 3.4139902591705322, + "rewards/rejected": 2.2720868587493896, + "step": 4647 + }, + { + "epoch": 0.75, + "learning_rate": 7.158247655404321e-06, + "logits/chosen": -1.250758171081543, + "logits/rejected": -1.170599341392517, + "logps/chosen": -133.0086669921875, + "logps/rejected": -65.98420715332031, + "loss": 0.3009, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.440822124481201, + "rewards/margins": 2.8938043117523193, + "rewards/rejected": 3.547017812728882, + "step": 4648 + }, + { + "epoch": 0.75, + "learning_rate": 7.157062069387009e-06, + "logits/chosen": -0.6685304641723633, + "logits/rejected": -0.6774255633354187, + "logps/chosen": -22.78618621826172, + "logps/rejected": -19.42438507080078, + "loss": 0.3233, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37060967087745667, + "rewards/margins": 0.1551361083984375, + "rewards/rejected": 0.21547356247901917, + "step": 4649 + }, + { + "epoch": 0.75, + "learning_rate": 7.155876334337119e-06, + "logits/chosen": -1.218267560005188, + "logits/rejected": -1.225395917892456, + "logps/chosen": -82.5212173461914, + "logps/rejected": -106.89419555664062, + "loss": 1.2163, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7680625915527344, + "rewards/margins": -1.0076866149902344, + "rewards/rejected": 2.7757492065429688, + "step": 4650 + }, + { + "epoch": 0.75, + "learning_rate": 7.154690450336573e-06, + "logits/chosen": -0.8981297612190247, + "logits/rejected": -0.7776532769203186, + "logps/chosen": -30.840307235717773, + "logps/rejected": -59.272056579589844, + "loss": 0.0713, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.478034734725952, + "rewards/margins": 2.20504093170166, + "rewards/rejected": 0.27299386262893677, + "step": 4651 + }, + { + "epoch": 0.76, + "learning_rate": 7.153504417467305e-06, + "logits/chosen": -0.9964144825935364, + "logits/rejected": -1.1774482727050781, + "logps/chosen": -67.21232604980469, + "logps/rejected": -168.85171508789062, + "loss": 3.4162, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8275688290596008, + "rewards/margins": -6.137009143829346, + "rewards/rejected": 6.964578151702881, + "step": 4652 + }, + { + "epoch": 0.76, + "learning_rate": 7.152318235811257e-06, + "logits/chosen": -1.2372385263442993, + "logits/rejected": -1.2521872520446777, + "logps/chosen": -37.26204299926758, + "logps/rejected": -51.961265563964844, + "loss": 0.3971, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.022351026535034, + "rewards/margins": 0.1034461259841919, + "rewards/rejected": 1.9189049005508423, + "step": 4653 + }, + { + "epoch": 0.76, + "learning_rate": 7.151131905450386e-06, + "logits/chosen": -1.2412443161010742, + "logits/rejected": -1.241377353668213, + "logps/chosen": -50.85439682006836, + "logps/rejected": -90.41808319091797, + "loss": 0.4662, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.252718687057495, + "rewards/margins": -0.2995579242706299, + "rewards/rejected": 2.552276611328125, + "step": 4654 + }, + { + "epoch": 0.76, + "learning_rate": 7.149945426466654e-06, + "logits/chosen": -1.0466653108596802, + "logits/rejected": -0.7823995351791382, + "logps/chosen": -112.06868743896484, + "logps/rejected": -15.874271392822266, + "loss": 0.1785, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.254624128341675, + "rewards/margins": 2.0522894859313965, + "rewards/rejected": 1.2023347616195679, + "step": 4655 + }, + { + "epoch": 0.76, + "learning_rate": 7.148758798942037e-06, + "logits/chosen": -1.1235120296478271, + "logits/rejected": -1.131436824798584, + "logps/chosen": -70.14351654052734, + "logps/rejected": -97.25970458984375, + "loss": 0.5777, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0406386852264404, + "rewards/margins": -0.7744722366333008, + "rewards/rejected": 2.815110921859741, + "step": 4656 + }, + { + "epoch": 0.76, + "learning_rate": 7.147572022958518e-06, + "logits/chosen": -1.295843243598938, + "logits/rejected": -1.3807449340820312, + "logps/chosen": -157.2158660888672, + "logps/rejected": -118.59647369384766, + "loss": 3.3443, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8115158081054688, + "rewards/margins": -4.764328956604004, + "rewards/rejected": 7.575844764709473, + "step": 4657 + }, + { + "epoch": 0.76, + "learning_rate": 7.146385098598092e-06, + "logits/chosen": -0.9298610091209412, + "logits/rejected": -0.9303746819496155, + "logps/chosen": -60.70856475830078, + "logps/rejected": -91.90507507324219, + "loss": 0.2459, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4480340480804443, + "rewards/margins": 0.4681375026702881, + "rewards/rejected": 1.9798965454101562, + "step": 4658 + }, + { + "epoch": 0.76, + "learning_rate": 7.145198025942765e-06, + "logits/chosen": -1.1337149143218994, + "logits/rejected": -1.0601670742034912, + "logps/chosen": -50.24323654174805, + "logps/rejected": -61.23136901855469, + "loss": 0.1705, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.112278461456299, + "rewards/margins": 1.001697301864624, + "rewards/rejected": 3.110581159591675, + "step": 4659 + }, + { + "epoch": 0.76, + "learning_rate": 7.144010805074554e-06, + "logits/chosen": -0.9001777172088623, + "logits/rejected": -0.816514790058136, + "logps/chosen": -159.328125, + "logps/rejected": -22.793045043945312, + "loss": 0.0556, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.536492824554443, + "rewards/margins": 4.3550920486450195, + "rewards/rejected": 0.18140088021755219, + "step": 4660 + }, + { + "epoch": 0.76, + "learning_rate": 7.142823436075482e-06, + "logits/chosen": -1.018721580505371, + "logits/rejected": -1.018721580505371, + "logps/chosen": -21.40857696533203, + "logps/rejected": -21.40857696533203, + "loss": 0.9964, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5401912927627563, + "rewards/margins": 0.0, + "rewards/rejected": 1.5401912927627563, + "step": 4661 + }, + { + "epoch": 0.76, + "learning_rate": 7.141635919027586e-06, + "logits/chosen": -1.1409075260162354, + "logits/rejected": -1.0835050344467163, + "logps/chosen": -57.69826889038086, + "logps/rejected": -64.84645080566406, + "loss": 0.3252, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7473514080047607, + "rewards/margins": 0.5385637283325195, + "rewards/rejected": 3.208787679672241, + "step": 4662 + }, + { + "epoch": 0.76, + "learning_rate": 7.140448254012912e-06, + "logits/chosen": -1.379022240638733, + "logits/rejected": -1.2913267612457275, + "logps/chosen": -129.6839599609375, + "logps/rejected": -106.64042663574219, + "loss": 2.2368, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.441533088684082, + "rewards/margins": -4.458865165710449, + "rewards/rejected": 8.900398254394531, + "step": 4663 + }, + { + "epoch": 0.76, + "learning_rate": 7.139260441113519e-06, + "logits/chosen": -1.2709016799926758, + "logits/rejected": -1.2986726760864258, + "logps/chosen": -137.33189392089844, + "logps/rejected": -186.99111938476562, + "loss": 1.2559, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.6586503982543945, + "rewards/margins": -2.4158830642700195, + "rewards/rejected": 10.074533462524414, + "step": 4664 + }, + { + "epoch": 0.76, + "learning_rate": 7.1380724804114696e-06, + "logits/chosen": -0.872917652130127, + "logits/rejected": -0.872917652130127, + "logps/chosen": -1.3670114278793335, + "logps/rejected": -1.3670114278793335, + "loss": 0.3578, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.40463972091674805, + "rewards/margins": 0.0, + "rewards/rejected": 0.40463972091674805, + "step": 4665 + }, + { + "epoch": 0.76, + "learning_rate": 7.136884371988844e-06, + "logits/chosen": -1.0538021326065063, + "logits/rejected": -1.1426082849502563, + "logps/chosen": -179.41055297851562, + "logps/rejected": -51.389251708984375, + "loss": 0.1034, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.838922023773193, + "rewards/margins": 2.2355153560638428, + "rewards/rejected": 2.6034066677093506, + "step": 4666 + }, + { + "epoch": 0.76, + "learning_rate": 7.135696115927726e-06, + "logits/chosen": -1.184020757675171, + "logits/rejected": -1.0924323797225952, + "logps/chosen": -188.67889404296875, + "logps/rejected": -66.40792846679688, + "loss": 0.2786, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.292366027832031, + "rewards/margins": 1.6103026866912842, + "rewards/rejected": 3.682063341140747, + "step": 4667 + }, + { + "epoch": 0.76, + "learning_rate": 7.134507712310215e-06, + "logits/chosen": -0.9092155694961548, + "logits/rejected": -1.297916054725647, + "logps/chosen": -44.95453643798828, + "logps/rejected": -30.607421875, + "loss": 0.1697, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2977898120880127, + "rewards/margins": 1.2787280082702637, + "rewards/rejected": 2.019061803817749, + "step": 4668 + }, + { + "epoch": 0.76, + "learning_rate": 7.133319161218418e-06, + "logits/chosen": -0.9700253009796143, + "logits/rejected": -0.9405590295791626, + "logps/chosen": -34.01390075683594, + "logps/rejected": -23.4741268157959, + "loss": 1.7213, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6377266645431519, + "rewards/margins": -0.14412164688110352, + "rewards/rejected": 1.7818483114242554, + "step": 4669 + }, + { + "epoch": 0.76, + "learning_rate": 7.1321304627344526e-06, + "logits/chosen": -0.9608793258666992, + "logits/rejected": -0.8554600477218628, + "logps/chosen": -59.90487289428711, + "logps/rejected": -53.247314453125, + "loss": 0.2383, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0311391353607178, + "rewards/margins": 0.5072612762451172, + "rewards/rejected": 2.5238778591156006, + "step": 4670 + }, + { + "epoch": 0.76, + "learning_rate": 7.130941616940446e-06, + "logits/chosen": -1.1330076456069946, + "logits/rejected": -1.0385923385620117, + "logps/chosen": -79.6090087890625, + "logps/rejected": -48.130714416503906, + "loss": 1.0449, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6949570178985596, + "rewards/margins": 2.7843704223632812, + "rewards/rejected": 0.9105865359306335, + "step": 4671 + }, + { + "epoch": 0.76, + "learning_rate": 7.1297526239185375e-06, + "logits/chosen": -0.6133046746253967, + "logits/rejected": -0.6002622246742249, + "logps/chosen": -3.640355110168457, + "logps/rejected": -12.452455520629883, + "loss": 1.0453, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3824121057987213, + "rewards/margins": -0.3355744183063507, + "rewards/rejected": 0.717986524105072, + "step": 4672 + }, + { + "epoch": 0.76, + "learning_rate": 7.128563483750874e-06, + "logits/chosen": -1.200527548789978, + "logits/rejected": -1.078678011894226, + "logps/chosen": -86.49622344970703, + "logps/rejected": -69.0752944946289, + "loss": 1.7828, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6433815956115723, + "rewards/margins": 1.583984375, + "rewards/rejected": 2.0593972206115723, + "step": 4673 + }, + { + "epoch": 0.76, + "learning_rate": 7.127374196519616e-06, + "logits/chosen": -1.2526882886886597, + "logits/rejected": -1.2724907398223877, + "logps/chosen": -54.673274993896484, + "logps/rejected": -63.700599670410156, + "loss": 0.7977, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9916369915008545, + "rewards/margins": -0.8446407318115234, + "rewards/rejected": 3.836277723312378, + "step": 4674 + }, + { + "epoch": 0.76, + "learning_rate": 7.126184762306929e-06, + "logits/chosen": -0.6141265630722046, + "logits/rejected": -0.6201328039169312, + "logps/chosen": -1.8557775020599365, + "logps/rejected": -4.096288681030273, + "loss": 0.748, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2654671370983124, + "rewards/margins": -0.2083839774131775, + "rewards/rejected": 0.47385111451148987, + "step": 4675 + }, + { + "epoch": 0.76, + "learning_rate": 7.1249951811949935e-06, + "logits/chosen": -1.3446842432022095, + "logits/rejected": -1.3212429285049438, + "logps/chosen": -69.92603302001953, + "logps/rejected": -78.36019134521484, + "loss": 0.2553, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6647926568984985, + "rewards/margins": 0.42408907413482666, + "rewards/rejected": 1.2407035827636719, + "step": 4676 + }, + { + "epoch": 0.76, + "learning_rate": 7.123805453265998e-06, + "logits/chosen": -1.1442242860794067, + "logits/rejected": -1.077689528465271, + "logps/chosen": -78.03190612792969, + "logps/rejected": -70.5472183227539, + "loss": 0.0982, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.063298225402832, + "rewards/margins": 1.5517878532409668, + "rewards/rejected": 5.511510372161865, + "step": 4677 + }, + { + "epoch": 0.76, + "learning_rate": 7.122615578602141e-06, + "logits/chosen": -1.1865849494934082, + "logits/rejected": -1.0383127927780151, + "logps/chosen": -138.294189453125, + "logps/rejected": -52.00949478149414, + "loss": 0.3001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.369714379310608, + "rewards/margins": 0.301876425743103, + "rewards/rejected": 1.0678379535675049, + "step": 4678 + }, + { + "epoch": 0.76, + "learning_rate": 7.121425557285632e-06, + "logits/chosen": -0.9493733048439026, + "logits/rejected": -0.9858086109161377, + "logps/chosen": -24.222915649414062, + "logps/rejected": -66.90631103515625, + "loss": 0.2806, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7295040488243103, + "rewards/margins": 0.6866284012794495, + "rewards/rejected": 0.04287567362189293, + "step": 4679 + }, + { + "epoch": 0.76, + "learning_rate": 7.120235389398689e-06, + "logits/chosen": -1.002320408821106, + "logits/rejected": -0.9233319163322449, + "logps/chosen": -52.86530303955078, + "logps/rejected": -57.14854049682617, + "loss": 0.9734, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.701221466064453, + "rewards/margins": 1.3678324222564697, + "rewards/rejected": 2.3333890438079834, + "step": 4680 + }, + { + "epoch": 0.76, + "learning_rate": 7.1190450750235425e-06, + "logits/chosen": -1.5218340158462524, + "logits/rejected": -1.4447497129440308, + "logps/chosen": -93.31517028808594, + "logps/rejected": -79.08182525634766, + "loss": 1.8311, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.95120096206665, + "rewards/margins": -0.6272649765014648, + "rewards/rejected": 5.578465938568115, + "step": 4681 + }, + { + "epoch": 0.76, + "learning_rate": 7.117854614242434e-06, + "logits/chosen": -1.183838129043579, + "logits/rejected": -1.137431025505066, + "logps/chosen": -62.22175979614258, + "logps/rejected": -40.979881286621094, + "loss": 0.1925, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5075595378875732, + "rewards/margins": 0.9032297134399414, + "rewards/rejected": 2.604329824447632, + "step": 4682 + }, + { + "epoch": 0.76, + "learning_rate": 7.11666400713761e-06, + "logits/chosen": -1.1109665632247925, + "logits/rejected": -1.0406088829040527, + "logps/chosen": -96.30828857421875, + "logps/rejected": -70.84083557128906, + "loss": 0.7213, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.285464763641357, + "rewards/margins": 0.16469192504882812, + "rewards/rejected": 4.120772838592529, + "step": 4683 + }, + { + "epoch": 0.76, + "learning_rate": 7.1154732537913305e-06, + "logits/chosen": -1.2942352294921875, + "logits/rejected": -0.9139288067817688, + "logps/chosen": -76.64811706542969, + "logps/rejected": -94.2618637084961, + "loss": 0.6977, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2482872009277344, + "rewards/margins": -1.10662841796875, + "rewards/rejected": 3.3549156188964844, + "step": 4684 + }, + { + "epoch": 0.76, + "learning_rate": 7.114282354285866e-06, + "logits/chosen": -0.796090304851532, + "logits/rejected": -0.815281867980957, + "logps/chosen": -43.303504943847656, + "logps/rejected": -42.48248291015625, + "loss": 0.3208, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9194729328155518, + "rewards/margins": 0.11989176273345947, + "rewards/rejected": 1.7995811700820923, + "step": 4685 + }, + { + "epoch": 0.76, + "learning_rate": 7.113091308703498e-06, + "logits/chosen": -0.9472126364707947, + "logits/rejected": -0.9646843671798706, + "logps/chosen": -88.05812072753906, + "logps/rejected": -40.286476135253906, + "loss": 0.636, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7758972644805908, + "rewards/margins": 0.582476019859314, + "rewards/rejected": 1.1934212446212769, + "step": 4686 + }, + { + "epoch": 0.76, + "learning_rate": 7.111900117126514e-06, + "logits/chosen": -1.2132291793823242, + "logits/rejected": -1.1418184041976929, + "logps/chosen": -42.56492614746094, + "logps/rejected": -14.360980987548828, + "loss": 0.5436, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8740310668945312, + "rewards/margins": 1.4808686971664429, + "rewards/rejected": 0.393162339925766, + "step": 4687 + }, + { + "epoch": 0.76, + "learning_rate": 7.1107087796372146e-06, + "logits/chosen": -1.1272413730621338, + "logits/rejected": -1.1994483470916748, + "logps/chosen": -55.7890739440918, + "logps/rejected": -53.80902862548828, + "loss": 0.67, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7655231952667236, + "rewards/margins": -0.48319196701049805, + "rewards/rejected": 2.2487151622772217, + "step": 4688 + }, + { + "epoch": 0.76, + "learning_rate": 7.10951729631791e-06, + "logits/chosen": -1.0613720417022705, + "logits/rejected": -1.0299179553985596, + "logps/chosen": -44.547550201416016, + "logps/rejected": -126.67774200439453, + "loss": 0.492, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.153700590133667, + "rewards/margins": -0.22792410850524902, + "rewards/rejected": 2.381624698638916, + "step": 4689 + }, + { + "epoch": 0.76, + "learning_rate": 7.10832566725092e-06, + "logits/chosen": -1.0136703252792358, + "logits/rejected": -1.0189906358718872, + "logps/chosen": -61.43824768066406, + "logps/rejected": -82.26451873779297, + "loss": 3.053, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.698568820953369, + "rewards/margins": -1.9181618690490723, + "rewards/rejected": 4.616730690002441, + "step": 4690 + }, + { + "epoch": 0.76, + "learning_rate": 7.107133892518577e-06, + "logits/chosen": -1.1198315620422363, + "logits/rejected": -1.268246054649353, + "logps/chosen": -70.56449890136719, + "logps/rejected": -117.34464263916016, + "loss": 2.8281, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.526615858078003, + "rewards/margins": -3.323514699935913, + "rewards/rejected": 5.850130558013916, + "step": 4691 + }, + { + "epoch": 0.76, + "learning_rate": 7.1059419722032195e-06, + "logits/chosen": -0.783341109752655, + "logits/rejected": -0.7888336777687073, + "logps/chosen": -2.354898691177368, + "logps/rejected": -1.839386224746704, + "loss": 0.3593, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27570632100105286, + "rewards/margins": 0.024842023849487305, + "rewards/rejected": 0.25086429715156555, + "step": 4692 + }, + { + "epoch": 0.76, + "learning_rate": 7.1047499063871986e-06, + "logits/chosen": -0.7952420711517334, + "logits/rejected": -0.8260713815689087, + "logps/chosen": -35.30187225341797, + "logps/rejected": -68.4736557006836, + "loss": 0.8914, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5000030994415283, + "rewards/margins": -0.6364631652832031, + "rewards/rejected": 2.1364662647247314, + "step": 4693 + }, + { + "epoch": 0.76, + "learning_rate": 7.103557695152874e-06, + "logits/chosen": -1.0243018865585327, + "logits/rejected": -0.9819871783256531, + "logps/chosen": -96.47661590576172, + "logps/rejected": -71.84979248046875, + "loss": 1.22, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9681297540664673, + "rewards/margins": -1.1549416780471802, + "rewards/rejected": 2.1230714321136475, + "step": 4694 + }, + { + "epoch": 0.76, + "learning_rate": 7.102365338582617e-06, + "logits/chosen": -1.4580625295639038, + "logits/rejected": -1.4745705127716064, + "logps/chosen": -120.13895416259766, + "logps/rejected": -85.792724609375, + "loss": 1.0926, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3132805824279785, + "rewards/margins": -0.2051689624786377, + "rewards/rejected": 2.518449544906616, + "step": 4695 + }, + { + "epoch": 0.76, + "learning_rate": 7.101172836758808e-06, + "logits/chosen": -1.3936647176742554, + "logits/rejected": -1.2824623584747314, + "logps/chosen": -63.47312927246094, + "logps/rejected": -75.03265380859375, + "loss": 2.1694, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1367676258087158, + "rewards/margins": -3.0272161960601807, + "rewards/rejected": 4.1639838218688965, + "step": 4696 + }, + { + "epoch": 0.76, + "learning_rate": 7.099980189763836e-06, + "logits/chosen": -1.3647003173828125, + "logits/rejected": -1.2537468671798706, + "logps/chosen": -65.16683959960938, + "logps/rejected": -31.574731826782227, + "loss": 1.9587, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.148883104324341, + "rewards/margins": 2.6436269283294678, + "rewards/rejected": 0.5052561163902283, + "step": 4697 + }, + { + "epoch": 0.76, + "learning_rate": 7.098787397680104e-06, + "logits/chosen": -1.3804208040237427, + "logits/rejected": -1.3776403665542603, + "logps/chosen": -94.0184326171875, + "logps/rejected": -177.16546630859375, + "loss": 0.9619, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.879477024078369, + "rewards/margins": -1.6924619674682617, + "rewards/rejected": 7.571938991546631, + "step": 4698 + }, + { + "epoch": 0.76, + "learning_rate": 7.097594460590023e-06, + "logits/chosen": -1.0430238246917725, + "logits/rejected": -1.0344688892364502, + "logps/chosen": -21.74344253540039, + "logps/rejected": -1.2123301029205322, + "loss": 0.7298, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.13158702850341797, + "rewards/margins": -0.1415371298789978, + "rewards/rejected": 0.27312415838241577, + "step": 4699 + }, + { + "epoch": 0.76, + "learning_rate": 7.096401378576011e-06, + "logits/chosen": -1.0398921966552734, + "logits/rejected": -0.9644798040390015, + "logps/chosen": -83.13456726074219, + "logps/rejected": -72.78046417236328, + "loss": 0.3131, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.518627882003784, + "rewards/margins": 0.5455825328826904, + "rewards/rejected": 1.9730453491210938, + "step": 4700 + }, + { + "epoch": 0.76, + "learning_rate": 7.0952081517205005e-06, + "logits/chosen": -0.8181547522544861, + "logits/rejected": -0.8330445885658264, + "logps/chosen": -79.39285278320312, + "logps/rejected": -81.88359069824219, + "loss": 0.7317, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0366196632385254, + "rewards/margins": 0.6851241588592529, + "rewards/rejected": 2.3514955043792725, + "step": 4701 + }, + { + "epoch": 0.76, + "learning_rate": 7.094014780105931e-06, + "logits/chosen": -1.0703943967819214, + "logits/rejected": -1.0703943967819214, + "logps/chosen": -39.12031173706055, + "logps/rejected": -39.12031173706055, + "loss": 0.4831, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.776566743850708, + "rewards/margins": 0.0, + "rewards/rejected": 1.776566743850708, + "step": 4702 + }, + { + "epoch": 0.76, + "learning_rate": 7.092821263814756e-06, + "logits/chosen": -1.0729485750198364, + "logits/rejected": -1.034092903137207, + "logps/chosen": -70.4014663696289, + "logps/rejected": -43.37697219848633, + "loss": 0.7871, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7712570428848267, + "rewards/margins": -1.0895978212356567, + "rewards/rejected": 2.8608548641204834, + "step": 4703 + }, + { + "epoch": 0.76, + "learning_rate": 7.0916276029294314e-06, + "logits/chosen": -1.1138468980789185, + "logits/rejected": -1.1138468980789185, + "logps/chosen": -59.63220977783203, + "logps/rejected": -59.63220977783203, + "loss": 0.3502, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7691490650177, + "rewards/margins": 0.0, + "rewards/rejected": 2.7691490650177, + "step": 4704 + }, + { + "epoch": 0.76, + "learning_rate": 7.090433797532433e-06, + "logits/chosen": -1.6089708805084229, + "logits/rejected": -1.6004129648208618, + "logps/chosen": -71.8780517578125, + "logps/rejected": -33.63844680786133, + "loss": 0.2817, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.622106909751892, + "rewards/margins": 0.5452380180358887, + "rewards/rejected": 1.0768688917160034, + "step": 4705 + }, + { + "epoch": 0.76, + "learning_rate": 7.0892398477062375e-06, + "logits/chosen": -1.3613816499710083, + "logits/rejected": -1.424046516418457, + "logps/chosen": -56.184715270996094, + "logps/rejected": -95.66262817382812, + "loss": 1.3456, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0426422357559204, + "rewards/margins": -1.2352837324142456, + "rewards/rejected": 2.277925968170166, + "step": 4706 + }, + { + "epoch": 0.76, + "learning_rate": 7.088045753533337e-06, + "logits/chosen": -1.540874719619751, + "logits/rejected": -1.5381262302398682, + "logps/chosen": -104.21293640136719, + "logps/rejected": -99.9407958984375, + "loss": 0.8106, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.1106276512146, + "rewards/margins": -1.3994154930114746, + "rewards/rejected": 8.510043144226074, + "step": 4707 + }, + { + "epoch": 0.76, + "learning_rate": 7.086851515096233e-06, + "logits/chosen": -1.1997407674789429, + "logits/rejected": -1.1997407674789429, + "logps/chosen": -47.96917724609375, + "logps/rejected": -47.96917724609375, + "loss": 0.3469, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8222427368164062, + "rewards/margins": 0.0, + "rewards/rejected": 1.8222427368164062, + "step": 4708 + }, + { + "epoch": 0.76, + "learning_rate": 7.085657132477435e-06, + "logits/chosen": -0.7532697319984436, + "logits/rejected": -0.758141815662384, + "logps/chosen": -4.50731897354126, + "logps/rejected": -2.2060415744781494, + "loss": 0.7462, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3315204679965973, + "rewards/margins": -0.20191070437431335, + "rewards/rejected": 0.5334311723709106, + "step": 4709 + }, + { + "epoch": 0.76, + "learning_rate": 7.0844626057594644e-06, + "logits/chosen": -1.4993610382080078, + "logits/rejected": -1.273789882659912, + "logps/chosen": -158.9693603515625, + "logps/rejected": -14.84979248046875, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.727856636047363, + "rewards/margins": 4.630838394165039, + "rewards/rejected": 1.0970181226730347, + "step": 4710 + }, + { + "epoch": 0.76, + "learning_rate": 7.083267935024851e-06, + "logits/chosen": -0.9297897815704346, + "logits/rejected": -0.9592429399490356, + "logps/chosen": -46.413177490234375, + "logps/rejected": -57.798927307128906, + "loss": 0.2908, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5956157445907593, + "rewards/margins": 0.25635409355163574, + "rewards/rejected": 1.3392616510391235, + "step": 4711 + }, + { + "epoch": 0.76, + "learning_rate": 7.082073120356134e-06, + "logits/chosen": -1.1656149625778198, + "logits/rejected": -1.178667664527893, + "logps/chosen": -50.26246643066406, + "logps/rejected": -89.74360656738281, + "loss": 2.4071, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7764785289764404, + "rewards/margins": -0.00873422622680664, + "rewards/rejected": 2.785212755203247, + "step": 4712 + }, + { + "epoch": 0.76, + "learning_rate": 7.080878161835867e-06, + "logits/chosen": -0.866476833820343, + "logits/rejected": -0.7735756635665894, + "logps/chosen": -26.793405532836914, + "logps/rejected": -44.57841873168945, + "loss": 1.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5787036418914795, + "rewards/margins": 0.7116557359695435, + "rewards/rejected": 1.867047905921936, + "step": 4713 + }, + { + "epoch": 0.77, + "learning_rate": 7.079683059546607e-06, + "logits/chosen": -1.0267609357833862, + "logits/rejected": -0.9985317587852478, + "logps/chosen": -71.15959930419922, + "logps/rejected": -94.85490417480469, + "loss": 1.0381, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6950798034667969, + "rewards/margins": -0.11832654476165771, + "rewards/rejected": 1.8134063482284546, + "step": 4714 + }, + { + "epoch": 0.77, + "learning_rate": 7.0784878135709254e-06, + "logits/chosen": -1.1746599674224854, + "logits/rejected": -1.056597113609314, + "logps/chosen": -95.4543685913086, + "logps/rejected": -43.6075439453125, + "loss": 0.4738, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.174802303314209, + "rewards/margins": 1.8709707260131836, + "rewards/rejected": 4.303831577301025, + "step": 4715 + }, + { + "epoch": 0.77, + "learning_rate": 7.077292423991404e-06, + "logits/chosen": -1.4488837718963623, + "logits/rejected": -1.37806236743927, + "logps/chosen": -69.0000991821289, + "logps/rejected": -77.31670379638672, + "loss": 1.9673, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8946151733398438, + "rewards/margins": 0.9634330868721008, + "rewards/rejected": 0.9311820864677429, + "step": 4716 + }, + { + "epoch": 0.77, + "learning_rate": 7.076096890890632e-06, + "logits/chosen": -0.8944929242134094, + "logits/rejected": -0.8574128150939941, + "logps/chosen": -48.41774368286133, + "logps/rejected": -69.48136138916016, + "loss": 0.6124, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6898571252822876, + "rewards/margins": -0.812872052192688, + "rewards/rejected": 2.5027291774749756, + "step": 4717 + }, + { + "epoch": 0.77, + "learning_rate": 7.074901214351207e-06, + "logits/chosen": -0.6497946381568909, + "logits/rejected": -0.6497946381568909, + "logps/chosen": -67.32533264160156, + "logps/rejected": -67.32533264160156, + "loss": 0.5846, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.444689989089966, + "rewards/margins": 0.0, + "rewards/rejected": 2.444689989089966, + "step": 4718 + }, + { + "epoch": 0.77, + "learning_rate": 7.073705394455743e-06, + "logits/chosen": -0.8805035948753357, + "logits/rejected": -0.8197261691093445, + "logps/chosen": -65.12169647216797, + "logps/rejected": -121.53836059570312, + "loss": 0.296, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6653801202774048, + "rewards/margins": 0.22266924381256104, + "rewards/rejected": 1.4427108764648438, + "step": 4719 + }, + { + "epoch": 0.77, + "learning_rate": 7.072509431286858e-06, + "logits/chosen": -1.1750596761703491, + "logits/rejected": -1.24695885181427, + "logps/chosen": -66.65178680419922, + "logps/rejected": -88.43571472167969, + "loss": 0.6628, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.865885257720947, + "rewards/margins": -1.0038247108459473, + "rewards/rejected": 6.8697099685668945, + "step": 4720 + }, + { + "epoch": 0.77, + "learning_rate": 7.07131332492718e-06, + "logits/chosen": -1.3377585411071777, + "logits/rejected": -1.120818018913269, + "logps/chosen": -124.1148452758789, + "logps/rejected": -40.642642974853516, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.6843791007995605, + "rewards/margins": 4.039904594421387, + "rewards/rejected": 2.644474506378174, + "step": 4721 + }, + { + "epoch": 0.77, + "learning_rate": 7.0701170754593516e-06, + "logits/chosen": -0.4069017767906189, + "logits/rejected": -0.43118035793304443, + "logps/chosen": -3.3393654823303223, + "logps/rejected": -26.405567169189453, + "loss": 0.4459, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.37149786949157715, + "rewards/margins": -0.34663718938827515, + "rewards/rejected": 0.7181350588798523, + "step": 4722 + }, + { + "epoch": 0.77, + "learning_rate": 7.06892068296602e-06, + "logits/chosen": -1.516412615776062, + "logits/rejected": -1.436004877090454, + "logps/chosen": -55.691097259521484, + "logps/rejected": -25.93817901611328, + "loss": 0.3253, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.563936233520508, + "rewards/margins": 1.1311134099960327, + "rewards/rejected": 1.432822823524475, + "step": 4723 + }, + { + "epoch": 0.77, + "learning_rate": 7.067724147529847e-06, + "logits/chosen": -1.2474089860916138, + "logits/rejected": -1.2193304300308228, + "logps/chosen": -71.43788146972656, + "logps/rejected": -34.63798522949219, + "loss": 0.8038, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.323706865310669, + "rewards/margins": -0.4067714214324951, + "rewards/rejected": 2.730478286743164, + "step": 4724 + }, + { + "epoch": 0.77, + "learning_rate": 7.066527469233497e-06, + "logits/chosen": -1.1542965173721313, + "logits/rejected": -0.9703322052955627, + "logps/chosen": -97.11894989013672, + "logps/rejected": -13.548563003540039, + "loss": 0.2091, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0530662536621094, + "rewards/margins": 1.2321841716766357, + "rewards/rejected": 0.8208820223808289, + "step": 4725 + }, + { + "epoch": 0.77, + "learning_rate": 7.0653306481596565e-06, + "logits/chosen": -1.3338665962219238, + "logits/rejected": -1.2537415027618408, + "logps/chosen": -32.33033752441406, + "logps/rejected": -48.889583587646484, + "loss": 0.4083, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6954925060272217, + "rewards/margins": 1.2334941625595093, + "rewards/rejected": 1.4619983434677124, + "step": 4726 + }, + { + "epoch": 0.77, + "learning_rate": 7.064133684391008e-06, + "logits/chosen": -0.9757159948348999, + "logits/rejected": -0.9757159948348999, + "logps/chosen": -35.853118896484375, + "logps/rejected": -35.853118896484375, + "loss": 2.7898, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6804847717285156, + "rewards/margins": 0.0, + "rewards/rejected": 2.6804847717285156, + "step": 4727 + }, + { + "epoch": 0.77, + "learning_rate": 7.062936578010253e-06, + "logits/chosen": -1.2190213203430176, + "logits/rejected": -1.121881127357483, + "logps/chosen": -76.10212707519531, + "logps/rejected": -58.29350280761719, + "loss": 0.6175, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.925158739089966, + "rewards/margins": -0.824211835861206, + "rewards/rejected": 3.749370574951172, + "step": 4728 + }, + { + "epoch": 0.77, + "learning_rate": 7.061739329100101e-06, + "logits/chosen": -1.2231101989746094, + "logits/rejected": -1.303707480430603, + "logps/chosen": -87.12505340576172, + "logps/rejected": -145.05149841308594, + "loss": 2.4214, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2837189435958862, + "rewards/margins": -3.3748135566711426, + "rewards/rejected": 4.658532619476318, + "step": 4729 + }, + { + "epoch": 0.77, + "learning_rate": 7.060541937743269e-06, + "logits/chosen": -1.3014453649520874, + "logits/rejected": -1.2701131105422974, + "logps/chosen": -174.84368896484375, + "logps/rejected": -155.42953491210938, + "loss": 0.6912, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.105435371398926, + "rewards/margins": -1.0390806198120117, + "rewards/rejected": 5.1445159912109375, + "step": 4730 + }, + { + "epoch": 0.77, + "learning_rate": 7.059344404022488e-06, + "logits/chosen": -1.3803081512451172, + "logits/rejected": -1.0554423332214355, + "logps/chosen": -149.48574829101562, + "logps/rejected": -45.903724670410156, + "loss": 0.0483, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.876077175140381, + "rewards/margins": 5.1212615966796875, + "rewards/rejected": 1.754815697669983, + "step": 4731 + }, + { + "epoch": 0.77, + "learning_rate": 7.058146728020492e-06, + "logits/chosen": -1.304163932800293, + "logits/rejected": -1.181310772895813, + "logps/chosen": -65.44855499267578, + "logps/rejected": -63.43950653076172, + "loss": 0.0646, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.24795389175415, + "rewards/margins": 2.430487871170044, + "rewards/rejected": 2.8174660205841064, + "step": 4732 + }, + { + "epoch": 0.77, + "learning_rate": 7.0569489098200325e-06, + "logits/chosen": -1.2546929121017456, + "logits/rejected": -1.0072945356369019, + "logps/chosen": -91.47295379638672, + "logps/rejected": -66.93804168701172, + "loss": 0.0425, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.601415157318115, + "rewards/margins": 2.6094555854797363, + "rewards/rejected": 1.9919594526290894, + "step": 4733 + }, + { + "epoch": 0.77, + "learning_rate": 7.055750949503867e-06, + "logits/chosen": -1.3422846794128418, + "logits/rejected": -1.3215581178665161, + "logps/chosen": -180.67572021484375, + "logps/rejected": -22.754920959472656, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.244851589202881, + "rewards/margins": 4.264411926269531, + "rewards/rejected": 2.9804394245147705, + "step": 4734 + }, + { + "epoch": 0.77, + "learning_rate": 7.0545528471547605e-06, + "logits/chosen": -1.2005499601364136, + "logits/rejected": -1.167976975440979, + "logps/chosen": -143.31423950195312, + "logps/rejected": -54.914283752441406, + "loss": 0.4869, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.906121850013733, + "rewards/margins": -0.3051398992538452, + "rewards/rejected": 2.211261749267578, + "step": 4735 + }, + { + "epoch": 0.77, + "learning_rate": 7.053354602855495e-06, + "logits/chosen": -1.2463322877883911, + "logits/rejected": -1.2298130989074707, + "logps/chosen": -78.55947875976562, + "logps/rejected": -46.318511962890625, + "loss": 1.787, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6477432250976562, + "rewards/margins": 0.5382487773895264, + "rewards/rejected": 1.1094944477081299, + "step": 4736 + }, + { + "epoch": 0.77, + "learning_rate": 7.052156216688855e-06, + "logits/chosen": -1.2577717304229736, + "logits/rejected": -1.228184461593628, + "logps/chosen": -49.68199157714844, + "logps/rejected": -77.91357421875, + "loss": 0.3571, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1913788318634033, + "rewards/margins": 0.1757493019104004, + "rewards/rejected": 2.015629529953003, + "step": 4737 + }, + { + "epoch": 0.77, + "learning_rate": 7.0509576887376375e-06, + "logits/chosen": -0.7738308310508728, + "logits/rejected": -0.6567268371582031, + "logps/chosen": -65.10370635986328, + "logps/rejected": -54.86296844482422, + "loss": 0.0928, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.199354648590088, + "rewards/margins": 1.7360409498214722, + "rewards/rejected": 0.46331366896629333, + "step": 4738 + }, + { + "epoch": 0.77, + "learning_rate": 7.04975901908465e-06, + "logits/chosen": -1.5708192586898804, + "logits/rejected": -1.2179679870605469, + "logps/chosen": -162.0884246826172, + "logps/rejected": -70.12973022460938, + "loss": 0.421, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.517146587371826, + "rewards/margins": -0.2763333320617676, + "rewards/rejected": 4.793479919433594, + "step": 4739 + }, + { + "epoch": 0.77, + "learning_rate": 7.048560207812709e-06, + "logits/chosen": -1.0154013633728027, + "logits/rejected": -1.0620036125183105, + "logps/chosen": -42.68802261352539, + "logps/rejected": -54.42354202270508, + "loss": 1.2111, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2643253803253174, + "rewards/margins": -0.15151822566986084, + "rewards/rejected": 1.4158436059951782, + "step": 4740 + }, + { + "epoch": 0.77, + "learning_rate": 7.047361255004642e-06, + "logits/chosen": -0.9903912544250488, + "logits/rejected": -0.9555225968360901, + "logps/chosen": -47.65245819091797, + "logps/rejected": -19.491741180419922, + "loss": 0.5092, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.052977442741394, + "rewards/margins": 0.2777971625328064, + "rewards/rejected": 0.7751802802085876, + "step": 4741 + }, + { + "epoch": 0.77, + "learning_rate": 7.046162160743284e-06, + "logits/chosen": -0.9303386807441711, + "logits/rejected": -0.8085926175117493, + "logps/chosen": -74.7099609375, + "logps/rejected": -53.569679260253906, + "loss": 0.141, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.329087972640991, + "rewards/margins": 1.2918320894241333, + "rewards/rejected": 1.037255883216858, + "step": 4742 + }, + { + "epoch": 0.77, + "learning_rate": 7.044962925111482e-06, + "logits/chosen": -1.3967442512512207, + "logits/rejected": -1.4475398063659668, + "logps/chosen": -169.80152893066406, + "logps/rejected": -129.33177185058594, + "loss": 0.2471, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.2141618728637695, + "rewards/margins": 3.128657579421997, + "rewards/rejected": 2.0855042934417725, + "step": 4743 + }, + { + "epoch": 0.77, + "learning_rate": 7.043763548192091e-06, + "logits/chosen": -1.3894600868225098, + "logits/rejected": -1.2393943071365356, + "logps/chosen": -163.0762481689453, + "logps/rejected": -54.82061004638672, + "loss": 0.7053, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.454103469848633, + "rewards/margins": 6.847766876220703, + "rewards/rejected": 3.6063363552093506, + "step": 4744 + }, + { + "epoch": 0.77, + "learning_rate": 7.042564030067977e-06, + "logits/chosen": -1.5233383178710938, + "logits/rejected": -1.1206899881362915, + "logps/chosen": -142.185302734375, + "logps/rejected": -25.93532943725586, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.808386325836182, + "rewards/margins": 6.048198223114014, + "rewards/rejected": 0.7601879239082336, + "step": 4745 + }, + { + "epoch": 0.77, + "learning_rate": 7.041364370822017e-06, + "logits/chosen": -1.2633898258209229, + "logits/rejected": -1.0835109949111938, + "logps/chosen": -163.18084716796875, + "logps/rejected": -49.79061508178711, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.622342109680176, + "rewards/margins": 4.97745418548584, + "rewards/rejected": 1.644887924194336, + "step": 4746 + }, + { + "epoch": 0.77, + "learning_rate": 7.040164570537093e-06, + "logits/chosen": -0.9768146872520447, + "logits/rejected": -0.9768146872520447, + "logps/chosen": -60.87955093383789, + "logps/rejected": -60.87955093383789, + "loss": 0.4442, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5102291107177734, + "rewards/margins": 0.0, + "rewards/rejected": 2.5102291107177734, + "step": 4747 + }, + { + "epoch": 0.77, + "learning_rate": 7.038964629296103e-06, + "logits/chosen": -1.2978557348251343, + "logits/rejected": -1.0538289546966553, + "logps/chosen": -145.7139892578125, + "logps/rejected": -14.051977157592773, + "loss": 0.0316, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.59027099609375, + "rewards/margins": 5.37871789932251, + "rewards/rejected": 1.2115532159805298, + "step": 4748 + }, + { + "epoch": 0.77, + "learning_rate": 7.037764547181948e-06, + "logits/chosen": -1.3420066833496094, + "logits/rejected": -1.228610873222351, + "logps/chosen": -97.39808654785156, + "logps/rejected": -62.69990158081055, + "loss": 0.517, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.133256673812866, + "rewards/margins": 0.22795915603637695, + "rewards/rejected": 1.9052975177764893, + "step": 4749 + }, + { + "epoch": 0.77, + "learning_rate": 7.036564324277545e-06, + "logits/chosen": -0.9821140170097351, + "logits/rejected": -1.1072463989257812, + "logps/chosen": -98.23570251464844, + "logps/rejected": -95.03822326660156, + "loss": 1.1968, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.534040927886963, + "rewards/margins": -2.284292697906494, + "rewards/rejected": 4.818333625793457, + "step": 4750 + }, + { + "epoch": 0.77, + "learning_rate": 7.035363960665817e-06, + "logits/chosen": -1.3848025798797607, + "logits/rejected": -1.3197762966156006, + "logps/chosen": -54.22285842895508, + "logps/rejected": -62.97514343261719, + "loss": 0.756, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6328556537628174, + "rewards/margins": -1.234875202178955, + "rewards/rejected": 3.8677308559417725, + "step": 4751 + }, + { + "epoch": 0.77, + "learning_rate": 7.034163456429699e-06, + "logits/chosen": -1.0687421560287476, + "logits/rejected": -1.0656791925430298, + "logps/chosen": -3.3365471363067627, + "logps/rejected": -2.4197545051574707, + "loss": 0.4777, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4308508038520813, + "rewards/margins": -0.1299349069595337, + "rewards/rejected": 0.560785710811615, + "step": 4752 + }, + { + "epoch": 0.77, + "learning_rate": 7.032962811652133e-06, + "logits/chosen": -0.5562334656715393, + "logits/rejected": -0.5559648275375366, + "logps/chosen": -3.4360015392303467, + "logps/rejected": -2.473449945449829, + "loss": 0.6935, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31301769614219666, + "rewards/margins": 0.076984703540802, + "rewards/rejected": 0.23603299260139465, + "step": 4753 + }, + { + "epoch": 0.77, + "learning_rate": 7.031762026416074e-06, + "logits/chosen": -1.1459369659423828, + "logits/rejected": -1.2058472633361816, + "logps/chosen": -66.7813720703125, + "logps/rejected": -74.24003601074219, + "loss": 0.7453, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3305435180664062, + "rewards/margins": -0.9958839416503906, + "rewards/rejected": 3.326427459716797, + "step": 4754 + }, + { + "epoch": 0.77, + "learning_rate": 7.030561100804483e-06, + "logits/chosen": -1.149033546447754, + "logits/rejected": -1.1742687225341797, + "logps/chosen": -173.3956298828125, + "logps/rejected": -153.56967163085938, + "loss": 1.5574, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.687429904937744, + "rewards/margins": -1.6726346015930176, + "rewards/rejected": 8.360064506530762, + "step": 4755 + }, + { + "epoch": 0.77, + "learning_rate": 7.029360034900332e-06, + "logits/chosen": -0.7686792016029358, + "logits/rejected": -0.7686792016029358, + "logps/chosen": -1.181811809539795, + "logps/rejected": -1.181811809539795, + "loss": 1.0477, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.17550809681415558, + "rewards/margins": 0.0, + "rewards/rejected": 0.17550809681415558, + "step": 4756 + }, + { + "epoch": 0.77, + "learning_rate": 7.0281588287866065e-06, + "logits/chosen": -0.8863251209259033, + "logits/rejected": -0.8509186506271362, + "logps/chosen": -248.49655151367188, + "logps/rejected": -49.28723907470703, + "loss": 1.2207, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.473776340484619, + "rewards/margins": 2.9198548793792725, + "rewards/rejected": 3.5539214611053467, + "step": 4757 + }, + { + "epoch": 0.77, + "learning_rate": 7.026957482546295e-06, + "logits/chosen": -1.2023289203643799, + "logits/rejected": -1.2023289203643799, + "logps/chosen": -54.41291809082031, + "logps/rejected": -54.41291809082031, + "loss": 0.496, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.8193278312683105, + "rewards/margins": 0.0, + "rewards/rejected": 4.8193278312683105, + "step": 4758 + }, + { + "epoch": 0.77, + "learning_rate": 7.025755996262401e-06, + "logits/chosen": -1.3816003799438477, + "logits/rejected": -1.1536931991577148, + "logps/chosen": -93.75129699707031, + "logps/rejected": -71.66165924072266, + "loss": 0.4833, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.897473335266113, + "rewards/margins": 9.389094352722168, + "rewards/rejected": 1.508379340171814, + "step": 4759 + }, + { + "epoch": 0.77, + "learning_rate": 7.024554370017937e-06, + "logits/chosen": -0.856395423412323, + "logits/rejected": -0.856395423412323, + "logps/chosen": -22.295164108276367, + "logps/rejected": -22.295164108276367, + "loss": 0.9306, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7980760931968689, + "rewards/margins": 0.0, + "rewards/rejected": 0.7980760931968689, + "step": 4760 + }, + { + "epoch": 0.77, + "learning_rate": 7.023352603895921e-06, + "logits/chosen": -1.4077377319335938, + "logits/rejected": -1.5171011686325073, + "logps/chosen": -150.46902465820312, + "logps/rejected": -35.100284576416016, + "loss": 0.3622, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.769235372543335, + "rewards/margins": 3.4479711055755615, + "rewards/rejected": 0.32126426696777344, + "step": 4761 + }, + { + "epoch": 0.77, + "learning_rate": 7.022150697979385e-06, + "logits/chosen": -1.1241958141326904, + "logits/rejected": -1.08950936794281, + "logps/chosen": -305.01885986328125, + "logps/rejected": -114.8504409790039, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.601675510406494, + "rewards/margins": 5.808185577392578, + "rewards/rejected": 1.7934898138046265, + "step": 4762 + }, + { + "epoch": 0.77, + "learning_rate": 7.02094865235137e-06, + "logits/chosen": -0.835986852645874, + "logits/rejected": -0.835986852645874, + "logps/chosen": -90.63282012939453, + "logps/rejected": -90.63282012939453, + "loss": 0.6527, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.181645154953003, + "rewards/margins": 0.0, + "rewards/rejected": 3.181645154953003, + "step": 4763 + }, + { + "epoch": 0.77, + "learning_rate": 7.019746467094927e-06, + "logits/chosen": -1.0373742580413818, + "logits/rejected": -0.8950261473655701, + "logps/chosen": -64.69978332519531, + "logps/rejected": -23.06128692626953, + "loss": 0.234, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5592437982559204, + "rewards/margins": 0.9357767105102539, + "rewards/rejected": 0.6234670877456665, + "step": 4764 + }, + { + "epoch": 0.77, + "learning_rate": 7.018544142293112e-06, + "logits/chosen": -1.4601776599884033, + "logits/rejected": -1.4061341285705566, + "logps/chosen": -69.73662567138672, + "logps/rejected": -86.8897705078125, + "loss": 0.2826, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.446445465087891, + "rewards/margins": 0.3223271369934082, + "rewards/rejected": 4.124118328094482, + "step": 4765 + }, + { + "epoch": 0.77, + "learning_rate": 7.017341678028997e-06, + "logits/chosen": -1.1085987091064453, + "logits/rejected": -1.1370086669921875, + "logps/chosen": -184.5729217529297, + "logps/rejected": -44.610252380371094, + "loss": 0.0775, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.920362949371338, + "rewards/margins": 5.986071586608887, + "rewards/rejected": 1.934291124343872, + "step": 4766 + }, + { + "epoch": 0.77, + "learning_rate": 7.016139074385661e-06, + "logits/chosen": -1.252508282661438, + "logits/rejected": -1.1720197200775146, + "logps/chosen": -138.8308563232422, + "logps/rejected": -81.69373321533203, + "loss": 0.1736, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.880551338195801, + "rewards/margins": 0.999544620513916, + "rewards/rejected": 4.881006717681885, + "step": 4767 + }, + { + "epoch": 0.77, + "learning_rate": 7.014936331446192e-06, + "logits/chosen": -0.889251172542572, + "logits/rejected": -0.880261242389679, + "logps/chosen": -48.86909484863281, + "logps/rejected": -54.85149383544922, + "loss": 0.5106, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1045174598693848, + "rewards/margins": -0.3087751865386963, + "rewards/rejected": 2.413292646408081, + "step": 4768 + }, + { + "epoch": 0.77, + "learning_rate": 7.0137334492936875e-06, + "logits/chosen": -0.8919299840927124, + "logits/rejected": -0.8871248364448547, + "logps/chosen": -2.094179391860962, + "logps/rejected": -1.7328931093215942, + "loss": 0.3718, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.20613515377044678, + "rewards/margins": -0.09097284078598022, + "rewards/rejected": 0.297107994556427, + "step": 4769 + }, + { + "epoch": 0.77, + "learning_rate": 7.0125304280112546e-06, + "logits/chosen": -0.9619594812393188, + "logits/rejected": -0.9167126417160034, + "logps/chosen": -61.76250076293945, + "logps/rejected": -68.35206604003906, + "loss": 0.4326, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5739024877548218, + "rewards/margins": -0.05529677867889404, + "rewards/rejected": 1.6291992664337158, + "step": 4770 + }, + { + "epoch": 0.77, + "learning_rate": 7.011327267682013e-06, + "logits/chosen": -1.133984923362732, + "logits/rejected": -0.9276837110519409, + "logps/chosen": -123.2166519165039, + "logps/rejected": -38.05854034423828, + "loss": 0.0644, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.4539971351623535, + "rewards/margins": 2.098456621170044, + "rewards/rejected": 3.3555405139923096, + "step": 4771 + }, + { + "epoch": 0.77, + "learning_rate": 7.0101239683890885e-06, + "logits/chosen": -1.0431276559829712, + "logits/rejected": -1.0467760562896729, + "logps/chosen": -76.51223754882812, + "logps/rejected": -81.12461853027344, + "loss": 2.242, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.205312252044678, + "rewards/margins": -0.7282862663269043, + "rewards/rejected": 4.933598518371582, + "step": 4772 + }, + { + "epoch": 0.77, + "learning_rate": 7.008920530215619e-06, + "logits/chosen": -1.273593544960022, + "logits/rejected": -1.404458999633789, + "logps/chosen": -126.02491760253906, + "logps/rejected": -182.85940551757812, + "loss": 1.911, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0928802490234375, + "rewards/margins": -1.8828766345977783, + "rewards/rejected": 2.975756883621216, + "step": 4773 + }, + { + "epoch": 0.77, + "learning_rate": 7.0077169532447474e-06, + "logits/chosen": -1.2120532989501953, + "logits/rejected": -1.216988444328308, + "logps/chosen": -36.183197021484375, + "logps/rejected": -29.88800811767578, + "loss": 0.2708, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.039666771888733, + "rewards/margins": 0.43372344970703125, + "rewards/rejected": 0.6059433221817017, + "step": 4774 + }, + { + "epoch": 0.78, + "learning_rate": 7.006513237559632e-06, + "logits/chosen": -1.3304418325424194, + "logits/rejected": -1.1954917907714844, + "logps/chosen": -102.56666564941406, + "logps/rejected": -122.44566345214844, + "loss": 0.3266, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.0121049880981445, + "rewards/margins": 0.09101104736328125, + "rewards/rejected": 6.921093940734863, + "step": 4775 + }, + { + "epoch": 0.78, + "learning_rate": 7.0053093832434385e-06, + "logits/chosen": -0.8897088170051575, + "logits/rejected": -0.8867323994636536, + "logps/chosen": -87.84846496582031, + "logps/rejected": -68.7762680053711, + "loss": 1.3634, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.750638723373413, + "rewards/margins": -1.7114250659942627, + "rewards/rejected": 4.462063789367676, + "step": 4776 + }, + { + "epoch": 0.78, + "learning_rate": 7.004105390379341e-06, + "logits/chosen": -1.1535937786102295, + "logits/rejected": -1.1535937786102295, + "logps/chosen": -76.99760437011719, + "logps/rejected": -76.99760437011719, + "loss": 0.3541, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.688676595687866, + "rewards/margins": 0.0, + "rewards/rejected": 3.688676595687866, + "step": 4777 + }, + { + "epoch": 0.78, + "learning_rate": 7.002901259050523e-06, + "logits/chosen": -1.2289129495620728, + "logits/rejected": -1.305017113685608, + "logps/chosen": -230.63345336914062, + "logps/rejected": -179.42420959472656, + "loss": 1.0612, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.719336032867432, + "rewards/margins": -0.4516739845275879, + "rewards/rejected": 8.17101001739502, + "step": 4778 + }, + { + "epoch": 0.78, + "learning_rate": 7.001696989340181e-06, + "logits/chosen": -1.1717162132263184, + "logits/rejected": -1.2288326025009155, + "logps/chosen": -30.933090209960938, + "logps/rejected": -67.23467254638672, + "loss": 1.4332, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1932491064071655, + "rewards/margins": -2.0525264739990234, + "rewards/rejected": 3.2457756996154785, + "step": 4779 + }, + { + "epoch": 0.78, + "learning_rate": 7.000492581331516e-06, + "logits/chosen": -0.9163491725921631, + "logits/rejected": -0.971515417098999, + "logps/chosen": -71.18327331542969, + "logps/rejected": -40.9080810546875, + "loss": 1.014, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.158587694168091, + "rewards/margins": -0.05702710151672363, + "rewards/rejected": 2.2156147956848145, + "step": 4780 + }, + { + "epoch": 0.78, + "learning_rate": 6.999288035107743e-06, + "logits/chosen": -0.9312418103218079, + "logits/rejected": -0.9575108885765076, + "logps/chosen": -44.30010986328125, + "logps/rejected": -61.346832275390625, + "loss": 0.3683, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6846901178359985, + "rewards/margins": -0.012960076332092285, + "rewards/rejected": 1.6976501941680908, + "step": 4781 + }, + { + "epoch": 0.78, + "learning_rate": 6.998083350752084e-06, + "logits/chosen": -1.058306336402893, + "logits/rejected": -0.994634211063385, + "logps/chosen": -145.10714721679688, + "logps/rejected": -52.08879089355469, + "loss": 0.0304, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.60292387008667, + "rewards/margins": 3.834279775619507, + "rewards/rejected": 2.768644094467163, + "step": 4782 + }, + { + "epoch": 0.78, + "learning_rate": 6.996878528347771e-06, + "logits/chosen": -1.2272696495056152, + "logits/rejected": -1.2294615507125854, + "logps/chosen": -73.21472930908203, + "logps/rejected": -131.59228515625, + "loss": 0.4517, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0155937671661377, + "rewards/margins": -0.28920817375183105, + "rewards/rejected": 2.3048019409179688, + "step": 4783 + }, + { + "epoch": 0.78, + "learning_rate": 6.995673567978047e-06, + "logits/chosen": -1.1425455808639526, + "logits/rejected": -1.2251886129379272, + "logps/chosen": -142.85855102539062, + "logps/rejected": -133.56509399414062, + "loss": 2.3831, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.689225673675537, + "rewards/margins": -3.262723445892334, + "rewards/rejected": 8.951949119567871, + "step": 4784 + }, + { + "epoch": 0.78, + "learning_rate": 6.9944684697261634e-06, + "logits/chosen": -1.0011698007583618, + "logits/rejected": -1.075674057006836, + "logps/chosen": -71.22140502929688, + "logps/rejected": -86.6951904296875, + "loss": 3.1597, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6573173999786377, + "rewards/margins": -2.881235361099243, + "rewards/rejected": 5.538552761077881, + "step": 4785 + }, + { + "epoch": 0.78, + "learning_rate": 6.99326323367538e-06, + "logits/chosen": -0.6994252800941467, + "logits/rejected": -0.6994252800941467, + "logps/chosen": -66.9017333984375, + "logps/rejected": -66.9017333984375, + "loss": 0.3676, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4612228870391846, + "rewards/margins": 0.0, + "rewards/rejected": 1.4612228870391846, + "step": 4786 + }, + { + "epoch": 0.78, + "learning_rate": 6.992057859908967e-06, + "logits/chosen": -1.1132051944732666, + "logits/rejected": -1.1262543201446533, + "logps/chosen": -96.44850158691406, + "logps/rejected": -83.30870056152344, + "loss": 0.9475, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5318984985351562, + "rewards/margins": -1.1004409790039062, + "rewards/rejected": 1.6323394775390625, + "step": 4787 + }, + { + "epoch": 0.78, + "learning_rate": 6.990852348510205e-06, + "logits/chosen": -1.0002652406692505, + "logits/rejected": -0.9221838712692261, + "logps/chosen": -48.12142562866211, + "logps/rejected": -41.056339263916016, + "loss": 0.5371, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.562718629837036, + "rewards/margins": -0.158050537109375, + "rewards/rejected": 2.720769166946411, + "step": 4788 + }, + { + "epoch": 0.78, + "learning_rate": 6.989646699562383e-06, + "logits/chosen": -1.0246613025665283, + "logits/rejected": -1.0388301610946655, + "logps/chosen": -39.270118713378906, + "logps/rejected": -67.49540710449219, + "loss": 0.4897, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8276557922363281, + "rewards/margins": -0.35943078994750977, + "rewards/rejected": 2.187086582183838, + "step": 4789 + }, + { + "epoch": 0.78, + "learning_rate": 6.988440913148802e-06, + "logits/chosen": -1.1017404794692993, + "logits/rejected": -1.1017404794692993, + "logps/chosen": -39.213722229003906, + "logps/rejected": -39.213722229003906, + "loss": 0.5649, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7390327453613281, + "rewards/margins": 0.0, + "rewards/rejected": 0.7390327453613281, + "step": 4790 + }, + { + "epoch": 0.78, + "learning_rate": 6.987234989352767e-06, + "logits/chosen": -0.9500489234924316, + "logits/rejected": -0.8985135555267334, + "logps/chosen": -73.89274597167969, + "logps/rejected": -80.35963439941406, + "loss": 0.8444, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.568281650543213, + "rewards/margins": 1.8243317604064941, + "rewards/rejected": 0.7439498901367188, + "step": 4791 + }, + { + "epoch": 0.78, + "learning_rate": 6.9860289282575976e-06, + "logits/chosen": -1.7126587629318237, + "logits/rejected": -1.776058316230774, + "logps/chosen": -132.1269073486328, + "logps/rejected": -108.72981262207031, + "loss": 2.438, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.210859775543213, + "rewards/margins": -1.2605104446411133, + "rewards/rejected": 7.471370220184326, + "step": 4792 + }, + { + "epoch": 0.78, + "learning_rate": 6.984822729946622e-06, + "logits/chosen": -0.4239581227302551, + "logits/rejected": -0.41639479994773865, + "logps/chosen": -16.09708595275879, + "logps/rejected": -47.23638153076172, + "loss": 0.6144, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.26330700516700745, + "rewards/margins": -0.5623651742935181, + "rewards/rejected": 0.8256721496582031, + "step": 4793 + }, + { + "epoch": 0.78, + "learning_rate": 6.983616394503177e-06, + "logits/chosen": -1.1738611459732056, + "logits/rejected": -1.1747206449508667, + "logps/chosen": -49.92169189453125, + "logps/rejected": -70.2098617553711, + "loss": 1.7689, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.853291392326355, + "rewards/margins": -2.726807117462158, + "rewards/rejected": 4.580098628997803, + "step": 4794 + }, + { + "epoch": 0.78, + "learning_rate": 6.982409922010607e-06, + "logits/chosen": -1.5117913484573364, + "logits/rejected": -1.2422125339508057, + "logps/chosen": -148.25955200195312, + "logps/rejected": -70.8910903930664, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.0965576171875, + "rewards/margins": 3.9775640964508057, + "rewards/rejected": 3.1189935207366943, + "step": 4795 + }, + { + "epoch": 0.78, + "learning_rate": 6.981203312552269e-06, + "logits/chosen": -0.8497701287269592, + "logits/rejected": -1.0142823457717896, + "logps/chosen": -65.95268249511719, + "logps/rejected": -118.87168884277344, + "loss": 2.2933, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.8685548305511475, + "rewards/margins": -1.017690896987915, + "rewards/rejected": 4.8862457275390625, + "step": 4796 + }, + { + "epoch": 0.78, + "learning_rate": 6.979996566211528e-06, + "logits/chosen": -1.2842347621917725, + "logits/rejected": -1.2326875925064087, + "logps/chosen": -72.47746276855469, + "logps/rejected": -103.88907623291016, + "loss": 2.1593, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7082252502441406, + "rewards/margins": -0.21715319156646729, + "rewards/rejected": 1.925378441810608, + "step": 4797 + }, + { + "epoch": 0.78, + "learning_rate": 6.978789683071761e-06, + "logits/chosen": -0.8110266923904419, + "logits/rejected": -0.7913760542869568, + "logps/chosen": -72.812255859375, + "logps/rejected": -73.89106750488281, + "loss": 0.9982, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8090317249298096, + "rewards/margins": -1.8101449012756348, + "rewards/rejected": 3.6191766262054443, + "step": 4798 + }, + { + "epoch": 0.78, + "learning_rate": 6.977582663216349e-06, + "logits/chosen": -1.1033096313476562, + "logits/rejected": -0.947647750377655, + "logps/chosen": -45.964515686035156, + "logps/rejected": -26.16385269165039, + "loss": 1.1633, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.238340139389038, + "rewards/margins": 0.35179567337036133, + "rewards/rejected": 1.8865444660186768, + "step": 4799 + }, + { + "epoch": 0.78, + "learning_rate": 6.976375506728688e-06, + "logits/chosen": -1.115436315536499, + "logits/rejected": -1.1568801403045654, + "logps/chosen": -103.17445373535156, + "logps/rejected": -101.0001220703125, + "loss": 3.8542, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5328369140625, + "rewards/margins": -4.083004951477051, + "rewards/rejected": 5.615841865539551, + "step": 4800 + }, + { + "epoch": 0.78, + "learning_rate": 6.97516821369218e-06, + "logits/chosen": -1.0660892724990845, + "logits/rejected": -1.1017712354660034, + "logps/chosen": -19.849821090698242, + "logps/rejected": -37.90843963623047, + "loss": 2.3821, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.893419086933136, + "rewards/margins": -1.4838542938232422, + "rewards/rejected": 2.3772733211517334, + "step": 4801 + }, + { + "epoch": 0.78, + "learning_rate": 6.9739607841902365e-06, + "logits/chosen": -1.1292335987091064, + "logits/rejected": -1.1219545602798462, + "logps/chosen": -21.60140609741211, + "logps/rejected": -112.572265625, + "loss": 0.4541, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2853500843048096, + "rewards/margins": -0.3858199119567871, + "rewards/rejected": 3.6711699962615967, + "step": 4802 + }, + { + "epoch": 0.78, + "learning_rate": 6.972753218306282e-06, + "logits/chosen": -1.419215440750122, + "logits/rejected": -1.4595459699630737, + "logps/chosen": -140.83935546875, + "logps/rejected": -52.558860778808594, + "loss": 0.7263, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.325198650360107, + "rewards/margins": 2.7783350944519043, + "rewards/rejected": 1.5468635559082031, + "step": 4803 + }, + { + "epoch": 0.78, + "learning_rate": 6.971545516123745e-06, + "logits/chosen": -0.9408923387527466, + "logits/rejected": -0.9408923387527466, + "logps/chosen": -157.1728515625, + "logps/rejected": -157.1728515625, + "loss": 0.426, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.60206937789917, + "rewards/margins": 0.0, + "rewards/rejected": 5.60206937789917, + "step": 4804 + }, + { + "epoch": 0.78, + "learning_rate": 6.970337677726069e-06, + "logits/chosen": -0.852921187877655, + "logits/rejected": -0.852921187877655, + "logps/chosen": -48.15488052368164, + "logps/rejected": -48.15488052368164, + "loss": 0.349, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9023075103759766, + "rewards/margins": 0.0, + "rewards/rejected": 2.9023075103759766, + "step": 4805 + }, + { + "epoch": 0.78, + "learning_rate": 6.969129703196703e-06, + "logits/chosen": -0.9798091650009155, + "logits/rejected": -0.9324961304664612, + "logps/chosen": -95.35546875, + "logps/rejected": -53.68324279785156, + "loss": 0.2932, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.8438005447387695, + "rewards/margins": 0.23319721221923828, + "rewards/rejected": 4.610603332519531, + "step": 4806 + }, + { + "epoch": 0.78, + "learning_rate": 6.967921592619104e-06, + "logits/chosen": -0.8945459723472595, + "logits/rejected": -1.1131867170333862, + "logps/chosen": -90.38645935058594, + "logps/rejected": -101.08356475830078, + "loss": 2.1736, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0310661792755127, + "rewards/margins": -4.134625434875488, + "rewards/rejected": 6.165691375732422, + "step": 4807 + }, + { + "epoch": 0.78, + "learning_rate": 6.966713346076748e-06, + "logits/chosen": -1.148301124572754, + "logits/rejected": -1.159966230392456, + "logps/chosen": -205.4077911376953, + "logps/rejected": -149.61798095703125, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.371312141418457, + "rewards/margins": 3.5783708095550537, + "rewards/rejected": 1.7929413318634033, + "step": 4808 + }, + { + "epoch": 0.78, + "learning_rate": 6.9655049636531056e-06, + "logits/chosen": -0.9465211033821106, + "logits/rejected": -0.9465211033821106, + "logps/chosen": -48.897254943847656, + "logps/rejected": -48.897254943847656, + "loss": 0.375, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5422767400741577, + "rewards/margins": 0.0, + "rewards/rejected": 1.5422767400741577, + "step": 4809 + }, + { + "epoch": 0.78, + "learning_rate": 6.96429644543167e-06, + "logits/chosen": -0.9336447715759277, + "logits/rejected": -0.9549942016601562, + "logps/chosen": -78.58151245117188, + "logps/rejected": -88.30484008789062, + "loss": 1.3372, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.370160698890686, + "rewards/margins": -0.6309243440628052, + "rewards/rejected": 2.001085042953491, + "step": 4810 + }, + { + "epoch": 0.78, + "learning_rate": 6.963087791495935e-06, + "logits/chosen": -1.4733856916427612, + "logits/rejected": -1.3683202266693115, + "logps/chosen": -123.53251647949219, + "logps/rejected": -54.52227020263672, + "loss": 1.1944, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3856217861175537, + "rewards/margins": 0.7957909107208252, + "rewards/rejected": 2.5898308753967285, + "step": 4811 + }, + { + "epoch": 0.78, + "learning_rate": 6.961879001929409e-06, + "logits/chosen": -1.3882383108139038, + "logits/rejected": -1.3183497190475464, + "logps/chosen": -97.93894958496094, + "logps/rejected": -70.62588500976562, + "loss": 0.106, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7994766235351562, + "rewards/margins": 1.4781036376953125, + "rewards/rejected": 1.3213729858398438, + "step": 4812 + }, + { + "epoch": 0.78, + "learning_rate": 6.960670076815608e-06, + "logits/chosen": -0.9574994444847107, + "logits/rejected": -0.866377055644989, + "logps/chosen": -54.29909133911133, + "logps/rejected": -43.969505310058594, + "loss": 0.256, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9764026403427124, + "rewards/margins": 0.48153913021087646, + "rewards/rejected": 1.494863510131836, + "step": 4813 + }, + { + "epoch": 0.78, + "learning_rate": 6.959461016238056e-06, + "logits/chosen": -0.8712800741195679, + "logits/rejected": -0.8288971185684204, + "logps/chosen": -28.885276794433594, + "logps/rejected": -61.46812438964844, + "loss": 0.5958, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7766571044921875, + "rewards/margins": -0.7989928722381592, + "rewards/rejected": 2.5756499767303467, + "step": 4814 + }, + { + "epoch": 0.78, + "learning_rate": 6.958251820280288e-06, + "logits/chosen": -1.38716721534729, + "logits/rejected": -1.2712781429290771, + "logps/chosen": -142.4749755859375, + "logps/rejected": -25.487430572509766, + "loss": 0.0855, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.111035108566284, + "rewards/margins": 1.7012933492660522, + "rewards/rejected": 0.4097417891025543, + "step": 4815 + }, + { + "epoch": 0.78, + "learning_rate": 6.957042489025849e-06, + "logits/chosen": -1.323045253753662, + "logits/rejected": -1.2269872426986694, + "logps/chosen": -68.00827026367188, + "logps/rejected": -87.7515640258789, + "loss": 0.1257, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.079542636871338, + "rewards/margins": 1.3840394020080566, + "rewards/rejected": 2.6955032348632812, + "step": 4816 + }, + { + "epoch": 0.78, + "learning_rate": 6.955833022558292e-06, + "logits/chosen": -1.3621715307235718, + "logits/rejected": -1.329897165298462, + "logps/chosen": -105.263427734375, + "logps/rejected": -80.95364379882812, + "loss": 0.6813, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6303871870040894, + "rewards/margins": -0.9800940752029419, + "rewards/rejected": 2.6104812622070312, + "step": 4817 + }, + { + "epoch": 0.78, + "learning_rate": 6.954623420961179e-06, + "logits/chosen": -1.114646077156067, + "logits/rejected": -1.1071008443832397, + "logps/chosen": -57.706809997558594, + "logps/rejected": -99.03627014160156, + "loss": 0.1066, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.207012176513672, + "rewards/margins": 1.5257880687713623, + "rewards/rejected": 0.6812240481376648, + "step": 4818 + }, + { + "epoch": 0.78, + "learning_rate": 6.953413684318083e-06, + "logits/chosen": -0.9067685008049011, + "logits/rejected": -0.9136006236076355, + "logps/chosen": -68.39733123779297, + "logps/rejected": -100.75911712646484, + "loss": 0.5062, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.512066602706909, + "rewards/margins": -0.5258157253265381, + "rewards/rejected": 3.0378823280334473, + "step": 4819 + }, + { + "epoch": 0.78, + "learning_rate": 6.952203812712584e-06, + "logits/chosen": -1.0448601245880127, + "logits/rejected": -1.0839327573776245, + "logps/chosen": -35.80027770996094, + "logps/rejected": -52.35961151123047, + "loss": 1.5326, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.261460542678833, + "rewards/margins": -0.417324423789978, + "rewards/rejected": 1.678784966468811, + "step": 4820 + }, + { + "epoch": 0.78, + "learning_rate": 6.950993806228274e-06, + "logits/chosen": -1.3583016395568848, + "logits/rejected": -1.4206823110580444, + "logps/chosen": -63.33747863769531, + "logps/rejected": -162.96795654296875, + "loss": 2.4494, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3534622192382812, + "rewards/margins": -4.416664123535156, + "rewards/rejected": 6.7701263427734375, + "step": 4821 + }, + { + "epoch": 0.78, + "learning_rate": 6.949783664948752e-06, + "logits/chosen": -1.066644549369812, + "logits/rejected": -1.2182763814926147, + "logps/chosen": -80.55645751953125, + "logps/rejected": -119.06973266601562, + "loss": 2.3017, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.052694797515869, + "rewards/margins": -4.518716335296631, + "rewards/rejected": 7.5714111328125, + "step": 4822 + }, + { + "epoch": 0.78, + "learning_rate": 6.948573388957628e-06, + "logits/chosen": -0.9753217697143555, + "logits/rejected": -0.9766667485237122, + "logps/chosen": -46.21920394897461, + "logps/rejected": -77.70733642578125, + "loss": 0.7833, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.721491575241089, + "rewards/margins": 0.6392123699188232, + "rewards/rejected": 2.0822792053222656, + "step": 4823 + }, + { + "epoch": 0.78, + "learning_rate": 6.947362978338521e-06, + "logits/chosen": -1.2824870347976685, + "logits/rejected": -1.328170895576477, + "logps/chosen": -116.55770111083984, + "logps/rejected": -73.01348876953125, + "loss": 1.8711, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.305606842041016, + "rewards/margins": 3.7046501636505127, + "rewards/rejected": 2.600956678390503, + "step": 4824 + }, + { + "epoch": 0.78, + "learning_rate": 6.946152433175057e-06, + "logits/chosen": -1.378631830215454, + "logits/rejected": -1.2654286623001099, + "logps/chosen": -103.95323181152344, + "logps/rejected": -17.260238647460938, + "loss": 0.0387, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9561188220977783, + "rewards/margins": 2.7998127937316895, + "rewards/rejected": 0.15630607306957245, + "step": 4825 + }, + { + "epoch": 0.78, + "learning_rate": 6.944941753550877e-06, + "logits/chosen": -1.3809254169464111, + "logits/rejected": -1.3862900733947754, + "logps/chosen": -50.21654510498047, + "logps/rejected": -62.340843200683594, + "loss": 0.2301, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3304214477539062, + "rewards/margins": 0.5652511119842529, + "rewards/rejected": 1.7651703357696533, + "step": 4826 + }, + { + "epoch": 0.78, + "learning_rate": 6.943730939549623e-06, + "logits/chosen": -1.347968578338623, + "logits/rejected": -1.1667593717575073, + "logps/chosen": -50.32779312133789, + "logps/rejected": -20.242290496826172, + "loss": 0.4248, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.733578085899353, + "rewards/margins": 0.8411214351654053, + "rewards/rejected": 0.8924566507339478, + "step": 4827 + }, + { + "epoch": 0.78, + "learning_rate": 6.942519991254954e-06, + "logits/chosen": -1.360236406326294, + "logits/rejected": -1.2740364074707031, + "logps/chosen": -137.19664001464844, + "logps/rejected": -107.32743072509766, + "loss": 0.3119, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.7666335105896, + "rewards/margins": 0.627647876739502, + "rewards/rejected": 6.138985633850098, + "step": 4828 + }, + { + "epoch": 0.78, + "learning_rate": 6.941308908750533e-06, + "logits/chosen": -1.4576386213302612, + "logits/rejected": -1.2402397394180298, + "logps/chosen": -179.72006225585938, + "logps/rejected": -164.6011962890625, + "loss": 0.734, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.20456075668335, + "rewards/margins": -1.199720859527588, + "rewards/rejected": 8.404281616210938, + "step": 4829 + }, + { + "epoch": 0.78, + "learning_rate": 6.940097692120035e-06, + "logits/chosen": -1.7569921016693115, + "logits/rejected": -1.7233848571777344, + "logps/chosen": -94.50320434570312, + "logps/rejected": -157.1881561279297, + "loss": 1.1713, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5585525035858154, + "rewards/margins": -2.241162061691284, + "rewards/rejected": 4.7997145652771, + "step": 4830 + }, + { + "epoch": 0.78, + "learning_rate": 6.9388863414471445e-06, + "logits/chosen": -2.5951881408691406, + "logits/rejected": -2.4601898193359375, + "logps/chosen": -113.02725982666016, + "logps/rejected": -147.27162170410156, + "loss": 2.3875, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0139825344085693, + "rewards/margins": -4.373943328857422, + "rewards/rejected": 6.387925624847412, + "step": 4831 + }, + { + "epoch": 0.78, + "learning_rate": 6.937674856815553e-06, + "logits/chosen": -1.143237590789795, + "logits/rejected": -1.1416319608688354, + "logps/chosen": -3.008918523788452, + "logps/rejected": -6.172138214111328, + "loss": 2.5619, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5476316809654236, + "rewards/margins": 0.14553779363632202, + "rewards/rejected": 0.40209388732910156, + "step": 4832 + }, + { + "epoch": 0.78, + "learning_rate": 6.936463238308964e-06, + "logits/chosen": -0.9202571511268616, + "logits/rejected": -0.949665904045105, + "logps/chosen": -6.834721088409424, + "logps/rejected": -28.226226806640625, + "loss": 0.4068, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3910689055919647, + "rewards/margins": 0.0577521026134491, + "rewards/rejected": 0.3333168029785156, + "step": 4833 + }, + { + "epoch": 0.78, + "learning_rate": 6.9352514860110876e-06, + "logits/chosen": -1.0932581424713135, + "logits/rejected": -1.0144184827804565, + "logps/chosen": -122.19432067871094, + "logps/rejected": -36.54243850708008, + "loss": 0.1165, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.671626329421997, + "rewards/margins": 1.4353458881378174, + "rewards/rejected": 0.2362804412841797, + "step": 4834 + }, + { + "epoch": 0.78, + "learning_rate": 6.934039600005644e-06, + "logits/chosen": -1.1929551362991333, + "logits/rejected": -1.2225877046585083, + "logps/chosen": -79.51774597167969, + "logps/rejected": -109.96858215332031, + "loss": 0.4023, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3530991077423096, + "rewards/margins": -0.11247634887695312, + "rewards/rejected": 1.4655754566192627, + "step": 4835 + }, + { + "epoch": 0.78, + "learning_rate": 6.932827580376366e-06, + "logits/chosen": -1.1782621145248413, + "logits/rejected": -1.1970356702804565, + "logps/chosen": -102.22657775878906, + "logps/rejected": -57.64202117919922, + "loss": 1.3265, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.825850009918213, + "rewards/margins": -2.2457098960876465, + "rewards/rejected": 5.071559906005859, + "step": 4836 + }, + { + "epoch": 0.79, + "learning_rate": 6.93161542720699e-06, + "logits/chosen": -1.5235930681228638, + "logits/rejected": -1.5320534706115723, + "logps/chosen": -169.03125, + "logps/rejected": -164.96755981445312, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.475854873657227, + "rewards/margins": 4.1207475662231445, + "rewards/rejected": 4.355107307434082, + "step": 4837 + }, + { + "epoch": 0.79, + "learning_rate": 6.930403140581266e-06, + "logits/chosen": -1.2731647491455078, + "logits/rejected": -1.2375600337982178, + "logps/chosen": -101.53410339355469, + "logps/rejected": -60.502967834472656, + "loss": 0.208, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2450242042541504, + "rewards/margins": 0.6904823780059814, + "rewards/rejected": 2.554541826248169, + "step": 4838 + }, + { + "epoch": 0.79, + "learning_rate": 6.929190720582949e-06, + "logits/chosen": -1.1712785959243774, + "logits/rejected": -1.1263582706451416, + "logps/chosen": -43.3648796081543, + "logps/rejected": -43.12576675415039, + "loss": 0.671, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4621628522872925, + "rewards/margins": -0.6921747922897339, + "rewards/rejected": 2.1543376445770264, + "step": 4839 + }, + { + "epoch": 0.79, + "learning_rate": 6.927978167295808e-06, + "logits/chosen": -0.9149580597877502, + "logits/rejected": -0.9168371558189392, + "logps/chosen": -6.969415664672852, + "logps/rejected": -2.8988828659057617, + "loss": 0.3772, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.30977773666381836, + "rewards/margins": -0.09163886308670044, + "rewards/rejected": 0.4014165997505188, + "step": 4840 + }, + { + "epoch": 0.79, + "learning_rate": 6.926765480803619e-06, + "logits/chosen": -0.8326570391654968, + "logits/rejected": -0.8197228908538818, + "logps/chosen": -2.3925745487213135, + "logps/rejected": -4.74094295501709, + "loss": 0.4514, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.15451624989509583, + "rewards/margins": -0.1717643439769745, + "rewards/rejected": 0.3262805938720703, + "step": 4841 + }, + { + "epoch": 0.79, + "learning_rate": 6.925552661190166e-06, + "logits/chosen": -0.9366407990455627, + "logits/rejected": -0.8235595226287842, + "logps/chosen": -41.84326934814453, + "logps/rejected": -26.78290557861328, + "loss": 0.1861, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6529006958007812, + "rewards/margins": 1.1345916986465454, + "rewards/rejected": 1.5183089971542358, + "step": 4842 + }, + { + "epoch": 0.79, + "learning_rate": 6.924339708539244e-06, + "logits/chosen": -1.0413025617599487, + "logits/rejected": -1.076117753982544, + "logps/chosen": -63.806610107421875, + "logps/rejected": -77.0607681274414, + "loss": 1.3727, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2147667407989502, + "rewards/margins": -1.9148650169372559, + "rewards/rejected": 3.129631757736206, + "step": 4843 + }, + { + "epoch": 0.79, + "learning_rate": 6.923126622934656e-06, + "logits/chosen": -1.2241421937942505, + "logits/rejected": -1.145979642868042, + "logps/chosen": -215.43731689453125, + "logps/rejected": -82.6891098022461, + "loss": 0.047, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.064587593078613, + "rewards/margins": 2.326918125152588, + "rewards/rejected": 3.7376694679260254, + "step": 4844 + }, + { + "epoch": 0.79, + "learning_rate": 6.921913404460216e-06, + "logits/chosen": -0.9195969700813293, + "logits/rejected": -0.850479245185852, + "logps/chosen": -55.370567321777344, + "logps/rejected": -78.27975463867188, + "loss": 1.8754, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2067391872406006, + "rewards/margins": -3.687645673751831, + "rewards/rejected": 5.894384860992432, + "step": 4845 + }, + { + "epoch": 0.79, + "learning_rate": 6.9207000531997445e-06, + "logits/chosen": -1.3611325025558472, + "logits/rejected": -1.457660436630249, + "logps/chosen": -172.30050659179688, + "logps/rejected": -109.23828125, + "loss": 1.3329, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5119965076446533, + "rewards/margins": -2.531306505203247, + "rewards/rejected": 5.0433030128479, + "step": 4846 + }, + { + "epoch": 0.79, + "learning_rate": 6.919486569237074e-06, + "logits/chosen": -0.9916591644287109, + "logits/rejected": -0.8228143453598022, + "logps/chosen": -63.2882080078125, + "logps/rejected": -25.20909881591797, + "loss": 0.4234, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1767876148223877, + "rewards/margins": 1.7948617935180664, + "rewards/rejected": 0.3819257915019989, + "step": 4847 + }, + { + "epoch": 0.79, + "learning_rate": 6.918272952656042e-06, + "logits/chosen": -1.2466778755187988, + "logits/rejected": -1.2046111822128296, + "logps/chosen": -39.776573181152344, + "logps/rejected": -24.45766258239746, + "loss": 0.4591, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5599961280822754, + "rewards/margins": 0.7089705467224121, + "rewards/rejected": 1.8510255813598633, + "step": 4848 + }, + { + "epoch": 0.79, + "learning_rate": 6.917059203540502e-06, + "logits/chosen": -1.314000129699707, + "logits/rejected": -1.2523927688598633, + "logps/chosen": -173.03433227539062, + "logps/rejected": -71.53875732421875, + "loss": 0.4197, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.740217685699463, + "rewards/margins": -0.230743408203125, + "rewards/rejected": 2.970961093902588, + "step": 4849 + }, + { + "epoch": 0.79, + "learning_rate": 6.915845321974309e-06, + "logits/chosen": -1.1954137086868286, + "logits/rejected": -1.2054637670516968, + "logps/chosen": -24.478052139282227, + "logps/rejected": -49.48074722290039, + "loss": 1.2754, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.427349090576172, + "rewards/margins": 0.2584574222564697, + "rewards/rejected": 3.168891668319702, + "step": 4850 + }, + { + "epoch": 0.79, + "learning_rate": 6.914631308041333e-06, + "logits/chosen": -1.2819411754608154, + "logits/rejected": -1.3363012075424194, + "logps/chosen": -142.1796875, + "logps/rejected": -83.0149917602539, + "loss": 0.6259, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.4090576171875, + "rewards/margins": -0.669161319732666, + "rewards/rejected": 6.078218936920166, + "step": 4851 + }, + { + "epoch": 0.79, + "learning_rate": 6.913417161825449e-06, + "logits/chosen": -1.2586866617202759, + "logits/rejected": -1.0025068521499634, + "logps/chosen": -80.96026611328125, + "logps/rejected": -50.264862060546875, + "loss": 0.1582, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.3450188636779785, + "rewards/margins": 2.2493362426757812, + "rewards/rejected": 2.0956826210021973, + "step": 4852 + }, + { + "epoch": 0.79, + "learning_rate": 6.912202883410546e-06, + "logits/chosen": -1.2683675289154053, + "logits/rejected": -1.2288419008255005, + "logps/chosen": -66.15692138671875, + "logps/rejected": -36.346046447753906, + "loss": 0.4024, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3032569885253906, + "rewards/margins": -0.19222187995910645, + "rewards/rejected": 3.495478868484497, + "step": 4853 + }, + { + "epoch": 0.79, + "learning_rate": 6.910988472880515e-06, + "logits/chosen": -1.0634419918060303, + "logits/rejected": -1.0987592935562134, + "logps/chosen": -57.398250579833984, + "logps/rejected": -40.883663177490234, + "loss": 3.9776, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4142024517059326, + "rewards/margins": -1.6774485111236572, + "rewards/rejected": 4.09165096282959, + "step": 4854 + }, + { + "epoch": 0.79, + "learning_rate": 6.909773930319263e-06, + "logits/chosen": -0.9730265736579895, + "logits/rejected": -0.9046420454978943, + "logps/chosen": -70.79869842529297, + "logps/rejected": -60.39897155761719, + "loss": 0.1898, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.816994547843933, + "rewards/margins": 1.3070886135101318, + "rewards/rejected": 0.509905993938446, + "step": 4855 + }, + { + "epoch": 0.79, + "learning_rate": 6.908559255810704e-06, + "logits/chosen": -1.2828543186187744, + "logits/rejected": -1.2579413652420044, + "logps/chosen": -67.89225006103516, + "logps/rejected": -90.73637390136719, + "loss": 0.6038, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6198647022247314, + "rewards/margins": 0.9313461780548096, + "rewards/rejected": 1.6885185241699219, + "step": 4856 + }, + { + "epoch": 0.79, + "learning_rate": 6.9073444494387585e-06, + "logits/chosen": -1.276892900466919, + "logits/rejected": -1.2225178480148315, + "logps/chosen": -81.72950744628906, + "logps/rejected": -134.68017578125, + "loss": 0.9195, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.433964490890503, + "rewards/margins": -1.5332872867584229, + "rewards/rejected": 4.967251777648926, + "step": 4857 + }, + { + "epoch": 0.79, + "learning_rate": 6.906129511287358e-06, + "logits/chosen": -1.0655630826950073, + "logits/rejected": -1.0651909112930298, + "logps/chosen": -69.0433349609375, + "logps/rejected": -85.2181167602539, + "loss": 0.579, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.712867021560669, + "rewards/margins": 0.17651450634002686, + "rewards/rejected": 1.536352515220642, + "step": 4858 + }, + { + "epoch": 0.79, + "learning_rate": 6.904914441440447e-06, + "logits/chosen": -1.0302730798721313, + "logits/rejected": -0.9051446318626404, + "logps/chosen": -51.58555221557617, + "logps/rejected": -80.08377838134766, + "loss": 1.9601, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4723827838897705, + "rewards/margins": 1.0326069593429565, + "rewards/rejected": 1.439775824546814, + "step": 4859 + }, + { + "epoch": 0.79, + "learning_rate": 6.90369923998197e-06, + "logits/chosen": -1.0611529350280762, + "logits/rejected": -1.0827564001083374, + "logps/chosen": -90.2716064453125, + "logps/rejected": -59.47999572753906, + "loss": 0.6784, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.64434814453125, + "rewards/margins": 0.12001192569732666, + "rewards/rejected": 1.5243362188339233, + "step": 4860 + }, + { + "epoch": 0.79, + "learning_rate": 6.902483906995889e-06, + "logits/chosen": -1.3546391725540161, + "logits/rejected": -1.26410973072052, + "logps/chosen": -97.35250091552734, + "logps/rejected": -30.518953323364258, + "loss": 0.0965, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5930588245391846, + "rewards/margins": 1.5575544834136963, + "rewards/rejected": 1.0355043411254883, + "step": 4861 + }, + { + "epoch": 0.79, + "learning_rate": 6.901268442566172e-06, + "logits/chosen": -1.1744884252548218, + "logits/rejected": -1.1342757940292358, + "logps/chosen": -67.31440734863281, + "logps/rejected": -57.4395637512207, + "loss": 2.0747, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.840221405029297, + "rewards/margins": 1.2922145128250122, + "rewards/rejected": 1.5480068922042847, + "step": 4862 + }, + { + "epoch": 0.79, + "learning_rate": 6.900052846776796e-06, + "logits/chosen": -1.656954288482666, + "logits/rejected": -1.649361491203308, + "logps/chosen": -225.55459594726562, + "logps/rejected": -21.55714988708496, + "loss": 0.0326, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.620156764984131, + "rewards/margins": 4.5071516036987305, + "rewards/rejected": 0.11300525814294815, + "step": 4863 + }, + { + "epoch": 0.79, + "learning_rate": 6.898837119711746e-06, + "logits/chosen": -0.9099184274673462, + "logits/rejected": -0.8205564618110657, + "logps/chosen": -85.38395690917969, + "logps/rejected": -74.18679809570312, + "loss": 0.881, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.939996361732483, + "rewards/margins": -1.2238472700119019, + "rewards/rejected": 3.1638436317443848, + "step": 4864 + }, + { + "epoch": 0.79, + "learning_rate": 6.897621261455018e-06, + "logits/chosen": -1.0151197910308838, + "logits/rejected": -1.0320979356765747, + "logps/chosen": -143.71554565429688, + "logps/rejected": -105.46109771728516, + "loss": 1.155, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8832123279571533, + "rewards/margins": -2.2036402225494385, + "rewards/rejected": 5.086852550506592, + "step": 4865 + }, + { + "epoch": 0.79, + "learning_rate": 6.8964052720906175e-06, + "logits/chosen": -0.8367714285850525, + "logits/rejected": -0.8624582290649414, + "logps/chosen": -211.49427795410156, + "logps/rejected": -123.52307891845703, + "loss": 1.549, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.222349643707275, + "rewards/margins": -2.4441232681274414, + "rewards/rejected": 7.666472911834717, + "step": 4866 + }, + { + "epoch": 0.79, + "learning_rate": 6.895189151702555e-06, + "logits/chosen": -1.268607497215271, + "logits/rejected": -1.282726764678955, + "logps/chosen": -86.16149139404297, + "logps/rejected": -58.92460632324219, + "loss": 0.6722, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9535720348358154, + "rewards/margins": -0.9158601760864258, + "rewards/rejected": 3.869432210922241, + "step": 4867 + }, + { + "epoch": 0.79, + "learning_rate": 6.893972900374856e-06, + "logits/chosen": -0.4723210632801056, + "logits/rejected": -0.44588178396224976, + "logps/chosen": -19.958301544189453, + "logps/rejected": -18.2205810546875, + "loss": 1.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5437256097793579, + "rewards/margins": 0.09108200669288635, + "rewards/rejected": 0.45264360308647156, + "step": 4868 + }, + { + "epoch": 0.79, + "learning_rate": 6.892756518191549e-06, + "logits/chosen": -1.3769500255584717, + "logits/rejected": -1.3641853332519531, + "logps/chosen": -115.05146789550781, + "logps/rejected": -319.71002197265625, + "loss": 1.6335, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.9894180297851562, + "rewards/margins": -3.1665663719177246, + "rewards/rejected": 7.155984401702881, + "step": 4869 + }, + { + "epoch": 0.79, + "learning_rate": 6.8915400052366756e-06, + "logits/chosen": -1.2068355083465576, + "logits/rejected": -1.1245242357254028, + "logps/chosen": -59.71723175048828, + "logps/rejected": -19.51995086669922, + "loss": 0.3857, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.250936269760132, + "rewards/margins": 0.6257474422454834, + "rewards/rejected": 1.6251888275146484, + "step": 4870 + }, + { + "epoch": 0.79, + "learning_rate": 6.890323361594286e-06, + "logits/chosen": -1.2746224403381348, + "logits/rejected": -1.2919045686721802, + "logps/chosen": -52.16791534423828, + "logps/rejected": -74.34761047363281, + "loss": 2.438, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9028865694999695, + "rewards/margins": -0.27920347452163696, + "rewards/rejected": 1.1820900440216064, + "step": 4871 + }, + { + "epoch": 0.79, + "learning_rate": 6.889106587348441e-06, + "logits/chosen": -0.9913875460624695, + "logits/rejected": -0.8686845898628235, + "logps/chosen": -59.91181945800781, + "logps/rejected": -32.421836853027344, + "loss": 0.5487, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1305274963378906, + "rewards/margins": -0.5027604103088379, + "rewards/rejected": 2.6332879066467285, + "step": 4872 + }, + { + "epoch": 0.79, + "learning_rate": 6.887889682583204e-06, + "logits/chosen": -1.28287672996521, + "logits/rejected": -1.2696468830108643, + "logps/chosen": -53.652618408203125, + "logps/rejected": -69.31066131591797, + "loss": 1.2521, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3381661176681519, + "rewards/margins": 1.3681557178497314, + "rewards/rejected": -0.02998962439596653, + "step": 4873 + }, + { + "epoch": 0.79, + "learning_rate": 6.886672647382653e-06, + "logits/chosen": -1.1907925605773926, + "logits/rejected": -1.1907925605773926, + "logps/chosen": -82.73011016845703, + "logps/rejected": -82.73011016845703, + "loss": 2.26, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6020302772521973, + "rewards/margins": 0.0, + "rewards/rejected": 3.6020302772521973, + "step": 4874 + }, + { + "epoch": 0.79, + "learning_rate": 6.885455481830874e-06, + "logits/chosen": -0.8455134034156799, + "logits/rejected": -0.9200377464294434, + "logps/chosen": -25.435461044311523, + "logps/rejected": -70.95191955566406, + "loss": 1.107, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5517662167549133, + "rewards/margins": -1.192263126373291, + "rewards/rejected": 1.7440292835235596, + "step": 4875 + }, + { + "epoch": 0.79, + "learning_rate": 6.884238186011962e-06, + "logits/chosen": -1.3152297735214233, + "logits/rejected": -1.2732700109481812, + "logps/chosen": -56.99848556518555, + "logps/rejected": -80.01485443115234, + "loss": 0.3959, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.489596962928772, + "rewards/margins": -0.18002355098724365, + "rewards/rejected": 1.6696205139160156, + "step": 4876 + }, + { + "epoch": 0.79, + "learning_rate": 6.88302076001002e-06, + "logits/chosen": -0.6435628533363342, + "logits/rejected": -0.1860702782869339, + "logps/chosen": -102.54581451416016, + "logps/rejected": -127.9029312133789, + "loss": 1.716, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4051254987716675, + "rewards/margins": -2.1840171813964844, + "rewards/rejected": 3.5891425609588623, + "step": 4877 + }, + { + "epoch": 0.79, + "learning_rate": 6.881803203909161e-06, + "logits/chosen": -0.8857792019844055, + "logits/rejected": -0.8997642993927002, + "logps/chosen": -66.02703857421875, + "logps/rejected": -86.44796752929688, + "loss": 1.4957, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4137314558029175, + "rewards/margins": -1.3315452337265015, + "rewards/rejected": 2.745276689529419, + "step": 4878 + }, + { + "epoch": 0.79, + "learning_rate": 6.880585517793508e-06, + "logits/chosen": -1.0663305521011353, + "logits/rejected": -1.054206132888794, + "logps/chosen": -77.41314697265625, + "logps/rejected": -54.89059829711914, + "loss": 0.7673, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4452804327011108, + "rewards/margins": -1.2528592348098755, + "rewards/rejected": 2.6981396675109863, + "step": 4879 + }, + { + "epoch": 0.79, + "learning_rate": 6.879367701747188e-06, + "logits/chosen": -1.2565420866012573, + "logits/rejected": -1.1039263010025024, + "logps/chosen": -85.077392578125, + "logps/rejected": -15.794733047485352, + "loss": 0.0365, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.046856880187988, + "rewards/margins": 5.067473411560059, + "rewards/rejected": 0.9793832898139954, + "step": 4880 + }, + { + "epoch": 0.79, + "learning_rate": 6.878149755854343e-06, + "logits/chosen": -1.0164510011672974, + "logits/rejected": -1.0142827033996582, + "logps/chosen": -3.1611204147338867, + "logps/rejected": -3.351328134536743, + "loss": 0.7429, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2754943370819092, + "rewards/margins": -0.18997034430503845, + "rewards/rejected": 0.46546468138694763, + "step": 4881 + }, + { + "epoch": 0.79, + "learning_rate": 6.876931680199121e-06, + "logits/chosen": -1.275665521621704, + "logits/rejected": -0.8920599222183228, + "logps/chosen": -127.4614486694336, + "logps/rejected": -33.451168060302734, + "loss": 0.4106, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.155185699462891, + "rewards/margins": 4.956075191497803, + "rewards/rejected": 0.19911041855812073, + "step": 4882 + }, + { + "epoch": 0.79, + "learning_rate": 6.875713474865679e-06, + "logits/chosen": -0.5857945680618286, + "logits/rejected": -0.6508510708808899, + "logps/chosen": -1.949427604675293, + "logps/rejected": -59.48772430419922, + "loss": 0.6887, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.49009647965431213, + "rewards/margins": -0.8956739902496338, + "rewards/rejected": 1.3857704401016235, + "step": 4883 + }, + { + "epoch": 0.79, + "learning_rate": 6.874495139938186e-06, + "logits/chosen": -1.0360270738601685, + "logits/rejected": -0.9236109256744385, + "logps/chosen": -70.93031311035156, + "logps/rejected": -45.55942153930664, + "loss": 0.5772, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0151703357696533, + "rewards/margins": -0.7691738605499268, + "rewards/rejected": 2.78434419631958, + "step": 4884 + }, + { + "epoch": 0.79, + "learning_rate": 6.8732766755008126e-06, + "logits/chosen": -0.8993263840675354, + "logits/rejected": -0.8344796299934387, + "logps/chosen": -39.33887481689453, + "logps/rejected": -32.06373977661133, + "loss": 0.2502, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6754472255706787, + "rewards/margins": 0.7675614356994629, + "rewards/rejected": 2.907885789871216, + "step": 4885 + }, + { + "epoch": 0.79, + "learning_rate": 6.872058081637748e-06, + "logits/chosen": -0.9556883573532104, + "logits/rejected": -0.9472829699516296, + "logps/chosen": -8.654119491577148, + "logps/rejected": -13.162464141845703, + "loss": 1.1584, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4143288731575012, + "rewards/margins": -0.4800249934196472, + "rewards/rejected": 0.8943538665771484, + "step": 4886 + }, + { + "epoch": 0.79, + "learning_rate": 6.870839358433183e-06, + "logits/chosen": -0.9569290280342102, + "logits/rejected": -0.9050916433334351, + "logps/chosen": -56.359886169433594, + "logps/rejected": -44.94775390625, + "loss": 2.0514, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.992865800857544, + "rewards/margins": 0.4355407953262329, + "rewards/rejected": 1.557325005531311, + "step": 4887 + }, + { + "epoch": 0.79, + "learning_rate": 6.86962050597132e-06, + "logits/chosen": -0.7376583218574524, + "logits/rejected": -0.7186634540557861, + "logps/chosen": -20.424230575561523, + "logps/rejected": -15.117685317993164, + "loss": 1.3791, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24927940964698792, + "rewards/margins": 0.010070234537124634, + "rewards/rejected": 0.23920917510986328, + "step": 4888 + }, + { + "epoch": 0.79, + "learning_rate": 6.868401524336371e-06, + "logits/chosen": -1.2491981983184814, + "logits/rejected": -1.254220962524414, + "logps/chosen": -91.41822052001953, + "logps/rejected": -70.62677001953125, + "loss": 1.8813, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.073446035385132, + "rewards/margins": -3.738929033279419, + "rewards/rejected": 5.812375068664551, + "step": 4889 + }, + { + "epoch": 0.79, + "learning_rate": 6.867182413612556e-06, + "logits/chosen": -1.2191870212554932, + "logits/rejected": -1.2243740558624268, + "logps/chosen": -49.64710998535156, + "logps/rejected": -54.282020568847656, + "loss": 0.8968, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0469932556152344, + "rewards/margins": -0.34778904914855957, + "rewards/rejected": 3.394782304763794, + "step": 4890 + }, + { + "epoch": 0.79, + "learning_rate": 6.865963173884103e-06, + "logits/chosen": -1.3540560007095337, + "logits/rejected": -1.3771764039993286, + "logps/chosen": -107.5509033203125, + "logps/rejected": -77.95503997802734, + "loss": 0.5609, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6749595403671265, + "rewards/margins": -0.6647950410842896, + "rewards/rejected": 2.339754581451416, + "step": 4891 + }, + { + "epoch": 0.79, + "learning_rate": 6.864743805235251e-06, + "logits/chosen": -0.798771858215332, + "logits/rejected": -0.7924261093139648, + "logps/chosen": -74.6158447265625, + "logps/rejected": -54.74113464355469, + "loss": 1.4753, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.449903130531311, + "rewards/margins": -2.77235746383667, + "rewards/rejected": 4.222260475158691, + "step": 4892 + }, + { + "epoch": 0.79, + "learning_rate": 6.8635243077502465e-06, + "logits/chosen": -1.465596318244934, + "logits/rejected": -1.4513137340545654, + "logps/chosen": -98.44319152832031, + "logps/rejected": -25.988384246826172, + "loss": 1.3278, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1108429431915283, + "rewards/margins": 0.19353872537612915, + "rewards/rejected": 0.9173042178153992, + "step": 4893 + }, + { + "epoch": 0.79, + "learning_rate": 6.862304681513344e-06, + "logits/chosen": -1.0031715631484985, + "logits/rejected": -1.0520973205566406, + "logps/chosen": -45.64695739746094, + "logps/rejected": -87.37013244628906, + "loss": 0.6755, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9041088223457336, + "rewards/margins": -0.18898051977157593, + "rewards/rejected": 1.0930893421173096, + "step": 4894 + }, + { + "epoch": 0.79, + "learning_rate": 6.861084926608811e-06, + "logits/chosen": -0.741355836391449, + "logits/rejected": -0.8292490839958191, + "logps/chosen": -69.52396392822266, + "logps/rejected": -127.56112670898438, + "loss": 1.2647, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8274093866348267, + "rewards/margins": -0.8864349126815796, + "rewards/rejected": 2.7138442993164062, + "step": 4895 + }, + { + "epoch": 0.79, + "learning_rate": 6.859865043120919e-06, + "logits/chosen": -1.3372057676315308, + "logits/rejected": -1.1731526851654053, + "logps/chosen": -103.41069030761719, + "logps/rejected": -63.12034606933594, + "loss": 0.1399, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.304482936859131, + "rewards/margins": 1.1561470031738281, + "rewards/rejected": 5.148335933685303, + "step": 4896 + }, + { + "epoch": 0.79, + "learning_rate": 6.85864503113395e-06, + "logits/chosen": -1.258723497390747, + "logits/rejected": -1.4606965780258179, + "logps/chosen": -64.77525329589844, + "logps/rejected": -34.8714485168457, + "loss": 0.5283, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2594192028045654, + "rewards/margins": 2.8244526386260986, + "rewards/rejected": 0.43496665358543396, + "step": 4897 + }, + { + "epoch": 0.8, + "learning_rate": 6.857424890732195e-06, + "logits/chosen": -1.0050956010818481, + "logits/rejected": -1.0062565803527832, + "logps/chosen": -78.00735473632812, + "logps/rejected": -35.82456970214844, + "loss": 0.2357, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2408645153045654, + "rewards/margins": 0.5122359991073608, + "rewards/rejected": 1.7286285161972046, + "step": 4898 + }, + { + "epoch": 0.8, + "learning_rate": 6.8562046219999565e-06, + "logits/chosen": -1.3701813220977783, + "logits/rejected": -1.3017823696136475, + "logps/chosen": -108.62551879882812, + "logps/rejected": -67.97554016113281, + "loss": 1.4541, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.166569471359253, + "rewards/margins": 0.9457221031188965, + "rewards/rejected": 1.2208473682403564, + "step": 4899 + }, + { + "epoch": 0.8, + "learning_rate": 6.854984225021541e-06, + "logits/chosen": -0.8825790882110596, + "logits/rejected": -0.7979987263679504, + "logps/chosen": -43.72559356689453, + "logps/rejected": -90.7376937866211, + "loss": 0.2222, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4249305725097656, + "rewards/margins": 1.248497724533081, + "rewards/rejected": 2.1764328479766846, + "step": 4900 + }, + { + "epoch": 0.8, + "learning_rate": 6.853763699881269e-06, + "logits/chosen": -0.49049919843673706, + "logits/rejected": -0.49049919843673706, + "logps/chosen": -1.5670735836029053, + "logps/rejected": -1.5670735836029053, + "loss": 0.348, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1806354820728302, + "rewards/margins": 0.0, + "rewards/rejected": 0.1806354820728302, + "step": 4901 + }, + { + "epoch": 0.8, + "learning_rate": 6.852543046663467e-06, + "logits/chosen": -0.7294016480445862, + "logits/rejected": -0.7262460589408875, + "logps/chosen": -2.8545732498168945, + "logps/rejected": -12.271172523498535, + "loss": 0.5355, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8171688318252563, + "rewards/margins": -0.06877356767654419, + "rewards/rejected": 0.8859423995018005, + "step": 4902 + }, + { + "epoch": 0.8, + "learning_rate": 6.851322265452467e-06, + "logits/chosen": -1.3188254833221436, + "logits/rejected": -1.3067774772644043, + "logps/chosen": -69.15380859375, + "logps/rejected": -71.85362243652344, + "loss": 0.2275, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0574920177459717, + "rewards/margins": 0.6570662260055542, + "rewards/rejected": 1.4004257917404175, + "step": 4903 + }, + { + "epoch": 0.8, + "learning_rate": 6.850101356332617e-06, + "logits/chosen": -1.0368261337280273, + "logits/rejected": -1.0353175401687622, + "logps/chosen": -13.036067008972168, + "logps/rejected": -2.4712462425231934, + "loss": 0.4287, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.24337922036647797, + "rewards/margins": -0.15054650604724884, + "rewards/rejected": 0.3939257264137268, + "step": 4904 + }, + { + "epoch": 0.8, + "learning_rate": 6.848880319388269e-06, + "logits/chosen": -1.4272133111953735, + "logits/rejected": -1.168473243713379, + "logps/chosen": -78.62110900878906, + "logps/rejected": -91.85189819335938, + "loss": 1.3802, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.069629192352295, + "rewards/margins": -2.596169948577881, + "rewards/rejected": 6.665799140930176, + "step": 4905 + }, + { + "epoch": 0.8, + "learning_rate": 6.847659154703785e-06, + "logits/chosen": -1.2580000162124634, + "logits/rejected": -1.2147691249847412, + "logps/chosen": -54.40690231323242, + "logps/rejected": -89.18994140625, + "loss": 0.6042, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.136011838912964, + "rewards/margins": -0.8478336334228516, + "rewards/rejected": 2.9838454723358154, + "step": 4906 + }, + { + "epoch": 0.8, + "learning_rate": 6.846437862363536e-06, + "logits/chosen": -1.152169108390808, + "logits/rejected": -1.1780338287353516, + "logps/chosen": -131.83326721191406, + "logps/rejected": -111.88352966308594, + "loss": 0.5222, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.35610818862915, + "rewards/margins": -0.5284819602966309, + "rewards/rejected": 5.884590148925781, + "step": 4907 + }, + { + "epoch": 0.8, + "learning_rate": 6.845216442451902e-06, + "logits/chosen": -1.0674378871917725, + "logits/rejected": -1.165602684020996, + "logps/chosen": -62.02434539794922, + "logps/rejected": -125.0243148803711, + "loss": 1.6471, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.639021396636963, + "rewards/margins": -2.897136688232422, + "rewards/rejected": 5.536158084869385, + "step": 4908 + }, + { + "epoch": 0.8, + "learning_rate": 6.843994895053272e-06, + "logits/chosen": -1.3153717517852783, + "logits/rejected": -1.2958133220672607, + "logps/chosen": -97.71421813964844, + "logps/rejected": -81.26535034179688, + "loss": 0.7344, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6533714532852173, + "rewards/margins": -1.0252631902694702, + "rewards/rejected": 2.6786346435546875, + "step": 4909 + }, + { + "epoch": 0.8, + "learning_rate": 6.842773220252041e-06, + "logits/chosen": -1.3750338554382324, + "logits/rejected": -1.306528091430664, + "logps/chosen": -81.84526824951172, + "logps/rejected": -14.042317390441895, + "loss": 0.8035, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4223923683166504, + "rewards/margins": 2.1103360652923584, + "rewards/rejected": 0.3120562732219696, + "step": 4910 + }, + { + "epoch": 0.8, + "learning_rate": 6.8415514181326195e-06, + "logits/chosen": -0.816984236240387, + "logits/rejected": -0.8187933564186096, + "logps/chosen": -59.11918640136719, + "logps/rejected": -87.01730346679688, + "loss": 0.352, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.435237169265747, + "rewards/margins": -0.020160675048828125, + "rewards/rejected": 1.4553978443145752, + "step": 4911 + }, + { + "epoch": 0.8, + "learning_rate": 6.840329488779418e-06, + "logits/chosen": -1.3194924592971802, + "logits/rejected": -1.2368804216384888, + "logps/chosen": -66.82862091064453, + "logps/rejected": -17.018211364746094, + "loss": 1.2591, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6899116039276123, + "rewards/margins": 2.5416486263275146, + "rewards/rejected": 0.14826297760009766, + "step": 4912 + }, + { + "epoch": 0.8, + "learning_rate": 6.839107432276864e-06, + "logits/chosen": -0.8967552185058594, + "logits/rejected": -0.7868108749389648, + "logps/chosen": -35.736732482910156, + "logps/rejected": -5.891783714294434, + "loss": 0.917, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.488499402999878, + "rewards/margins": 1.5807791948318481, + "rewards/rejected": 0.9077202081680298, + "step": 4913 + }, + { + "epoch": 0.8, + "learning_rate": 6.837885248709386e-06, + "logits/chosen": -1.0772801637649536, + "logits/rejected": -1.1974565982818604, + "logps/chosen": -109.42613220214844, + "logps/rejected": -108.06800842285156, + "loss": 1.5752, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7244460582733154, + "rewards/margins": -2.997509717941284, + "rewards/rejected": 5.7219557762146, + "step": 4914 + }, + { + "epoch": 0.8, + "learning_rate": 6.836662938161429e-06, + "logits/chosen": -1.2414888143539429, + "logits/rejected": -1.1925278902053833, + "logps/chosen": -84.81202697753906, + "logps/rejected": -73.67367553710938, + "loss": 0.0287, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.678868293762207, + "rewards/margins": 3.5079164505004883, + "rewards/rejected": 2.1709518432617188, + "step": 4915 + }, + { + "epoch": 0.8, + "learning_rate": 6.835440500717441e-06, + "logits/chosen": -1.386311650276184, + "logits/rejected": -1.4680761098861694, + "logps/chosen": -109.47962951660156, + "logps/rejected": -143.29119873046875, + "loss": 2.197, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.715123176574707, + "rewards/margins": -2.683591842651367, + "rewards/rejected": 9.398715019226074, + "step": 4916 + }, + { + "epoch": 0.8, + "learning_rate": 6.834217936461882e-06, + "logits/chosen": -0.963312029838562, + "logits/rejected": -0.9406237602233887, + "logps/chosen": -94.82064819335938, + "logps/rejected": -61.43346405029297, + "loss": 1.0882, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7088721990585327, + "rewards/margins": -0.7630897760391235, + "rewards/rejected": 2.4719619750976562, + "step": 4917 + }, + { + "epoch": 0.8, + "learning_rate": 6.832995245479219e-06, + "logits/chosen": -1.036037564277649, + "logits/rejected": -0.9799975156784058, + "logps/chosen": -56.11934280395508, + "logps/rejected": -92.72075653076172, + "loss": 0.1805, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.586336851119995, + "rewards/margins": 0.8452891111373901, + "rewards/rejected": 1.741047739982605, + "step": 4918 + }, + { + "epoch": 0.8, + "learning_rate": 6.831772427853929e-06, + "logits/chosen": -0.9033800959587097, + "logits/rejected": -0.896988034248352, + "logps/chosen": -21.980106353759766, + "logps/rejected": -10.547645568847656, + "loss": 2.6103, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9157371520996094, + "rewards/margins": 0.8494119644165039, + "rewards/rejected": 1.0663251876831055, + "step": 4919 + }, + { + "epoch": 0.8, + "learning_rate": 6.830549483670495e-06, + "logits/chosen": -1.0933842658996582, + "logits/rejected": -1.046752691268921, + "logps/chosen": -64.75314331054688, + "logps/rejected": -45.53215026855469, + "loss": 0.364, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0318429470062256, + "rewards/margins": -0.06283187866210938, + "rewards/rejected": 2.094674825668335, + "step": 4920 + }, + { + "epoch": 0.8, + "learning_rate": 6.8293264130134144e-06, + "logits/chosen": -0.5509806871414185, + "logits/rejected": -0.5261163115501404, + "logps/chosen": -20.615877151489258, + "logps/rejected": -1.4363802671432495, + "loss": 0.4202, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5665489435195923, + "rewards/margins": 0.26907283067703247, + "rewards/rejected": 0.2974761128425598, + "step": 4921 + }, + { + "epoch": 0.8, + "learning_rate": 6.8281032159671865e-06, + "logits/chosen": -1.5259112119674683, + "logits/rejected": -1.5126032829284668, + "logps/chosen": -148.7093963623047, + "logps/rejected": -87.48358154296875, + "loss": 0.3533, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.689090251922607, + "rewards/margins": 2.4080309867858887, + "rewards/rejected": 3.2810592651367188, + "step": 4922 + }, + { + "epoch": 0.8, + "learning_rate": 6.826879892616325e-06, + "logits/chosen": -0.6174428462982178, + "logits/rejected": -0.6612465977668762, + "logps/chosen": -30.748146057128906, + "logps/rejected": -51.31707000732422, + "loss": 0.6362, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.295386552810669, + "rewards/margins": -0.6388657093048096, + "rewards/rejected": 2.9342522621154785, + "step": 4923 + }, + { + "epoch": 0.8, + "learning_rate": 6.825656443045348e-06, + "logits/chosen": -1.0234079360961914, + "logits/rejected": -1.0367543697357178, + "logps/chosen": -52.99784851074219, + "logps/rejected": -108.25576782226562, + "loss": 0.2115, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8389923572540283, + "rewards/margins": 0.7720634937286377, + "rewards/rejected": 1.0669288635253906, + "step": 4924 + }, + { + "epoch": 0.8, + "learning_rate": 6.824432867338786e-06, + "logits/chosen": -1.3065855503082275, + "logits/rejected": -1.2727254629135132, + "logps/chosen": -62.532196044921875, + "logps/rejected": -26.53986167907715, + "loss": 0.1602, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4329726696014404, + "rewards/margins": 1.1791691780090332, + "rewards/rejected": 1.2538034915924072, + "step": 4925 + }, + { + "epoch": 0.8, + "learning_rate": 6.823209165581176e-06, + "logits/chosen": -1.2294546365737915, + "logits/rejected": -1.1356701850891113, + "logps/chosen": -62.70695495605469, + "logps/rejected": -46.412132263183594, + "loss": 1.0627, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.639888048171997, + "rewards/margins": -0.4745819568634033, + "rewards/rejected": 3.1144700050354004, + "step": 4926 + }, + { + "epoch": 0.8, + "learning_rate": 6.821985337857063e-06, + "logits/chosen": -1.1671619415283203, + "logits/rejected": -1.079293131828308, + "logps/chosen": -61.41764831542969, + "logps/rejected": -55.342803955078125, + "loss": 0.8562, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.4061198234558105, + "rewards/margins": 2.3483598232269287, + "rewards/rejected": 3.057760000228882, + "step": 4927 + }, + { + "epoch": 0.8, + "learning_rate": 6.820761384251004e-06, + "logits/chosen": -1.0712018013000488, + "logits/rejected": -1.0192596912384033, + "logps/chosen": -59.79408645629883, + "logps/rejected": -50.2014274597168, + "loss": 0.2822, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.650972366333008, + "rewards/margins": 0.2883446216583252, + "rewards/rejected": 2.3626277446746826, + "step": 4928 + }, + { + "epoch": 0.8, + "learning_rate": 6.819537304847561e-06, + "logits/chosen": -0.894473135471344, + "logits/rejected": -0.8199801445007324, + "logps/chosen": -50.212806701660156, + "logps/rejected": -31.943370819091797, + "loss": 0.3728, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.349560499191284, + "rewards/margins": -0.07111144065856934, + "rewards/rejected": 2.4206719398498535, + "step": 4929 + }, + { + "epoch": 0.8, + "learning_rate": 6.818313099731308e-06, + "logits/chosen": -1.2487778663635254, + "logits/rejected": -1.1337976455688477, + "logps/chosen": -58.395652770996094, + "logps/rejected": -70.9467544555664, + "loss": 0.9819, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0692498683929443, + "rewards/margins": -0.25284647941589355, + "rewards/rejected": 2.322096347808838, + "step": 4930 + }, + { + "epoch": 0.8, + "learning_rate": 6.817088768986823e-06, + "logits/chosen": -1.2996052503585815, + "logits/rejected": -0.807044506072998, + "logps/chosen": -123.1317138671875, + "logps/rejected": -107.0207748413086, + "loss": 1.1903, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.133766174316406, + "rewards/margins": 0.0866246223449707, + "rewards/rejected": 6.0471415519714355, + "step": 4931 + }, + { + "epoch": 0.8, + "learning_rate": 6.815864312698699e-06, + "logits/chosen": -0.993317723274231, + "logits/rejected": -0.993317723274231, + "logps/chosen": -57.188262939453125, + "logps/rejected": -57.188262939453125, + "loss": 1.0221, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8261916637420654, + "rewards/margins": 0.0, + "rewards/rejected": 2.8261916637420654, + "step": 4932 + }, + { + "epoch": 0.8, + "learning_rate": 6.814639730951532e-06, + "logits/chosen": -0.9808138012886047, + "logits/rejected": -0.9309849143028259, + "logps/chosen": -58.22234344482422, + "logps/rejected": -56.1050910949707, + "loss": 0.5622, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5295021533966064, + "rewards/margins": 0.6177104711532593, + "rewards/rejected": 1.9117916822433472, + "step": 4933 + }, + { + "epoch": 0.8, + "learning_rate": 6.81341502382993e-06, + "logits/chosen": -1.638693928718567, + "logits/rejected": -1.59002685546875, + "logps/chosen": -152.5782470703125, + "logps/rejected": -49.19329833984375, + "loss": 0.0763, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.925523519515991, + "rewards/margins": 3.064636707305908, + "rewards/rejected": 0.8608867526054382, + "step": 4934 + }, + { + "epoch": 0.8, + "learning_rate": 6.812190191418508e-06, + "logits/chosen": -1.2793004512786865, + "logits/rejected": -1.4258290529251099, + "logps/chosen": -88.39904022216797, + "logps/rejected": -112.16253662109375, + "loss": 2.9162, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3192269802093506, + "rewards/margins": -5.483183860778809, + "rewards/rejected": 7.802411079406738, + "step": 4935 + }, + { + "epoch": 0.8, + "learning_rate": 6.810965233801893e-06, + "logits/chosen": -0.9371641874313354, + "logits/rejected": -0.9432921409606934, + "logps/chosen": -52.762428283691406, + "logps/rejected": -86.40333557128906, + "loss": 0.1786, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7235320806503296, + "rewards/margins": 0.8856727480888367, + "rewards/rejected": 0.8378593325614929, + "step": 4936 + }, + { + "epoch": 0.8, + "learning_rate": 6.809740151064714e-06, + "logits/chosen": -1.4685643911361694, + "logits/rejected": -1.395302653312683, + "logps/chosen": -90.41462707519531, + "logps/rejected": -24.754802703857422, + "loss": 0.1114, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.305088758468628, + "rewards/margins": 2.535806179046631, + "rewards/rejected": -0.2307174652814865, + "step": 4937 + }, + { + "epoch": 0.8, + "learning_rate": 6.8085149432916155e-06, + "logits/chosen": -1.6117675304412842, + "logits/rejected": -1.5314748287200928, + "logps/chosen": -112.35856628417969, + "logps/rejected": -28.454214096069336, + "loss": 0.1665, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6732895374298096, + "rewards/margins": 1.292847990989685, + "rewards/rejected": 1.3804415464401245, + "step": 4938 + }, + { + "epoch": 0.8, + "learning_rate": 6.8072896105672455e-06, + "logits/chosen": -0.9768097996711731, + "logits/rejected": -0.9054771065711975, + "logps/chosen": -62.85517501831055, + "logps/rejected": -37.851776123046875, + "loss": 0.9592, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.082279682159424, + "rewards/margins": -0.4681277275085449, + "rewards/rejected": 2.5504074096679688, + "step": 4939 + }, + { + "epoch": 0.8, + "learning_rate": 6.806064152976265e-06, + "logits/chosen": -0.8570451736450195, + "logits/rejected": -0.8645883202552795, + "logps/chosen": -218.00473022460938, + "logps/rejected": -77.53462219238281, + "loss": 0.0243, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.160330295562744, + "rewards/margins": 3.3964760303497314, + "rewards/rejected": 1.7638542652130127, + "step": 4940 + }, + { + "epoch": 0.8, + "learning_rate": 6.80483857060334e-06, + "logits/chosen": -1.1000959873199463, + "logits/rejected": -1.0375723838806152, + "logps/chosen": -59.569183349609375, + "logps/rejected": -42.98442840576172, + "loss": 0.1739, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6251542568206787, + "rewards/margins": 1.6123254299163818, + "rewards/rejected": 1.0128288269042969, + "step": 4941 + }, + { + "epoch": 0.8, + "learning_rate": 6.803612863533149e-06, + "logits/chosen": -1.1816837787628174, + "logits/rejected": -1.2711225748062134, + "logps/chosen": -86.26148986816406, + "logps/rejected": -115.83554077148438, + "loss": 0.4997, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.688285827636719, + "rewards/margins": -0.5375781059265137, + "rewards/rejected": 7.225863933563232, + "step": 4942 + }, + { + "epoch": 0.8, + "learning_rate": 6.802387031850372e-06, + "logits/chosen": -0.9905698895454407, + "logits/rejected": -0.9905698895454407, + "logps/chosen": -42.85993957519531, + "logps/rejected": -42.85993957519531, + "loss": 2.7974, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.179236650466919, + "rewards/margins": 0.0, + "rewards/rejected": 1.179236650466919, + "step": 4943 + }, + { + "epoch": 0.8, + "learning_rate": 6.801161075639709e-06, + "logits/chosen": -1.2343478202819824, + "logits/rejected": -1.1917998790740967, + "logps/chosen": -79.12870788574219, + "logps/rejected": -68.0132064819336, + "loss": 2.9401, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5119765996932983, + "rewards/margins": 0.045697689056396484, + "rewards/rejected": 1.4662789106369019, + "step": 4944 + }, + { + "epoch": 0.8, + "learning_rate": 6.799934994985856e-06, + "logits/chosen": -0.6949248313903809, + "logits/rejected": -0.7239136695861816, + "logps/chosen": -4.765040874481201, + "logps/rejected": -81.88902282714844, + "loss": 0.8951, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3867824971675873, + "rewards/margins": 0.4515079855918884, + "rewards/rejected": -0.06472549587488174, + "step": 4945 + }, + { + "epoch": 0.8, + "learning_rate": 6.798708789973527e-06, + "logits/chosen": -1.357330083847046, + "logits/rejected": -1.357330083847046, + "logps/chosen": -52.549591064453125, + "logps/rejected": -52.549591064453125, + "loss": 0.4112, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.161546230316162, + "rewards/margins": 0.0, + "rewards/rejected": 4.161546230316162, + "step": 4946 + }, + { + "epoch": 0.8, + "learning_rate": 6.79748246068744e-06, + "logits/chosen": -1.4560521841049194, + "logits/rejected": -1.4432100057601929, + "logps/chosen": -101.81910705566406, + "logps/rejected": -109.28849792480469, + "loss": 3.3755, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0595650672912598, + "rewards/margins": 1.1756752729415894, + "rewards/rejected": 0.8838897943496704, + "step": 4947 + }, + { + "epoch": 0.8, + "learning_rate": 6.796256007212323e-06, + "logits/chosen": -1.2084717750549316, + "logits/rejected": -1.1868866682052612, + "logps/chosen": -107.36636352539062, + "logps/rejected": -50.1083869934082, + "loss": 0.5675, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7584465742111206, + "rewards/margins": 0.30859339237213135, + "rewards/rejected": 1.4498531818389893, + "step": 4948 + }, + { + "epoch": 0.8, + "learning_rate": 6.795029429632912e-06, + "logits/chosen": -1.1263998746871948, + "logits/rejected": -1.123149037361145, + "logps/chosen": -88.73998260498047, + "logps/rejected": -69.38812255859375, + "loss": 0.281, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.426857829093933, + "rewards/margins": 0.28383636474609375, + "rewards/rejected": 1.1430214643478394, + "step": 4949 + }, + { + "epoch": 0.8, + "learning_rate": 6.7938027280339514e-06, + "logits/chosen": -1.036038875579834, + "logits/rejected": -0.9977337718009949, + "logps/chosen": -65.01066589355469, + "logps/rejected": -63.49121856689453, + "loss": 2.0002, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.897284746170044, + "rewards/margins": -1.2628638744354248, + "rewards/rejected": 4.160148620605469, + "step": 4950 + }, + { + "epoch": 0.8, + "learning_rate": 6.792575902500197e-06, + "logits/chosen": -1.0691365003585815, + "logits/rejected": -1.0883198976516724, + "logps/chosen": -111.37779998779297, + "logps/rejected": -110.78255462646484, + "loss": 1.7396, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9922943115234375, + "rewards/margins": -2.759580373764038, + "rewards/rejected": 3.7518746852874756, + "step": 4951 + }, + { + "epoch": 0.8, + "learning_rate": 6.7913489531164074e-06, + "logits/chosen": -1.0831739902496338, + "logits/rejected": -1.1708556413650513, + "logps/chosen": -210.33558654785156, + "logps/rejected": -100.513671875, + "loss": 1.0946, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.937297344207764, + "rewards/margins": -2.0418930053710938, + "rewards/rejected": 6.979190349578857, + "step": 4952 + }, + { + "epoch": 0.8, + "learning_rate": 6.790121879967357e-06, + "logits/chosen": -1.228732943534851, + "logits/rejected": -1.2431721687316895, + "logps/chosen": -71.78533172607422, + "logps/rejected": -74.82101440429688, + "loss": 1.8728, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6271095275878906, + "rewards/margins": -3.234239101409912, + "rewards/rejected": 4.861348628997803, + "step": 4953 + }, + { + "epoch": 0.8, + "learning_rate": 6.788894683137822e-06, + "logits/chosen": -1.109018087387085, + "logits/rejected": -1.1033204793930054, + "logps/chosen": -125.6920394897461, + "logps/rejected": -141.38262939453125, + "loss": 1.5868, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.113816261291504, + "rewards/margins": -3.1150426864624023, + "rewards/rejected": 8.228858947753906, + "step": 4954 + }, + { + "epoch": 0.8, + "learning_rate": 6.787667362712591e-06, + "logits/chosen": -0.7578954696655273, + "logits/rejected": -0.7334696054458618, + "logps/chosen": -73.54624938964844, + "logps/rejected": -115.93406677246094, + "loss": 0.4547, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1872284412384033, + "rewards/margins": 1.4213058948516846, + "rewards/rejected": 1.7659225463867188, + "step": 4955 + }, + { + "epoch": 0.8, + "learning_rate": 6.786439918776461e-06, + "logits/chosen": -0.996799886226654, + "logits/rejected": -0.9629791975021362, + "logps/chosen": -94.66667175292969, + "logps/rejected": -60.922325134277344, + "loss": 0.2865, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.910597324371338, + "rewards/margins": 1.0537927150726318, + "rewards/rejected": 3.856804609298706, + "step": 4956 + }, + { + "epoch": 0.8, + "learning_rate": 6.785212351414234e-06, + "logits/chosen": -1.1620348691940308, + "logits/rejected": -1.1796716451644897, + "logps/chosen": -47.930564880371094, + "logps/rejected": -41.86296844482422, + "loss": 0.5041, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0130035877227783, + "rewards/margins": -0.5125694274902344, + "rewards/rejected": 2.5255730152130127, + "step": 4957 + }, + { + "epoch": 0.8, + "learning_rate": 6.783984660710727e-06, + "logits/chosen": -0.9428165555000305, + "logits/rejected": -0.9428165555000305, + "logps/chosen": -38.09452438354492, + "logps/rejected": -38.09452438354492, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0911041498184204, + "rewards/margins": 0.0, + "rewards/rejected": 1.0911041498184204, + "step": 4958 + }, + { + "epoch": 0.8, + "learning_rate": 6.782756846750761e-06, + "logits/chosen": -1.1077227592468262, + "logits/rejected": -0.9518154263496399, + "logps/chosen": -62.26023864746094, + "logps/rejected": -12.103846549987793, + "loss": 1.8055, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.779217481613159, + "rewards/margins": 1.617043137550354, + "rewards/rejected": 1.1621743440628052, + "step": 4959 + }, + { + "epoch": 0.81, + "learning_rate": 6.781528909619164e-06, + "logits/chosen": -0.7942684888839722, + "logits/rejected": -0.8123015761375427, + "logps/chosen": -15.15163516998291, + "logps/rejected": -50.06074523925781, + "loss": 0.4629, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5082694292068481, + "rewards/margins": -0.3493347764015198, + "rewards/rejected": 0.8576042056083679, + "step": 4960 + }, + { + "epoch": 0.81, + "learning_rate": 6.780300849400777e-06, + "logits/chosen": -1.3614753484725952, + "logits/rejected": -1.3528156280517578, + "logps/chosen": -58.23626708984375, + "logps/rejected": -53.950164794921875, + "loss": 0.507, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7482573986053467, + "rewards/margins": 0.33201122283935547, + "rewards/rejected": 2.416246175765991, + "step": 4961 + }, + { + "epoch": 0.81, + "learning_rate": 6.779072666180447e-06, + "logits/chosen": -1.0633437633514404, + "logits/rejected": -1.1306506395339966, + "logps/chosen": -179.9141387939453, + "logps/rejected": -122.19607543945312, + "loss": 0.7388, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.688188076019287, + "rewards/margins": -1.1994004249572754, + "rewards/rejected": 6.8875885009765625, + "step": 4962 + }, + { + "epoch": 0.81, + "learning_rate": 6.777844360043028e-06, + "logits/chosen": -1.4103513956069946, + "logits/rejected": -1.3470826148986816, + "logps/chosen": -55.743309020996094, + "logps/rejected": -55.286651611328125, + "loss": 1.2822, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3010857105255127, + "rewards/margins": -0.48873281478881836, + "rewards/rejected": 2.789818525314331, + "step": 4963 + }, + { + "epoch": 0.81, + "learning_rate": 6.776615931073387e-06, + "logits/chosen": -1.2834056615829468, + "logits/rejected": -1.310470461845398, + "logps/chosen": -129.82772827148438, + "logps/rejected": -83.58857727050781, + "loss": 0.7077, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8375473022460938, + "rewards/margins": -1.118417501449585, + "rewards/rejected": 2.9559648036956787, + "step": 4964 + }, + { + "epoch": 0.81, + "learning_rate": 6.7753873793563955e-06, + "logits/chosen": -0.6118754148483276, + "logits/rejected": -0.5798782110214233, + "logps/chosen": -45.177669525146484, + "logps/rejected": -21.839679718017578, + "loss": 0.5016, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5277019739151001, + "rewards/margins": -0.5281791687011719, + "rewards/rejected": 1.055881142616272, + "step": 4965 + }, + { + "epoch": 0.81, + "learning_rate": 6.774158704976934e-06, + "logits/chosen": -1.0887811183929443, + "logits/rejected": -0.9636101126670837, + "logps/chosen": -39.27801513671875, + "logps/rejected": -32.97494125366211, + "loss": 1.6982, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5852882862091064, + "rewards/margins": -1.9178836345672607, + "rewards/rejected": 3.503171920776367, + "step": 4966 + }, + { + "epoch": 0.81, + "learning_rate": 6.772929908019894e-06, + "logits/chosen": -1.28227698802948, + "logits/rejected": -1.2176251411437988, + "logps/chosen": -38.77734375, + "logps/rejected": -41.82489776611328, + "loss": 0.175, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.910923719406128, + "rewards/margins": 0.9126384258270264, + "rewards/rejected": 1.9982852935791016, + "step": 4967 + }, + { + "epoch": 0.81, + "learning_rate": 6.771700988570173e-06, + "logits/chosen": -1.0560054779052734, + "logits/rejected": -1.007077693939209, + "logps/chosen": -88.06230163574219, + "logps/rejected": -43.87494659423828, + "loss": 1.3851, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1910629272460938, + "rewards/margins": -0.7942272424697876, + "rewards/rejected": 1.9852901697158813, + "step": 4968 + }, + { + "epoch": 0.81, + "learning_rate": 6.770471946712679e-06, + "logits/chosen": -1.1351749897003174, + "logits/rejected": -1.2844634056091309, + "logps/chosen": -54.06939697265625, + "logps/rejected": -105.94548034667969, + "loss": 1.9764, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2608819007873535, + "rewards/margins": -2.3695807456970215, + "rewards/rejected": 5.630462646484375, + "step": 4969 + }, + { + "epoch": 0.81, + "learning_rate": 6.769242782532324e-06, + "logits/chosen": -0.8890487551689148, + "logits/rejected": -0.9329925179481506, + "logps/chosen": -45.79059600830078, + "logps/rejected": -63.05708312988281, + "loss": 0.355, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.141304850578308, + "rewards/margins": 0.042612552642822266, + "rewards/rejected": 1.0986922979354858, + "step": 4970 + }, + { + "epoch": 0.81, + "learning_rate": 6.7680134961140344e-06, + "logits/chosen": -1.225804090499878, + "logits/rejected": -1.0967131853103638, + "logps/chosen": -48.50334930419922, + "logps/rejected": -21.57470703125, + "loss": 0.2601, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9253273010253906, + "rewards/margins": 2.3258094787597656, + "rewards/rejected": 0.599517822265625, + "step": 4971 + }, + { + "epoch": 0.81, + "learning_rate": 6.766784087542741e-06, + "logits/chosen": -0.7988706231117249, + "logits/rejected": -0.8459376096725464, + "logps/chosen": -77.75064849853516, + "logps/rejected": -42.550750732421875, + "loss": 0.4994, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7168601751327515, + "rewards/margins": -0.5274261236190796, + "rewards/rejected": 2.244286298751831, + "step": 4972 + }, + { + "epoch": 0.81, + "learning_rate": 6.765554556903384e-06, + "logits/chosen": -1.0922322273254395, + "logits/rejected": -1.0588778257369995, + "logps/chosen": -62.914276123046875, + "logps/rejected": -79.64805603027344, + "loss": 1.2648, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6927292346954346, + "rewards/margins": -0.25467681884765625, + "rewards/rejected": 1.9474060535430908, + "step": 4973 + }, + { + "epoch": 0.81, + "learning_rate": 6.7643249042809146e-06, + "logits/chosen": -0.9664924144744873, + "logits/rejected": -0.9552120566368103, + "logps/chosen": -94.51359558105469, + "logps/rejected": -73.52603149414062, + "loss": 1.4662, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4442994594573975, + "rewards/margins": 0.49020397663116455, + "rewards/rejected": 1.954095482826233, + "step": 4974 + }, + { + "epoch": 0.81, + "learning_rate": 6.7630951297602876e-06, + "logits/chosen": -1.0022777318954468, + "logits/rejected": -0.9230768084526062, + "logps/chosen": -39.45115280151367, + "logps/rejected": -34.89769744873047, + "loss": 0.7393, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.374319076538086, + "rewards/margins": 0.4744521975517273, + "rewards/rejected": 0.8998668789863586, + "step": 4975 + }, + { + "epoch": 0.81, + "learning_rate": 6.76186523342647e-06, + "logits/chosen": -1.609938144683838, + "logits/rejected": -1.5629394054412842, + "logps/chosen": -105.08726501464844, + "logps/rejected": -59.98310089111328, + "loss": 0.3127, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7689026594161987, + "rewards/margins": 0.4573814868927002, + "rewards/rejected": 1.3115211725234985, + "step": 4976 + }, + { + "epoch": 0.81, + "learning_rate": 6.760635215364435e-06, + "logits/chosen": -1.0376375913619995, + "logits/rejected": -1.0296460390090942, + "logps/chosen": -73.6456298828125, + "logps/rejected": -94.10236358642578, + "loss": 0.9264, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7327919006347656, + "rewards/margins": -1.6481873989105225, + "rewards/rejected": 3.380979299545288, + "step": 4977 + }, + { + "epoch": 0.81, + "learning_rate": 6.759405075659165e-06, + "logits/chosen": -1.0459928512573242, + "logits/rejected": -0.8489521145820618, + "logps/chosen": -119.1253662109375, + "logps/rejected": -50.01908874511719, + "loss": 0.0167, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.068060398101807, + "rewards/margins": 3.638136386871338, + "rewards/rejected": 1.4299240112304688, + "step": 4978 + }, + { + "epoch": 0.81, + "learning_rate": 6.758174814395654e-06, + "logits/chosen": -1.287650465965271, + "logits/rejected": -1.2236969470977783, + "logps/chosen": -72.2283935546875, + "logps/rejected": -60.940582275390625, + "loss": 1.0475, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1513237953186035, + "rewards/margins": -0.3968849182128906, + "rewards/rejected": 3.548208713531494, + "step": 4979 + }, + { + "epoch": 0.81, + "learning_rate": 6.756944431658898e-06, + "logits/chosen": -1.1042348146438599, + "logits/rejected": -0.6857379078865051, + "logps/chosen": -171.81788635253906, + "logps/rejected": -31.02498435974121, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.973167419433594, + "rewards/margins": 5.83644962310791, + "rewards/rejected": 0.1367177963256836, + "step": 4980 + }, + { + "epoch": 0.81, + "learning_rate": 6.755713927533907e-06, + "logits/chosen": -1.1202136278152466, + "logits/rejected": -0.9535699486732483, + "logps/chosen": -171.05087280273438, + "logps/rejected": -84.88970184326172, + "loss": 0.0217, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.967968940734863, + "rewards/margins": 3.1709420680999756, + "rewards/rejected": 3.7970268726348877, + "step": 4981 + }, + { + "epoch": 0.81, + "learning_rate": 6.754483302105696e-06, + "logits/chosen": -0.9602042436599731, + "logits/rejected": -0.9346749186515808, + "logps/chosen": -39.82770538330078, + "logps/rejected": -29.33127212524414, + "loss": 0.8079, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7949379682540894, + "rewards/margins": -1.3848294019699097, + "rewards/rejected": 3.179767370223999, + "step": 4982 + }, + { + "epoch": 0.81, + "learning_rate": 6.75325255545929e-06, + "logits/chosen": -0.8479912877082825, + "logits/rejected": -0.663519024848938, + "logps/chosen": -50.29942321777344, + "logps/rejected": -119.96975708007812, + "loss": 2.6969, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7634963989257812, + "rewards/margins": -1.6002883911132812, + "rewards/rejected": 3.3637847900390625, + "step": 4983 + }, + { + "epoch": 0.81, + "learning_rate": 6.752021687679721e-06, + "logits/chosen": -1.1255837678909302, + "logits/rejected": -1.0912588834762573, + "logps/chosen": -77.02434539794922, + "logps/rejected": -54.56014633178711, + "loss": 0.1565, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.036550283432007, + "rewards/margins": 1.4287457466125488, + "rewards/rejected": 0.6078044772148132, + "step": 4984 + }, + { + "epoch": 0.81, + "learning_rate": 6.750790698852032e-06, + "logits/chosen": -1.0559468269348145, + "logits/rejected": -1.1740788221359253, + "logps/chosen": -98.72451782226562, + "logps/rejected": -95.19145965576172, + "loss": 1.4675, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.296945333480835, + "rewards/margins": -0.30957698822021484, + "rewards/rejected": 3.60652232170105, + "step": 4985 + }, + { + "epoch": 0.81, + "learning_rate": 6.749559589061273e-06, + "logits/chosen": -1.0463498830795288, + "logits/rejected": -1.05637788772583, + "logps/chosen": -38.40484619140625, + "logps/rejected": -6.063224792480469, + "loss": 0.444, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8072406649589539, + "rewards/margins": 0.4636460244655609, + "rewards/rejected": 0.34359464049339294, + "step": 4986 + }, + { + "epoch": 0.81, + "learning_rate": 6.7483283583925e-06, + "logits/chosen": -0.8762699961662292, + "logits/rejected": -0.8735979199409485, + "logps/chosen": -2.616109848022461, + "logps/rejected": -1.5059739351272583, + "loss": 0.7043, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.28759661316871643, + "rewards/margins": -0.04423028230667114, + "rewards/rejected": 0.3318268954753876, + "step": 4987 + }, + { + "epoch": 0.81, + "learning_rate": 6.74709700693078e-06, + "logits/chosen": -0.7647772431373596, + "logits/rejected": -0.7647772431373596, + "logps/chosen": -19.186790466308594, + "logps/rejected": -19.186790466308594, + "loss": 1.2325, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.789324939250946, + "rewards/margins": 0.0, + "rewards/rejected": 0.789324939250946, + "step": 4988 + }, + { + "epoch": 0.81, + "learning_rate": 6.745865534761188e-06, + "logits/chosen": -0.9854910969734192, + "logits/rejected": -0.9398810267448425, + "logps/chosen": -116.08779907226562, + "logps/rejected": -48.53505325317383, + "loss": 0.6492, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2659454345703125, + "rewards/margins": 0.39261394739151, + "rewards/rejected": 0.8733314871788025, + "step": 4989 + }, + { + "epoch": 0.81, + "learning_rate": 6.744633941968806e-06, + "logits/chosen": -1.0078644752502441, + "logits/rejected": -0.9004764556884766, + "logps/chosen": -128.49514770507812, + "logps/rejected": -92.93600463867188, + "loss": 1.5364, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.594494819641113, + "rewards/margins": 2.340001106262207, + "rewards/rejected": 2.2544937133789062, + "step": 4990 + }, + { + "epoch": 0.81, + "learning_rate": 6.743402228638727e-06, + "logits/chosen": -1.1729800701141357, + "logits/rejected": -1.1945936679840088, + "logps/chosen": -49.74363327026367, + "logps/rejected": -111.83218383789062, + "loss": 1.6622, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7517056465148926, + "rewards/margins": -3.280179023742676, + "rewards/rejected": 6.031884670257568, + "step": 4991 + }, + { + "epoch": 0.81, + "learning_rate": 6.742170394856052e-06, + "logits/chosen": -1.0541448593139648, + "logits/rejected": -1.0034395456314087, + "logps/chosen": -96.53616333007812, + "logps/rejected": -143.9977569580078, + "loss": 0.7436, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.997500896453857, + "rewards/margins": -0.6502513885498047, + "rewards/rejected": 7.647752285003662, + "step": 4992 + }, + { + "epoch": 0.81, + "learning_rate": 6.740938440705884e-06, + "logits/chosen": -0.9784784913063049, + "logits/rejected": -0.9234855771064758, + "logps/chosen": -27.197771072387695, + "logps/rejected": -8.26873779296875, + "loss": 0.0433, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4973182678222656, + "rewards/margins": 2.55195951461792, + "rewards/rejected": 0.9453588724136353, + "step": 4993 + }, + { + "epoch": 0.81, + "learning_rate": 6.739706366273346e-06, + "logits/chosen": -1.2755780220031738, + "logits/rejected": -1.1712461709976196, + "logps/chosen": -62.60918426513672, + "logps/rejected": -58.057003021240234, + "loss": 0.1072, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.075660228729248, + "rewards/margins": 1.4853460788726807, + "rewards/rejected": 2.5903141498565674, + "step": 4994 + }, + { + "epoch": 0.81, + "learning_rate": 6.738474171643557e-06, + "logits/chosen": -1.1196352243423462, + "logits/rejected": -1.141356110572815, + "logps/chosen": -77.91777801513672, + "logps/rejected": -67.28076171875, + "loss": 0.555, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9897682070732117, + "rewards/margins": -0.5665306448936462, + "rewards/rejected": 1.556298851966858, + "step": 4995 + }, + { + "epoch": 0.81, + "learning_rate": 6.737241856901653e-06, + "logits/chosen": -1.1263999938964844, + "logits/rejected": -1.0431121587753296, + "logps/chosen": -77.2944564819336, + "logps/rejected": -45.526397705078125, + "loss": 2.2597, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1846771240234375, + "rewards/margins": -0.8216753005981445, + "rewards/rejected": 4.006352424621582, + "step": 4996 + }, + { + "epoch": 0.81, + "learning_rate": 6.736009422132775e-06, + "logits/chosen": -1.1048465967178345, + "logits/rejected": -0.9904902577400208, + "logps/chosen": -32.986976623535156, + "logps/rejected": -11.08798885345459, + "loss": 0.1147, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2804932594299316, + "rewards/margins": 1.8246747255325317, + "rewards/rejected": 0.4558185636997223, + "step": 4997 + }, + { + "epoch": 0.81, + "learning_rate": 6.734776867422073e-06, + "logits/chosen": -1.370636224746704, + "logits/rejected": -1.412085771560669, + "logps/chosen": -57.06492614746094, + "logps/rejected": -62.02455139160156, + "loss": 0.9545, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5900466442108154, + "rewards/margins": -1.6033504009246826, + "rewards/rejected": 4.193397045135498, + "step": 4998 + }, + { + "epoch": 0.81, + "learning_rate": 6.733544192854703e-06, + "logits/chosen": -1.318941593170166, + "logits/rejected": -1.3193820714950562, + "logps/chosen": -215.1608428955078, + "logps/rejected": -121.31455993652344, + "loss": 1.0998, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.371098279953003, + "rewards/margins": -2.0266754627227783, + "rewards/rejected": 5.397773742675781, + "step": 4999 + }, + { + "epoch": 0.81, + "learning_rate": 6.732311398515832e-06, + "logits/chosen": -1.2839494943618774, + "logits/rejected": -1.3948649168014526, + "logps/chosen": -47.831607818603516, + "logps/rejected": -94.48649597167969, + "loss": 2.2567, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.198942184448242, + "rewards/margins": -4.460904598236084, + "rewards/rejected": 6.659846782684326, + "step": 5000 + }, + { + "epoch": 0.81, + "learning_rate": 6.7310784844906355e-06, + "logits/chosen": -1.2958617210388184, + "logits/rejected": -1.1365926265716553, + "logps/chosen": -178.22872924804688, + "logps/rejected": -47.0373649597168, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.610342502593994, + "rewards/margins": 5.244643211364746, + "rewards/rejected": 1.3656994104385376, + "step": 5001 + }, + { + "epoch": 0.81, + "learning_rate": 6.7298454508642945e-06, + "logits/chosen": -1.0331202745437622, + "logits/rejected": -1.0331202745437622, + "logps/chosen": -86.35014343261719, + "logps/rejected": -86.35014343261719, + "loss": 0.3479, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.03915548324585, + "rewards/margins": 0.0, + "rewards/rejected": 4.03915548324585, + "step": 5002 + }, + { + "epoch": 0.81, + "learning_rate": 6.728612297722001e-06, + "logits/chosen": -1.2482048273086548, + "logits/rejected": -1.1385911703109741, + "logps/chosen": -58.67209243774414, + "logps/rejected": -88.06211853027344, + "loss": 0.1126, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.4595627784729, + "rewards/margins": 1.706852912902832, + "rewards/rejected": 4.752709865570068, + "step": 5003 + }, + { + "epoch": 0.81, + "learning_rate": 6.727379025148955e-06, + "logits/chosen": -1.0133932828903198, + "logits/rejected": -0.9317609667778015, + "logps/chosen": -62.29267883300781, + "logps/rejected": -43.18686294555664, + "loss": 1.5338, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2168091535568237, + "rewards/margins": 0.5630989670753479, + "rewards/rejected": 0.6537101864814758, + "step": 5004 + }, + { + "epoch": 0.81, + "learning_rate": 6.726145633230361e-06, + "logits/chosen": -1.0026581287384033, + "logits/rejected": -1.0303453207015991, + "logps/chosen": -103.01727294921875, + "logps/rejected": -90.08716583251953, + "loss": 0.3806, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5511536598205566, + "rewards/margins": -0.13052821159362793, + "rewards/rejected": 2.6816818714141846, + "step": 5005 + }, + { + "epoch": 0.81, + "learning_rate": 6.724912122051439e-06, + "logits/chosen": -1.0109753608703613, + "logits/rejected": -1.1078640222549438, + "logps/chosen": -119.80696105957031, + "logps/rejected": -97.61894226074219, + "loss": 1.6328, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.322821021080017, + "rewards/margins": -2.986100673675537, + "rewards/rejected": 4.308921813964844, + "step": 5006 + }, + { + "epoch": 0.81, + "learning_rate": 6.723678491697409e-06, + "logits/chosen": -1.4059910774230957, + "logits/rejected": -1.2185168266296387, + "logps/chosen": -79.39635467529297, + "logps/rejected": -57.59516906738281, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.063486576080322, + "rewards/margins": 4.636332035064697, + "rewards/rejected": 2.427154541015625, + "step": 5007 + }, + { + "epoch": 0.81, + "learning_rate": 6.722444742253505e-06, + "logits/chosen": -1.1603834629058838, + "logits/rejected": -1.2189979553222656, + "logps/chosen": -93.94790649414062, + "logps/rejected": -84.23733520507812, + "loss": 0.975, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.538751125335693, + "rewards/margins": -1.7915453910827637, + "rewards/rejected": 6.330296516418457, + "step": 5008 + }, + { + "epoch": 0.81, + "learning_rate": 6.721210873804968e-06, + "logits/chosen": -0.7880399227142334, + "logits/rejected": -0.7880399227142334, + "logps/chosen": -85.47156524658203, + "logps/rejected": -85.47156524658203, + "loss": 0.3571, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.244434356689453, + "rewards/margins": 0.0, + "rewards/rejected": 2.244434356689453, + "step": 5009 + }, + { + "epoch": 0.81, + "learning_rate": 6.7199768864370455e-06, + "logits/chosen": -0.8623160123825073, + "logits/rejected": -0.8986905813217163, + "logps/chosen": -85.02067565917969, + "logps/rejected": -57.81801986694336, + "loss": 0.62, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5312378406524658, + "rewards/margins": 0.43268096446990967, + "rewards/rejected": 1.0985568761825562, + "step": 5010 + }, + { + "epoch": 0.81, + "learning_rate": 6.718742780234994e-06, + "logits/chosen": -1.2747260332107544, + "logits/rejected": -1.0871037244796753, + "logps/chosen": -101.00627136230469, + "logps/rejected": -17.095226287841797, + "loss": 0.0444, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.320930480957031, + "rewards/margins": 3.4716241359710693, + "rewards/rejected": 0.8493062853813171, + "step": 5011 + }, + { + "epoch": 0.81, + "learning_rate": 6.71750855528408e-06, + "logits/chosen": -1.0065906047821045, + "logits/rejected": -1.0835283994674683, + "logps/chosen": -14.567609786987305, + "logps/rejected": -26.8961181640625, + "loss": 0.7047, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7242504358291626, + "rewards/margins": -0.6669429540634155, + "rewards/rejected": 2.391193389892578, + "step": 5012 + }, + { + "epoch": 0.81, + "learning_rate": 6.7162742116695754e-06, + "logits/chosen": -1.5069583654403687, + "logits/rejected": -1.5422735214233398, + "logps/chosen": -72.24002075195312, + "logps/rejected": -65.20600128173828, + "loss": 1.4684, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.880047798156738, + "rewards/margins": -2.8017234802246094, + "rewards/rejected": 7.681771278381348, + "step": 5013 + }, + { + "epoch": 0.81, + "learning_rate": 6.715039749476764e-06, + "logits/chosen": -1.1978904008865356, + "logits/rejected": -1.192513346672058, + "logps/chosen": -69.02781677246094, + "logps/rejected": -119.07318115234375, + "loss": 1.3763, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0038163661956787, + "rewards/margins": 0.7189210653305054, + "rewards/rejected": 1.2848953008651733, + "step": 5014 + }, + { + "epoch": 0.81, + "learning_rate": 6.713805168790932e-06, + "logits/chosen": -1.1095361709594727, + "logits/rejected": -0.9660298824310303, + "logps/chosen": -91.41661834716797, + "logps/rejected": -54.38043975830078, + "loss": 1.3296, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.276022434234619, + "rewards/margins": 1.4283058643341064, + "rewards/rejected": 1.8477165699005127, + "step": 5015 + }, + { + "epoch": 0.81, + "learning_rate": 6.71257046969738e-06, + "logits/chosen": -0.9497962594032288, + "logits/rejected": -0.8984687328338623, + "logps/chosen": -61.81357192993164, + "logps/rejected": -59.18348693847656, + "loss": 0.5598, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7792720794677734, + "rewards/margins": 0.498638391494751, + "rewards/rejected": 2.2806336879730225, + "step": 5016 + }, + { + "epoch": 0.81, + "learning_rate": 6.711335652281412e-06, + "logits/chosen": -1.1142507791519165, + "logits/rejected": -1.0462538003921509, + "logps/chosen": -62.80448913574219, + "logps/rejected": -64.53912353515625, + "loss": 0.1292, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.617682695388794, + "rewards/margins": 1.2472221851348877, + "rewards/rejected": 1.3704605102539062, + "step": 5017 + }, + { + "epoch": 0.81, + "learning_rate": 6.710100716628345e-06, + "logits/chosen": -0.8375411629676819, + "logits/rejected": -0.8952202200889587, + "logps/chosen": -60.02274703979492, + "logps/rejected": -50.71547317504883, + "loss": 1.0855, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7109638452529907, + "rewards/margins": -1.7655616998672485, + "rewards/rejected": 2.4765255451202393, + "step": 5018 + }, + { + "epoch": 0.81, + "learning_rate": 6.7088656628234986e-06, + "logits/chosen": -0.7826814651489258, + "logits/rejected": -0.7719898223876953, + "logps/chosen": -110.79547119140625, + "logps/rejected": -38.684669494628906, + "loss": 0.437, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9375694394111633, + "rewards/margins": -0.31373101472854614, + "rewards/rejected": 1.2513004541397095, + "step": 5019 + }, + { + "epoch": 0.81, + "learning_rate": 6.707630490952204e-06, + "logits/chosen": -0.8209927678108215, + "logits/rejected": -0.8209927678108215, + "logps/chosen": -3.430610179901123, + "logps/rejected": -3.430610179901123, + "loss": 0.3576, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5284386277198792, + "rewards/margins": 0.0, + "rewards/rejected": 0.5284386277198792, + "step": 5020 + }, + { + "epoch": 0.81, + "learning_rate": 6.7063952010998005e-06, + "logits/chosen": -1.1688956022262573, + "logits/rejected": -1.1291755437850952, + "logps/chosen": -86.32017517089844, + "logps/rejected": -93.93360900878906, + "loss": 0.8368, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4366211891174316, + "rewards/margins": 0.9315719604492188, + "rewards/rejected": 2.505049228668213, + "step": 5021 + }, + { + "epoch": 0.82, + "learning_rate": 6.705159793351635e-06, + "logits/chosen": -1.146412968635559, + "logits/rejected": -1.2945019006729126, + "logps/chosen": -111.8508529663086, + "logps/rejected": -173.7969970703125, + "loss": 1.1454, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.527904510498047, + "rewards/margins": -2.1568126678466797, + "rewards/rejected": 8.684717178344727, + "step": 5022 + }, + { + "epoch": 0.82, + "learning_rate": 6.703924267793061e-06, + "logits/chosen": -0.8236624002456665, + "logits/rejected": -0.7898384928703308, + "logps/chosen": -33.892478942871094, + "logps/rejected": -3.992488384246826, + "loss": 0.572, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2588249444961548, + "rewards/margins": 0.5627759695053101, + "rewards/rejected": 0.6960489749908447, + "step": 5023 + }, + { + "epoch": 0.82, + "learning_rate": 6.702688624509443e-06, + "logits/chosen": -1.0374791622161865, + "logits/rejected": -0.8972786664962769, + "logps/chosen": -74.07303619384766, + "logps/rejected": -55.08208084106445, + "loss": 1.1152, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.481492757797241, + "rewards/margins": 1.3549230098724365, + "rewards/rejected": 2.1265697479248047, + "step": 5024 + }, + { + "epoch": 0.82, + "learning_rate": 6.7014528635861535e-06, + "logits/chosen": -0.8807052969932556, + "logits/rejected": -0.8777272701263428, + "logps/chosen": -4.01859188079834, + "logps/rejected": -2.746303081512451, + "loss": 0.4318, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3611655831336975, + "rewards/margins": 0.01161918044090271, + "rewards/rejected": 0.3495464026927948, + "step": 5025 + }, + { + "epoch": 0.82, + "learning_rate": 6.700216985108568e-06, + "logits/chosen": -1.3002125024795532, + "logits/rejected": -1.2555203437805176, + "logps/chosen": -100.41091918945312, + "logps/rejected": -56.01551055908203, + "loss": 0.758, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7018051147460938, + "rewards/margins": -0.27381443977355957, + "rewards/rejected": 1.9756195545196533, + "step": 5026 + }, + { + "epoch": 0.82, + "learning_rate": 6.698980989162078e-06, + "logits/chosen": -0.9902554154396057, + "logits/rejected": -0.974876880645752, + "logps/chosen": -24.17645835876465, + "logps/rejected": -2.8321080207824707, + "loss": 0.2241, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9482412338256836, + "rewards/margins": 0.6343066692352295, + "rewards/rejected": 0.3139345347881317, + "step": 5027 + }, + { + "epoch": 0.82, + "learning_rate": 6.697744875832078e-06, + "logits/chosen": -0.6865078210830688, + "logits/rejected": -0.6853002905845642, + "logps/chosen": -8.548097610473633, + "logps/rejected": -3.008427143096924, + "loss": 0.6451, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36680155992507935, + "rewards/margins": 0.04851579666137695, + "rewards/rejected": 0.3182857632637024, + "step": 5028 + }, + { + "epoch": 0.82, + "learning_rate": 6.696508645203971e-06, + "logits/chosen": -1.1947355270385742, + "logits/rejected": -1.125654935836792, + "logps/chosen": -76.99497985839844, + "logps/rejected": -22.325027465820312, + "loss": 0.3573, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.131115674972534, + "rewards/margins": 1.60145103931427, + "rewards/rejected": 0.5296646356582642, + "step": 5029 + }, + { + "epoch": 0.82, + "learning_rate": 6.695272297363169e-06, + "logits/chosen": -1.2204550504684448, + "logits/rejected": -1.2146358489990234, + "logps/chosen": -61.326690673828125, + "logps/rejected": -44.81753158569336, + "loss": 1.0634, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.282208204269409, + "rewards/margins": 0.8143725395202637, + "rewards/rejected": 2.4678356647491455, + "step": 5030 + }, + { + "epoch": 0.82, + "learning_rate": 6.694035832395092e-06, + "logits/chosen": -1.2451534271240234, + "logits/rejected": -1.1913855075836182, + "logps/chosen": -89.6358413696289, + "logps/rejected": -62.835914611816406, + "loss": 0.8308, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2055230140686035, + "rewards/margins": 0.03534865379333496, + "rewards/rejected": 2.1701743602752686, + "step": 5031 + }, + { + "epoch": 0.82, + "learning_rate": 6.692799250385168e-06, + "logits/chosen": -0.8030537962913513, + "logits/rejected": -0.8051007986068726, + "logps/chosen": -2.918884754180908, + "logps/rejected": -3.33084774017334, + "loss": 0.3491, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.26154038310050964, + "rewards/margins": -0.0032524466514587402, + "rewards/rejected": 0.2647928297519684, + "step": 5032 + }, + { + "epoch": 0.82, + "learning_rate": 6.691562551418833e-06, + "logits/chosen": -1.1931270360946655, + "logits/rejected": -1.0096383094787598, + "logps/chosen": -102.6098861694336, + "logps/rejected": -46.53276824951172, + "loss": 0.0176, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.7187981605529785, + "rewards/margins": 3.3555397987365723, + "rewards/rejected": 2.3632583618164062, + "step": 5033 + }, + { + "epoch": 0.82, + "learning_rate": 6.690325735581532e-06, + "logits/chosen": -1.2167421579360962, + "logits/rejected": -1.1303447484970093, + "logps/chosen": -160.72695922851562, + "logps/rejected": -17.263662338256836, + "loss": 1.611, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5487334728240967, + "rewards/margins": 2.3994429111480713, + "rewards/rejected": 0.14929066598415375, + "step": 5034 + }, + { + "epoch": 0.82, + "learning_rate": 6.689088802958717e-06, + "logits/chosen": -1.3729262351989746, + "logits/rejected": -1.262200117111206, + "logps/chosen": -59.90333557128906, + "logps/rejected": -42.912445068359375, + "loss": 0.1464, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5644021034240723, + "rewards/margins": 2.186448335647583, + "rewards/rejected": 1.3779537677764893, + "step": 5035 + }, + { + "epoch": 0.82, + "learning_rate": 6.687851753635847e-06, + "logits/chosen": -1.0285253524780273, + "logits/rejected": -1.0285253524780273, + "logps/chosen": -38.99751281738281, + "logps/rejected": -38.99751281738281, + "loss": 0.4704, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7313125133514404, + "rewards/margins": 0.0, + "rewards/rejected": 2.7313125133514404, + "step": 5036 + }, + { + "epoch": 0.82, + "learning_rate": 6.686614587698392e-06, + "logits/chosen": -0.9350702166557312, + "logits/rejected": -0.9206494092941284, + "logps/chosen": -85.72706604003906, + "logps/rejected": -59.538665771484375, + "loss": 0.2502, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.414510488510132, + "rewards/margins": 0.4766411781311035, + "rewards/rejected": 1.9378693103790283, + "step": 5037 + }, + { + "epoch": 0.82, + "learning_rate": 6.6853773052318275e-06, + "logits/chosen": -1.3610692024230957, + "logits/rejected": -1.4282948970794678, + "logps/chosen": -148.84530639648438, + "logps/rejected": -148.42922973632812, + "loss": 2.7686, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.963932991027832, + "rewards/margins": -5.502557754516602, + "rewards/rejected": 10.466490745544434, + "step": 5038 + }, + { + "epoch": 0.82, + "learning_rate": 6.684139906321639e-06, + "logits/chosen": -1.311448574066162, + "logits/rejected": -1.1745631694793701, + "logps/chosen": -134.42385864257812, + "logps/rejected": -172.91407775878906, + "loss": 0.3468, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.693185329437256, + "rewards/margins": 0.02237701416015625, + "rewards/rejected": 6.6708083152771, + "step": 5039 + }, + { + "epoch": 0.82, + "learning_rate": 6.682902391053319e-06, + "logits/chosen": -1.0286948680877686, + "logits/rejected": -1.0294636487960815, + "logps/chosen": -46.74775695800781, + "logps/rejected": -31.00461196899414, + "loss": 0.6346, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.214410424232483, + "rewards/margins": -0.656592607498169, + "rewards/rejected": 1.8710030317306519, + "step": 5040 + }, + { + "epoch": 0.82, + "learning_rate": 6.6816647595123665e-06, + "logits/chosen": -1.2662371397018433, + "logits/rejected": -1.140568494796753, + "logps/chosen": -87.36282348632812, + "logps/rejected": -20.99285888671875, + "loss": 0.3002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8077399730682373, + "rewards/margins": 2.1195104122161865, + "rewards/rejected": 0.6882295608520508, + "step": 5041 + }, + { + "epoch": 0.82, + "learning_rate": 6.680427011784292e-06, + "logits/chosen": -1.22946035861969, + "logits/rejected": -1.2066650390625, + "logps/chosen": -65.799072265625, + "logps/rejected": -85.61917877197266, + "loss": 1.5405, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4396302700042725, + "rewards/margins": -0.23115825653076172, + "rewards/rejected": 2.670788526535034, + "step": 5042 + }, + { + "epoch": 0.82, + "learning_rate": 6.6791891479546104e-06, + "logits/chosen": -1.0105377435684204, + "logits/rejected": -0.9108315110206604, + "logps/chosen": -135.15399169921875, + "logps/rejected": -66.95640563964844, + "loss": 0.1378, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.555070400238037, + "rewards/margins": 1.1586875915527344, + "rewards/rejected": 4.396382808685303, + "step": 5043 + }, + { + "epoch": 0.82, + "learning_rate": 6.677951168108847e-06, + "logits/chosen": -0.9876807332038879, + "logits/rejected": -1.0030021667480469, + "logps/chosen": -54.99904251098633, + "logps/rejected": -85.98628997802734, + "loss": 0.9679, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6622936725616455, + "rewards/margins": -0.6785411834716797, + "rewards/rejected": 2.340834856033325, + "step": 5044 + }, + { + "epoch": 0.82, + "learning_rate": 6.676713072332536e-06, + "logits/chosen": -1.0392074584960938, + "logits/rejected": -0.84525066614151, + "logps/chosen": -50.4058723449707, + "logps/rejected": -68.27983093261719, + "loss": 0.6036, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4192707538604736, + "rewards/margins": 3.0975072383880615, + "rewards/rejected": -0.6782364249229431, + "step": 5045 + }, + { + "epoch": 0.82, + "learning_rate": 6.675474860711216e-06, + "logits/chosen": -0.8663004040718079, + "logits/rejected": -0.9174185395240784, + "logps/chosen": -61.92662811279297, + "logps/rejected": -50.99078369140625, + "loss": 0.7053, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1949455738067627, + "rewards/margins": 0.4663475751876831, + "rewards/rejected": 1.7285979986190796, + "step": 5046 + }, + { + "epoch": 0.82, + "learning_rate": 6.674236533330437e-06, + "logits/chosen": -1.3598196506500244, + "logits/rejected": -1.3684237003326416, + "logps/chosen": -125.38379669189453, + "logps/rejected": -69.03214263916016, + "loss": 0.4155, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.216062307357788, + "rewards/margins": -0.09011530876159668, + "rewards/rejected": 2.3061776161193848, + "step": 5047 + }, + { + "epoch": 0.82, + "learning_rate": 6.672998090275755e-06, + "logits/chosen": -0.9350548982620239, + "logits/rejected": -0.6520636677742004, + "logps/chosen": -84.98908233642578, + "logps/rejected": -25.21683692932129, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.576998233795166, + "rewards/margins": 4.567016124725342, + "rewards/rejected": 0.009982109069824219, + "step": 5048 + }, + { + "epoch": 0.82, + "learning_rate": 6.671759531632735e-06, + "logits/chosen": -1.019951343536377, + "logits/rejected": -0.9360026717185974, + "logps/chosen": -37.09653091430664, + "logps/rejected": -19.24200439453125, + "loss": 0.3784, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1598896980285645, + "rewards/margins": 0.18438923358917236, + "rewards/rejected": 1.975500464439392, + "step": 5049 + }, + { + "epoch": 0.82, + "learning_rate": 6.6705208574869504e-06, + "logits/chosen": -0.9773738980293274, + "logits/rejected": -0.896156907081604, + "logps/chosen": -61.490257263183594, + "logps/rejected": -43.27934265136719, + "loss": 0.815, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.9449989795684814, + "rewards/margins": -0.01579737663269043, + "rewards/rejected": 3.960796356201172, + "step": 5050 + }, + { + "epoch": 0.82, + "learning_rate": 6.669282067923981e-06, + "logits/chosen": -1.3977102041244507, + "logits/rejected": -1.4638234376907349, + "logps/chosen": -275.0623779296875, + "logps/rejected": -210.61871337890625, + "loss": 1.836, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.755649089813232, + "rewards/margins": -3.6216464042663574, + "rewards/rejected": 10.37729549407959, + "step": 5051 + }, + { + "epoch": 0.82, + "learning_rate": 6.668043163029415e-06, + "logits/chosen": -1.2634572982788086, + "logits/rejected": -1.175369381904602, + "logps/chosen": -45.359378814697266, + "logps/rejected": -41.967159271240234, + "loss": 0.3946, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7169888019561768, + "rewards/margins": 1.3410743474960327, + "rewards/rejected": 1.375914454460144, + "step": 5052 + }, + { + "epoch": 0.82, + "learning_rate": 6.666804142888849e-06, + "logits/chosen": -1.3184863328933716, + "logits/rejected": -1.207991361618042, + "logps/chosen": -117.60140228271484, + "logps/rejected": -35.866554260253906, + "loss": 0.1503, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.4397759437561035, + "rewards/margins": 4.500107288360596, + "rewards/rejected": 1.9396686553955078, + "step": 5053 + }, + { + "epoch": 0.82, + "learning_rate": 6.665565007587888e-06, + "logits/chosen": -1.3612899780273438, + "logits/rejected": -1.1529676914215088, + "logps/chosen": -103.7730712890625, + "logps/rejected": -53.63672637939453, + "loss": 0.1349, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.295956611633301, + "rewards/margins": 3.8061342239379883, + "rewards/rejected": 3.4898223876953125, + "step": 5054 + }, + { + "epoch": 0.82, + "learning_rate": 6.664325757212147e-06, + "logits/chosen": -0.9152176976203918, + "logits/rejected": -0.9152176976203918, + "logps/chosen": -67.76080322265625, + "logps/rejected": -67.76080322265625, + "loss": 0.35, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.584742784500122, + "rewards/margins": 0.0, + "rewards/rejected": 3.584742784500122, + "step": 5055 + }, + { + "epoch": 0.82, + "learning_rate": 6.663086391847242e-06, + "logits/chosen": -1.2666432857513428, + "logits/rejected": -1.3267511129379272, + "logps/chosen": -85.73411560058594, + "logps/rejected": -112.70915222167969, + "loss": 3.8201, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.94256591796875, + "rewards/margins": -7.269255638122559, + "rewards/rejected": 9.211821556091309, + "step": 5056 + }, + { + "epoch": 0.82, + "learning_rate": 6.6618469115788055e-06, + "logits/chosen": -0.8669567704200745, + "logits/rejected": -0.9107807874679565, + "logps/chosen": -94.07115936279297, + "logps/rejected": -129.09388732910156, + "loss": 1.4286, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.090152740478516, + "rewards/margins": -2.714794158935547, + "rewards/rejected": 7.8049468994140625, + "step": 5057 + }, + { + "epoch": 0.82, + "learning_rate": 6.660607316492471e-06, + "logits/chosen": -0.6203440427780151, + "logits/rejected": -0.6371451616287231, + "logps/chosen": -51.284366607666016, + "logps/rejected": -53.72418212890625, + "loss": 0.8752, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8390178680419922, + "rewards/margins": 0.5575839281082153, + "rewards/rejected": 1.2814339399337769, + "step": 5058 + }, + { + "epoch": 0.82, + "learning_rate": 6.659367606673883e-06, + "logits/chosen": -1.0776751041412354, + "logits/rejected": -0.9501349329948425, + "logps/chosen": -83.98963165283203, + "logps/rejected": -27.564973831176758, + "loss": 0.0666, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.601874589920044, + "rewards/margins": 2.06662917137146, + "rewards/rejected": 0.5352453589439392, + "step": 5059 + }, + { + "epoch": 0.82, + "learning_rate": 6.658127782208695e-06, + "logits/chosen": -1.1071734428405762, + "logits/rejected": -1.0871009826660156, + "logps/chosen": -42.714054107666016, + "logps/rejected": -56.31983947753906, + "loss": 0.3415, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1035408973693848, + "rewards/margins": 0.08698344230651855, + "rewards/rejected": 2.016557455062866, + "step": 5060 + }, + { + "epoch": 0.82, + "learning_rate": 6.656887843182567e-06, + "logits/chosen": -0.9291273951530457, + "logits/rejected": -0.9216629862785339, + "logps/chosen": -44.41712188720703, + "logps/rejected": -71.766357421875, + "loss": 1.952, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.963714599609375, + "rewards/margins": -2.539790391921997, + "rewards/rejected": 3.503504991531372, + "step": 5061 + }, + { + "epoch": 0.82, + "learning_rate": 6.655647789681167e-06, + "logits/chosen": -1.2170758247375488, + "logits/rejected": -1.0805751085281372, + "logps/chosen": -100.95346069335938, + "logps/rejected": -76.44004821777344, + "loss": 0.2068, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2650558948516846, + "rewards/margins": 0.6854804754257202, + "rewards/rejected": 1.5795754194259644, + "step": 5062 + }, + { + "epoch": 0.82, + "learning_rate": 6.65440762179017e-06, + "logits/chosen": -1.3959625959396362, + "logits/rejected": -1.2784720659255981, + "logps/chosen": -129.67117309570312, + "logps/rejected": -14.998348236083984, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.701086521148682, + "rewards/margins": 6.671198844909668, + "rewards/rejected": 1.0298877954483032, + "step": 5063 + }, + { + "epoch": 0.82, + "learning_rate": 6.653167339595261e-06, + "logits/chosen": -1.2992099523544312, + "logits/rejected": -1.472826361656189, + "logps/chosen": -216.075927734375, + "logps/rejected": -176.8720703125, + "loss": 0.091, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.256704807281494, + "rewards/margins": 2.3723464012145996, + "rewards/rejected": 4.8843584060668945, + "step": 5064 + }, + { + "epoch": 0.82, + "learning_rate": 6.65192694318213e-06, + "logits/chosen": -1.2716217041015625, + "logits/rejected": -1.1481883525848389, + "logps/chosen": -123.63966369628906, + "logps/rejected": -81.28608703613281, + "loss": 0.3399, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.4479265213012695, + "rewards/margins": 1.01224684715271, + "rewards/rejected": 3.4356796741485596, + "step": 5065 + }, + { + "epoch": 0.82, + "learning_rate": 6.65068643263648e-06, + "logits/chosen": -0.8979488015174866, + "logits/rejected": -0.8736798763275146, + "logps/chosen": -57.01313400268555, + "logps/rejected": -62.11388397216797, + "loss": 1.8261, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5983943939208984, + "rewards/margins": -1.7219510078430176, + "rewards/rejected": 4.320345401763916, + "step": 5066 + }, + { + "epoch": 0.82, + "learning_rate": 6.649445808044014e-06, + "logits/chosen": -1.0258022546768188, + "logits/rejected": -1.0361028909683228, + "logps/chosen": -109.15357971191406, + "logps/rejected": -115.40603637695312, + "loss": 1.518, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6170976161956787, + "rewards/margins": -2.9846723079681396, + "rewards/rejected": 6.601769924163818, + "step": 5067 + }, + { + "epoch": 0.82, + "learning_rate": 6.648205069490451e-06, + "logits/chosen": -1.2185333967208862, + "logits/rejected": -1.1284135580062866, + "logps/chosen": -59.244380950927734, + "logps/rejected": -41.462547302246094, + "loss": 0.5256, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9835498332977295, + "rewards/margins": 1.069807767868042, + "rewards/rejected": 1.9137420654296875, + "step": 5068 + }, + { + "epoch": 0.82, + "learning_rate": 6.646964217061513e-06, + "logits/chosen": -1.4284790754318237, + "logits/rejected": -1.4062504768371582, + "logps/chosen": -135.92884826660156, + "logps/rejected": -104.68661499023438, + "loss": 0.7052, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7114228010177612, + "rewards/margins": -0.8694823980331421, + "rewards/rejected": 2.5809051990509033, + "step": 5069 + }, + { + "epoch": 0.82, + "learning_rate": 6.6457232508429325e-06, + "logits/chosen": -1.4658845663070679, + "logits/rejected": -1.4154019355773926, + "logps/chosen": -121.87762451171875, + "logps/rejected": -31.42289924621582, + "loss": 0.0605, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0052154064178467, + "rewards/margins": 2.0795934200286865, + "rewards/rejected": 0.9256219863891602, + "step": 5070 + }, + { + "epoch": 0.82, + "learning_rate": 6.644482170920445e-06, + "logits/chosen": -1.060808777809143, + "logits/rejected": -1.0401514768600464, + "logps/chosen": -44.89429473876953, + "logps/rejected": -50.340755462646484, + "loss": 0.2372, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.406579613685608, + "rewards/margins": 0.5022850036621094, + "rewards/rejected": 0.9042946100234985, + "step": 5071 + }, + { + "epoch": 0.82, + "learning_rate": 6.643240977379801e-06, + "logits/chosen": -0.754593551158905, + "logits/rejected": -0.754593551158905, + "logps/chosen": -46.90650177001953, + "logps/rejected": -46.90650177001953, + "loss": 0.7004, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6763232946395874, + "rewards/margins": 0.0, + "rewards/rejected": 1.6763232946395874, + "step": 5072 + }, + { + "epoch": 0.82, + "learning_rate": 6.641999670306755e-06, + "logits/chosen": -0.995292067527771, + "logits/rejected": -0.9972372651100159, + "logps/chosen": -33.980159759521484, + "logps/rejected": -48.530517578125, + "loss": 1.4778, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6406208276748657, + "rewards/margins": -1.4991992712020874, + "rewards/rejected": 3.139820098876953, + "step": 5073 + }, + { + "epoch": 0.82, + "learning_rate": 6.640758249787067e-06, + "logits/chosen": -0.7999140620231628, + "logits/rejected": -0.8678271770477295, + "logps/chosen": -58.0133056640625, + "logps/rejected": -62.067344665527344, + "loss": 0.2966, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3449599742889404, + "rewards/margins": 0.21902155876159668, + "rewards/rejected": 2.1259384155273438, + "step": 5074 + }, + { + "epoch": 0.82, + "learning_rate": 6.63951671590651e-06, + "logits/chosen": -1.1392898559570312, + "logits/rejected": -1.2849708795547485, + "logps/chosen": -159.79920959472656, + "logps/rejected": -131.63259887695312, + "loss": 1.8787, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.705216884613037, + "rewards/margins": -3.6311936378479004, + "rewards/rejected": 8.336410522460938, + "step": 5075 + }, + { + "epoch": 0.82, + "learning_rate": 6.638275068750861e-06, + "logits/chosen": -0.872305154800415, + "logits/rejected": -0.8364083170890808, + "logps/chosen": -98.5064697265625, + "logps/rejected": -80.09585571289062, + "loss": 1.9136, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5713096857070923, + "rewards/margins": 0.26343536376953125, + "rewards/rejected": 1.307874321937561, + "step": 5076 + }, + { + "epoch": 0.82, + "learning_rate": 6.637033308405906e-06, + "logits/chosen": -1.1628897190093994, + "logits/rejected": -1.0921804904937744, + "logps/chosen": -71.83563232421875, + "logps/rejected": -48.177555084228516, + "loss": 0.5549, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1758430004119873, + "rewards/margins": -0.678333044052124, + "rewards/rejected": 3.8541760444641113, + "step": 5077 + }, + { + "epoch": 0.82, + "learning_rate": 6.6357914349574396e-06, + "logits/chosen": -0.7057437300682068, + "logits/rejected": -0.7057437300682068, + "logps/chosen": -43.299224853515625, + "logps/rejected": -43.299224853515625, + "loss": 0.601, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3231377601623535, + "rewards/margins": 0.0, + "rewards/rejected": 2.3231377601623535, + "step": 5078 + }, + { + "epoch": 0.82, + "learning_rate": 6.634549448491261e-06, + "logits/chosen": -1.308258056640625, + "logits/rejected": -1.2790623903274536, + "logps/chosen": -127.53160095214844, + "logps/rejected": -118.74504852294922, + "loss": 0.4389, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.382673740386963, + "rewards/margins": -0.3336038589477539, + "rewards/rejected": 4.716277599334717, + "step": 5079 + }, + { + "epoch": 0.82, + "learning_rate": 6.633307349093183e-06, + "logits/chosen": -0.9194923639297485, + "logits/rejected": -1.0208353996276855, + "logps/chosen": -36.94453811645508, + "logps/rejected": -35.988746643066406, + "loss": 0.68, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.264359712600708, + "rewards/margins": -1.0597813129425049, + "rewards/rejected": 2.324141025543213, + "step": 5080 + }, + { + "epoch": 0.82, + "learning_rate": 6.632065136849023e-06, + "logits/chosen": -1.1969425678253174, + "logits/rejected": -1.149411916732788, + "logps/chosen": -113.85974884033203, + "logps/rejected": -45.247520446777344, + "loss": 1.3351, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4900872707366943, + "rewards/margins": -0.3325376510620117, + "rewards/rejected": 2.822624921798706, + "step": 5081 + }, + { + "epoch": 0.82, + "learning_rate": 6.630822811844604e-06, + "logits/chosen": -0.9792952537536621, + "logits/rejected": -0.8709773421287537, + "logps/chosen": -83.34297943115234, + "logps/rejected": -59.84373474121094, + "loss": 1.9005, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0318031311035156, + "rewards/margins": 0.29404377937316895, + "rewards/rejected": 2.7377593517303467, + "step": 5082 + }, + { + "epoch": 0.83, + "learning_rate": 6.6295803741657595e-06, + "logits/chosen": -0.7444510459899902, + "logits/rejected": -0.7752885818481445, + "logps/chosen": -72.96272277832031, + "logps/rejected": -55.800132751464844, + "loss": 0.6342, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6891305446624756, + "rewards/margins": -0.1049048900604248, + "rewards/rejected": 2.7940354347229004, + "step": 5083 + }, + { + "epoch": 0.83, + "learning_rate": 6.6283378238983295e-06, + "logits/chosen": -1.53941810131073, + "logits/rejected": -1.431180715560913, + "logps/chosen": -104.60301208496094, + "logps/rejected": -104.99385070800781, + "loss": 0.0744, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.205572605133057, + "rewards/margins": 2.595088243484497, + "rewards/rejected": 3.6104843616485596, + "step": 5084 + }, + { + "epoch": 0.83, + "learning_rate": 6.627095161128164e-06, + "logits/chosen": -1.494945764541626, + "logits/rejected": -1.5136117935180664, + "logps/chosen": -96.69546508789062, + "logps/rejected": -61.545204162597656, + "loss": 1.1772, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8645591735839844, + "rewards/margins": 0.4007704257965088, + "rewards/rejected": 2.4637887477874756, + "step": 5085 + }, + { + "epoch": 0.83, + "learning_rate": 6.62585238594112e-06, + "logits/chosen": -0.9807525277137756, + "logits/rejected": -1.0403223037719727, + "logps/chosen": -4.78738260269165, + "logps/rejected": -52.60700988769531, + "loss": 2.3995, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5876158475875854, + "rewards/margins": -2.731658935546875, + "rewards/rejected": 3.31927490234375, + "step": 5086 + }, + { + "epoch": 0.83, + "learning_rate": 6.624609498423058e-06, + "logits/chosen": -0.8037339448928833, + "logits/rejected": -0.8037339448928833, + "logps/chosen": -74.66033935546875, + "logps/rejected": -74.66033935546875, + "loss": 2.6105, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8398124575614929, + "rewards/margins": 0.0, + "rewards/rejected": 0.8398124575614929, + "step": 5087 + }, + { + "epoch": 0.83, + "learning_rate": 6.623366498659853e-06, + "logits/chosen": -0.9294210076332092, + "logits/rejected": -0.8311989307403564, + "logps/chosen": -44.884910583496094, + "logps/rejected": -69.27069091796875, + "loss": 0.9005, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2369332313537598, + "rewards/margins": -1.5291075706481934, + "rewards/rejected": 3.766040802001953, + "step": 5088 + }, + { + "epoch": 0.83, + "learning_rate": 6.622123386737382e-06, + "logits/chosen": -0.6957820653915405, + "logits/rejected": -0.7603565454483032, + "logps/chosen": -59.56462097167969, + "logps/rejected": -74.714111328125, + "loss": 0.9262, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1606621742248535, + "rewards/margins": -1.503838300704956, + "rewards/rejected": 3.6645004749298096, + "step": 5089 + }, + { + "epoch": 0.83, + "learning_rate": 6.620880162741534e-06, + "logits/chosen": -1.4389736652374268, + "logits/rejected": -1.3383821249008179, + "logps/chosen": -105.86612701416016, + "logps/rejected": -176.64315795898438, + "loss": 0.1439, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.796744346618652, + "rewards/margins": 1.1047978401184082, + "rewards/rejected": 7.691946506500244, + "step": 5090 + }, + { + "epoch": 0.83, + "learning_rate": 6.619636826758204e-06, + "logits/chosen": -1.1807730197906494, + "logits/rejected": -1.2060084342956543, + "logps/chosen": -230.30581665039062, + "logps/rejected": -97.41770935058594, + "loss": 0.0452, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.1705169677734375, + "rewards/margins": 4.234010219573975, + "rewards/rejected": 1.9365066289901733, + "step": 5091 + }, + { + "epoch": 0.83, + "learning_rate": 6.6183933788732956e-06, + "logits/chosen": -0.6069053411483765, + "logits/rejected": -0.594735860824585, + "logps/chosen": -9.007594108581543, + "logps/rejected": -14.356579780578613, + "loss": 0.5031, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.15959607064723969, + "rewards/margins": -0.5077961683273315, + "rewards/rejected": 0.6673922538757324, + "step": 5092 + }, + { + "epoch": 0.83, + "learning_rate": 6.6171498191727195e-06, + "logits/chosen": -1.1853324174880981, + "logits/rejected": -1.1125062704086304, + "logps/chosen": -60.72308349609375, + "logps/rejected": -42.32438659667969, + "loss": 0.4529, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7583649158477783, + "rewards/margins": 0.14334726333618164, + "rewards/rejected": 3.6150176525115967, + "step": 5093 + }, + { + "epoch": 0.83, + "learning_rate": 6.615906147742389e-06, + "logits/chosen": -0.8813905119895935, + "logits/rejected": -0.8680459260940552, + "logps/chosen": -27.153844833374023, + "logps/rejected": -22.66127586364746, + "loss": 0.4039, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8616647720336914, + "rewards/margins": 0.026822268962860107, + "rewards/rejected": 0.8348425030708313, + "step": 5094 + }, + { + "epoch": 0.83, + "learning_rate": 6.614662364668235e-06, + "logits/chosen": -0.5734671950340271, + "logits/rejected": -0.5734671950340271, + "logps/chosen": -1.292917013168335, + "logps/rejected": -1.292917013168335, + "loss": 0.5716, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.41129836440086365, + "rewards/margins": 0.0, + "rewards/rejected": 0.41129836440086365, + "step": 5095 + }, + { + "epoch": 0.83, + "learning_rate": 6.613418470036189e-06, + "logits/chosen": -1.206466794013977, + "logits/rejected": -1.0827702283859253, + "logps/chosen": -101.76809692382812, + "logps/rejected": -65.4848403930664, + "loss": 0.0966, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.587506294250488, + "rewards/margins": 1.6184289455413818, + "rewards/rejected": 3.9690773487091064, + "step": 5096 + }, + { + "epoch": 0.83, + "learning_rate": 6.612174463932194e-06, + "logits/chosen": -0.7395508885383606, + "logits/rejected": -0.826781690120697, + "logps/chosen": -40.45240783691406, + "logps/rejected": -123.74714660644531, + "loss": 1.4377, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.755164384841919, + "rewards/margins": 0.3205498456954956, + "rewards/rejected": 1.4346145391464233, + "step": 5097 + }, + { + "epoch": 0.83, + "learning_rate": 6.610930346442198e-06, + "logits/chosen": -1.0491715669631958, + "logits/rejected": -1.0491715669631958, + "logps/chosen": -4.078033447265625, + "logps/rejected": -4.078033447265625, + "loss": 1.26, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.40491780638694763, + "rewards/margins": 0.0, + "rewards/rejected": 0.40491780638694763, + "step": 5098 + }, + { + "epoch": 0.83, + "learning_rate": 6.609686117652158e-06, + "logits/chosen": -1.0220937728881836, + "logits/rejected": -0.9734618663787842, + "logps/chosen": -137.02430725097656, + "logps/rejected": -105.92884826660156, + "loss": 1.4633, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.052128791809082, + "rewards/margins": -2.866671562194824, + "rewards/rejected": 7.918800354003906, + "step": 5099 + }, + { + "epoch": 0.83, + "learning_rate": 6.608441777648037e-06, + "logits/chosen": -0.8208739757537842, + "logits/rejected": -0.8208739757537842, + "logps/chosen": -40.379051208496094, + "logps/rejected": -40.379051208496094, + "loss": 1.0288, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.827218234539032, + "rewards/margins": 0.0, + "rewards/rejected": 0.827218234539032, + "step": 5100 + }, + { + "epoch": 0.83, + "learning_rate": 6.607197326515808e-06, + "logits/chosen": -1.1819545030593872, + "logits/rejected": -1.2798309326171875, + "logps/chosen": -81.63058471679688, + "logps/rejected": -88.26522064208984, + "loss": 0.8099, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.657412052154541, + "rewards/margins": -1.1367683410644531, + "rewards/rejected": 3.794180393218994, + "step": 5101 + }, + { + "epoch": 0.83, + "learning_rate": 6.6059527643414535e-06, + "logits/chosen": -0.9348669648170471, + "logits/rejected": -0.9405880570411682, + "logps/chosen": -17.83049774169922, + "logps/rejected": -24.157503128051758, + "loss": 0.602, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7500820159912109, + "rewards/margins": 0.2818138003349304, + "rewards/rejected": 0.4682682156562805, + "step": 5102 + }, + { + "epoch": 0.83, + "learning_rate": 6.604708091210957e-06, + "logits/chosen": -1.439095377922058, + "logits/rejected": -1.4545974731445312, + "logps/chosen": -124.32933044433594, + "logps/rejected": -127.4544448852539, + "loss": 0.7529, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3200180530548096, + "rewards/margins": -0.6864051818847656, + "rewards/rejected": 2.006423234939575, + "step": 5103 + }, + { + "epoch": 0.83, + "learning_rate": 6.603463307210316e-06, + "logits/chosen": -1.163521647453308, + "logits/rejected": -1.085541844367981, + "logps/chosen": -53.06391143798828, + "logps/rejected": -33.01350402832031, + "loss": 1.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9352478981018066, + "rewards/margins": 1.7342438697814941, + "rewards/rejected": 1.2010040283203125, + "step": 5104 + }, + { + "epoch": 0.83, + "learning_rate": 6.602218412425532e-06, + "logits/chosen": -0.8753345012664795, + "logits/rejected": -0.8985103368759155, + "logps/chosen": -37.94516372680664, + "logps/rejected": -62.36125183105469, + "loss": 1.326, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1674282550811768, + "rewards/margins": 0.8054485321044922, + "rewards/rejected": 0.3619796931743622, + "step": 5105 + }, + { + "epoch": 0.83, + "learning_rate": 6.600973406942617e-06, + "logits/chosen": -1.4543997049331665, + "logits/rejected": -1.4443857669830322, + "logps/chosen": -104.54888916015625, + "logps/rejected": -64.25357818603516, + "loss": 1.1821, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.327210903167725, + "rewards/margins": 2.359830379486084, + "rewards/rejected": 2.9673805236816406, + "step": 5106 + }, + { + "epoch": 0.83, + "learning_rate": 6.5997282908475865e-06, + "logits/chosen": -1.1510874032974243, + "logits/rejected": -1.055546760559082, + "logps/chosen": -60.12415313720703, + "logps/rejected": -38.4126091003418, + "loss": 0.2414, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.633811950683594, + "rewards/margins": 3.511802911758423, + "rewards/rejected": 3.122009038925171, + "step": 5107 + }, + { + "epoch": 0.83, + "learning_rate": 6.5984830642264695e-06, + "logits/chosen": -0.9306119084358215, + "logits/rejected": -0.9048649072647095, + "logps/chosen": -87.45706176757812, + "logps/rejected": -78.0382080078125, + "loss": 0.2778, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.857020616531372, + "rewards/margins": 0.35545873641967773, + "rewards/rejected": 2.5015618801116943, + "step": 5108 + }, + { + "epoch": 0.83, + "learning_rate": 6.597237727165298e-06, + "logits/chosen": -0.6379067301750183, + "logits/rejected": -0.6379067301750183, + "logps/chosen": -5.859416961669922, + "logps/rejected": -5.859416961669922, + "loss": 0.3818, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2904081344604492, + "rewards/margins": 0.0, + "rewards/rejected": 0.2904081344604492, + "step": 5109 + }, + { + "epoch": 0.83, + "learning_rate": 6.595992279750111e-06, + "logits/chosen": -0.9096025824546814, + "logits/rejected": -0.9654120206832886, + "logps/chosen": -117.76575469970703, + "logps/rejected": -85.1939697265625, + "loss": 0.7522, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.909151554107666, + "rewards/margins": 1.9584771394729614, + "rewards/rejected": 1.9506744146347046, + "step": 5110 + }, + { + "epoch": 0.83, + "learning_rate": 6.594746722066959e-06, + "logits/chosen": -0.7448300719261169, + "logits/rejected": -0.7523084878921509, + "logps/chosen": -19.024091720581055, + "logps/rejected": -26.124874114990234, + "loss": 0.6891, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5701425671577454, + "rewards/margins": 0.01606309413909912, + "rewards/rejected": 0.5540794730186462, + "step": 5111 + }, + { + "epoch": 0.83, + "learning_rate": 6.5935010542019e-06, + "logits/chosen": -0.9859868288040161, + "logits/rejected": -0.9554756283760071, + "logps/chosen": -62.627479553222656, + "logps/rejected": -51.105072021484375, + "loss": 1.0443, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5181755423545837, + "rewards/margins": -1.0709068775177002, + "rewards/rejected": 1.5890823602676392, + "step": 5112 + }, + { + "epoch": 0.83, + "learning_rate": 6.592255276240994e-06, + "logits/chosen": -1.4667657613754272, + "logits/rejected": -1.4667657613754272, + "logps/chosen": -59.6773681640625, + "logps/rejected": -59.6773681640625, + "loss": 0.5311, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9783729314804077, + "rewards/margins": 0.0, + "rewards/rejected": 1.9783729314804077, + "step": 5113 + }, + { + "epoch": 0.83, + "learning_rate": 6.591009388270315e-06, + "logits/chosen": -0.9866348505020142, + "logits/rejected": -0.8624509572982788, + "logps/chosen": -56.869956970214844, + "logps/rejected": -17.28034019470215, + "loss": 0.5918, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6052017211914062, + "rewards/margins": 1.9693406820297241, + "rewards/rejected": 0.6358610391616821, + "step": 5114 + }, + { + "epoch": 0.83, + "learning_rate": 6.589763390375942e-06, + "logits/chosen": -1.3762885332107544, + "logits/rejected": -1.2184802293777466, + "logps/chosen": -114.0435791015625, + "logps/rejected": -61.31683349609375, + "loss": 0.7198, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.545909404754639, + "rewards/margins": -0.7146086692810059, + "rewards/rejected": 5.2605180740356445, + "step": 5115 + }, + { + "epoch": 0.83, + "learning_rate": 6.588517282643961e-06, + "logits/chosen": -1.115242838859558, + "logits/rejected": -1.1606311798095703, + "logps/chosen": -90.52091979980469, + "logps/rejected": -105.77976989746094, + "loss": 1.1939, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1209068298339844, + "rewards/margins": -2.0551109313964844, + "rewards/rejected": 3.1760177612304688, + "step": 5116 + }, + { + "epoch": 0.83, + "learning_rate": 6.587271065160465e-06, + "logits/chosen": -1.393236517906189, + "logits/rejected": -1.3199553489685059, + "logps/chosen": -121.3597412109375, + "logps/rejected": -87.79332733154297, + "loss": 1.1463, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.207144260406494, + "rewards/margins": 0.1323690414428711, + "rewards/rejected": 5.074775218963623, + "step": 5117 + }, + { + "epoch": 0.83, + "learning_rate": 6.58602473801156e-06, + "logits/chosen": -1.1731066703796387, + "logits/rejected": -1.1731066703796387, + "logps/chosen": -74.2748794555664, + "logps/rejected": -74.2748794555664, + "loss": 0.4096, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0650596618652344, + "rewards/margins": 0.0, + "rewards/rejected": 2.0650596618652344, + "step": 5118 + }, + { + "epoch": 0.83, + "learning_rate": 6.584778301283352e-06, + "logits/chosen": -1.4128974676132202, + "logits/rejected": -1.4273377656936646, + "logps/chosen": -114.53226470947266, + "logps/rejected": -82.73432922363281, + "loss": 1.0308, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.395012617111206, + "rewards/margins": -1.380286455154419, + "rewards/rejected": 3.775299072265625, + "step": 5119 + }, + { + "epoch": 0.83, + "learning_rate": 6.583531755061959e-06, + "logits/chosen": -1.03371000289917, + "logits/rejected": -1.1077518463134766, + "logps/chosen": -24.92344093322754, + "logps/rejected": -104.26398468017578, + "loss": 1.2855, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.575092077255249, + "rewards/margins": -2.430046319961548, + "rewards/rejected": 5.005138397216797, + "step": 5120 + }, + { + "epoch": 0.83, + "learning_rate": 6.582285099433503e-06, + "logits/chosen": -0.9233807325363159, + "logits/rejected": -0.9368986487388611, + "logps/chosen": -66.35284423828125, + "logps/rejected": -62.23049545288086, + "loss": 0.714, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.204463243484497, + "rewards/margins": -0.2907276153564453, + "rewards/rejected": 2.4951908588409424, + "step": 5121 + }, + { + "epoch": 0.83, + "learning_rate": 6.58103833448412e-06, + "logits/chosen": -1.307636022567749, + "logits/rejected": -1.142824411392212, + "logps/chosen": -93.01651000976562, + "logps/rejected": -79.9425048828125, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.961804389953613, + "rewards/margins": 5.738960266113281, + "rewards/rejected": 2.222843885421753, + "step": 5122 + }, + { + "epoch": 0.83, + "learning_rate": 6.579791460299948e-06, + "logits/chosen": -1.276412844657898, + "logits/rejected": -1.344873309135437, + "logps/chosen": -158.31661987304688, + "logps/rejected": -102.41627502441406, + "loss": 1.6597, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.05494537577033043, + "rewards/margins": -0.8261581659317017, + "rewards/rejected": 0.881103515625, + "step": 5123 + }, + { + "epoch": 0.83, + "learning_rate": 6.5785444769671335e-06, + "logits/chosen": -0.9129014015197754, + "logits/rejected": -0.9354346990585327, + "logps/chosen": -95.82183074951172, + "logps/rejected": -109.04768371582031, + "loss": 0.3177, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2217628955841064, + "rewards/margins": 1.1696243286132812, + "rewards/rejected": 1.0521385669708252, + "step": 5124 + }, + { + "epoch": 0.83, + "learning_rate": 6.577297384571832e-06, + "logits/chosen": -0.49846160411834717, + "logits/rejected": -0.49846160411834717, + "logps/chosen": -45.247413635253906, + "logps/rejected": -45.247413635253906, + "loss": 0.385, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.269873857498169, + "rewards/margins": 0.0, + "rewards/rejected": 1.269873857498169, + "step": 5125 + }, + { + "epoch": 0.83, + "learning_rate": 6.576050183200208e-06, + "logits/chosen": -1.4759984016418457, + "logits/rejected": -1.3596570491790771, + "logps/chosen": -93.36392211914062, + "logps/rejected": -17.360366821289062, + "loss": 0.1011, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.924861192703247, + "rewards/margins": 2.0065221786499023, + "rewards/rejected": 0.9183389544487, + "step": 5126 + }, + { + "epoch": 0.83, + "learning_rate": 6.5748028729384265e-06, + "logits/chosen": -0.6263065338134766, + "logits/rejected": -0.6263065338134766, + "logps/chosen": -17.09457778930664, + "logps/rejected": -17.09457778930664, + "loss": 0.3481, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5753734707832336, + "rewards/margins": 0.0, + "rewards/rejected": 0.5753734707832336, + "step": 5127 + }, + { + "epoch": 0.83, + "learning_rate": 6.5735554538726685e-06, + "logits/chosen": -1.1539267301559448, + "logits/rejected": -1.1852906942367554, + "logps/chosen": -88.10194396972656, + "logps/rejected": -123.24081420898438, + "loss": 0.6213, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.334159851074219, + "rewards/margins": 0.24617624282836914, + "rewards/rejected": 6.08798360824585, + "step": 5128 + }, + { + "epoch": 0.83, + "learning_rate": 6.572307926089118e-06, + "logits/chosen": -1.3011311292648315, + "logits/rejected": -1.3853940963745117, + "logps/chosen": -72.4974136352539, + "logps/rejected": -132.20028686523438, + "loss": 2.2897, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2671241760253906, + "rewards/margins": -4.560058116912842, + "rewards/rejected": 6.827182292938232, + "step": 5129 + }, + { + "epoch": 0.83, + "learning_rate": 6.571060289673966e-06, + "logits/chosen": -1.0891016721725464, + "logits/rejected": -1.0250887870788574, + "logps/chosen": -110.95521545410156, + "logps/rejected": -77.56727600097656, + "loss": 1.1481, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.968014717102051, + "rewards/margins": 1.4705827236175537, + "rewards/rejected": 3.497431993484497, + "step": 5130 + }, + { + "epoch": 0.83, + "learning_rate": 6.569812544713414e-06, + "logits/chosen": -1.37424635887146, + "logits/rejected": -1.2727608680725098, + "logps/chosen": -125.941162109375, + "logps/rejected": -54.64529800415039, + "loss": 0.5542, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.706526279449463, + "rewards/margins": 3.1950573921203613, + "rewards/rejected": 2.5114688873291016, + "step": 5131 + }, + { + "epoch": 0.83, + "learning_rate": 6.56856469129367e-06, + "logits/chosen": -1.3502436876296997, + "logits/rejected": -1.3725649118423462, + "logps/chosen": -31.994041442871094, + "logps/rejected": -12.795022010803223, + "loss": 0.4326, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.306069940328598, + "rewards/margins": -0.29422029852867126, + "rewards/rejected": 0.6002902388572693, + "step": 5132 + }, + { + "epoch": 0.83, + "learning_rate": 6.567316729500945e-06, + "logits/chosen": -1.134701132774353, + "logits/rejected": -1.1379711627960205, + "logps/chosen": -116.81825256347656, + "logps/rejected": -92.60887145996094, + "loss": 2.8212, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.608107089996338, + "rewards/margins": -0.2794158458709717, + "rewards/rejected": 3.8875229358673096, + "step": 5133 + }, + { + "epoch": 0.83, + "learning_rate": 6.566068659421467e-06, + "logits/chosen": -0.9481939673423767, + "logits/rejected": -0.9593067169189453, + "logps/chosen": -57.94477844238281, + "logps/rejected": -57.11443328857422, + "loss": 0.9613, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0876967906951904, + "rewards/margins": 0.6225928068161011, + "rewards/rejected": 1.4651039838790894, + "step": 5134 + }, + { + "epoch": 0.83, + "learning_rate": 6.564820481141462e-06, + "logits/chosen": -1.3456740379333496, + "logits/rejected": -1.3028252124786377, + "logps/chosen": -55.536582946777344, + "logps/rejected": -65.5269775390625, + "loss": 1.322, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8468315601348877, + "rewards/margins": -1.268862009048462, + "rewards/rejected": 4.11569356918335, + "step": 5135 + }, + { + "epoch": 0.83, + "learning_rate": 6.563572194747168e-06, + "logits/chosen": -1.00804603099823, + "logits/rejected": -1.1451138257980347, + "logps/chosen": -63.17063522338867, + "logps/rejected": -115.33448791503906, + "loss": 1.6958, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0277225971221924, + "rewards/margins": -3.1159770488739014, + "rewards/rejected": 6.143699645996094, + "step": 5136 + }, + { + "epoch": 0.83, + "learning_rate": 6.562323800324829e-06, + "logits/chosen": -1.3627033233642578, + "logits/rejected": -1.3637990951538086, + "logps/chosen": -153.45773315429688, + "logps/rejected": -89.92189025878906, + "loss": 0.4458, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.503399848937988, + "rewards/margins": -0.2190399169921875, + "rewards/rejected": 6.722439765930176, + "step": 5137 + }, + { + "epoch": 0.83, + "learning_rate": 6.5610752979607e-06, + "logits/chosen": -1.1249361038208008, + "logits/rejected": -1.1249361038208008, + "logps/chosen": -21.166288375854492, + "logps/rejected": -21.166288375854492, + "loss": 0.4864, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3894674479961395, + "rewards/margins": 0.0, + "rewards/rejected": 0.3894674479961395, + "step": 5138 + }, + { + "epoch": 0.83, + "learning_rate": 6.559826687741038e-06, + "logits/chosen": -1.0696310997009277, + "logits/rejected": -0.9925338625907898, + "logps/chosen": -56.956756591796875, + "logps/rejected": -46.864044189453125, + "loss": 0.096, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.343379259109497, + "rewards/margins": 1.6769638061523438, + "rewards/rejected": 1.6664154529571533, + "step": 5139 + }, + { + "epoch": 0.83, + "learning_rate": 6.558577969752111e-06, + "logits/chosen": -1.0294352769851685, + "logits/rejected": -1.0984070301055908, + "logps/chosen": -10.835615158081055, + "logps/rejected": -87.10103607177734, + "loss": 0.713, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6455604434013367, + "rewards/margins": -0.363838255405426, + "rewards/rejected": 1.0093986988067627, + "step": 5140 + }, + { + "epoch": 0.83, + "learning_rate": 6.557329144080193e-06, + "logits/chosen": -1.2679471969604492, + "logits/rejected": -1.2300782203674316, + "logps/chosen": -61.349342346191406, + "logps/rejected": -22.387279510498047, + "loss": 0.4965, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4984642267227173, + "rewards/margins": 0.837838351726532, + "rewards/rejected": 0.6606258749961853, + "step": 5141 + }, + { + "epoch": 0.83, + "learning_rate": 6.556080210811568e-06, + "logits/chosen": -1.150620937347412, + "logits/rejected": -1.0912381410598755, + "logps/chosen": -114.90646362304688, + "logps/rejected": -56.2540168762207, + "loss": 0.1082, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.779887676239014, + "rewards/margins": 2.8083364963531494, + "rewards/rejected": 2.9715511798858643, + "step": 5142 + }, + { + "epoch": 0.83, + "learning_rate": 6.554831170032524e-06, + "logits/chosen": -0.9965250492095947, + "logits/rejected": -1.1116211414337158, + "logps/chosen": -34.58335876464844, + "logps/rejected": -59.313480377197266, + "loss": 1.089, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.283154249191284, + "rewards/margins": -1.2536983489990234, + "rewards/rejected": 3.5368525981903076, + "step": 5143 + }, + { + "epoch": 0.83, + "learning_rate": 6.553582021829359e-06, + "logits/chosen": -0.945794939994812, + "logits/rejected": -0.9142354130744934, + "logps/chosen": -32.393760681152344, + "logps/rejected": -4.135851860046387, + "loss": 1.5395, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8448761105537415, + "rewards/margins": 0.38034287095069885, + "rewards/rejected": 0.4645332396030426, + "step": 5144 + }, + { + "epoch": 0.84, + "learning_rate": 6.552332766288374e-06, + "logits/chosen": -0.8053504228591919, + "logits/rejected": -0.8053504228591919, + "logps/chosen": -44.07566833496094, + "logps/rejected": -44.07566833496094, + "loss": 0.3739, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6044063568115234, + "rewards/margins": 0.0, + "rewards/rejected": 1.6044063568115234, + "step": 5145 + }, + { + "epoch": 0.84, + "learning_rate": 6.551083403495885e-06, + "logits/chosen": -0.8312621712684631, + "logits/rejected": -0.8312621712684631, + "logps/chosen": -29.02432632446289, + "logps/rejected": -29.02432632446289, + "loss": 0.7013, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4894806146621704, + "rewards/margins": 0.0, + "rewards/rejected": 1.4894806146621704, + "step": 5146 + }, + { + "epoch": 0.84, + "learning_rate": 6.549833933538209e-06, + "logits/chosen": -0.9873153567314148, + "logits/rejected": -0.9930798411369324, + "logps/chosen": -5.5754570960998535, + "logps/rejected": -3.8616366386413574, + "loss": 0.4105, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.10952859371900558, + "rewards/margins": -0.13940078020095825, + "rewards/rejected": 0.24892936646938324, + "step": 5147 + }, + { + "epoch": 0.84, + "learning_rate": 6.548584356501673e-06, + "logits/chosen": -1.343938946723938, + "logits/rejected": -1.3286077976226807, + "logps/chosen": -154.97738647460938, + "logps/rejected": -105.86985778808594, + "loss": 0.1881, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.892962694168091, + "rewards/margins": 0.8224380016326904, + "rewards/rejected": 3.0705246925354004, + "step": 5148 + }, + { + "epoch": 0.84, + "learning_rate": 6.547334672472609e-06, + "logits/chosen": -1.144287109375, + "logits/rejected": -1.0366098880767822, + "logps/chosen": -144.7420654296875, + "logps/rejected": -82.28565979003906, + "loss": 0.8362, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.63671875, + "rewards/margins": -0.03708195686340332, + "rewards/rejected": 1.6738007068634033, + "step": 5149 + }, + { + "epoch": 0.84, + "learning_rate": 6.546084881537362e-06, + "logits/chosen": -1.0230157375335693, + "logits/rejected": -0.9902897477149963, + "logps/chosen": -83.229248046875, + "logps/rejected": -116.01103210449219, + "loss": 0.1863, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7500083446502686, + "rewards/margins": 1.066525936126709, + "rewards/rejected": 1.6834824085235596, + "step": 5150 + }, + { + "epoch": 0.84, + "learning_rate": 6.544834983782279e-06, + "logits/chosen": -1.4283310174942017, + "logits/rejected": -1.4288510084152222, + "logps/chosen": -89.81885528564453, + "logps/rejected": -93.11864471435547, + "loss": 0.745, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6153221130371094, + "rewards/margins": -1.0956497192382812, + "rewards/rejected": 3.7109718322753906, + "step": 5151 + }, + { + "epoch": 0.84, + "learning_rate": 6.543584979293716e-06, + "logits/chosen": -1.308834433555603, + "logits/rejected": -1.2310091257095337, + "logps/chosen": -131.81956481933594, + "logps/rejected": -77.41942596435547, + "loss": 0.04, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.782594203948975, + "rewards/margins": 2.9269323348999023, + "rewards/rejected": 2.8556618690490723, + "step": 5152 + }, + { + "epoch": 0.84, + "learning_rate": 6.542334868158036e-06, + "logits/chosen": -1.072114109992981, + "logits/rejected": -1.0979279279708862, + "logps/chosen": -43.24224090576172, + "logps/rejected": -85.45599365234375, + "loss": 0.1503, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0662879943847656, + "rewards/margins": 1.2828880548477173, + "rewards/rejected": 1.7833999395370483, + "step": 5153 + }, + { + "epoch": 0.84, + "learning_rate": 6.54108465046161e-06, + "logits/chosen": -0.8404815196990967, + "logits/rejected": -0.7012484073638916, + "logps/chosen": -32.64311599731445, + "logps/rejected": -10.9044189453125, + "loss": 0.0938, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.396238327026367, + "rewards/margins": 1.826385498046875, + "rewards/rejected": 0.5698528289794922, + "step": 5154 + }, + { + "epoch": 0.84, + "learning_rate": 6.539834326290817e-06, + "logits/chosen": -1.2661999464035034, + "logits/rejected": -1.25289785861969, + "logps/chosen": -188.11273193359375, + "logps/rejected": -151.45889282226562, + "loss": 0.9732, + "rewards/accuracies": 0.0, + "rewards/chosen": 8.05344295501709, + "rewards/margins": -1.7813701629638672, + "rewards/rejected": 9.834813117980957, + "step": 5155 + }, + { + "epoch": 0.84, + "learning_rate": 6.538583895732042e-06, + "logits/chosen": -1.3468793630599976, + "logits/rejected": -1.3022089004516602, + "logps/chosen": -64.57538604736328, + "logps/rejected": -49.444175720214844, + "loss": 0.3172, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3399055004119873, + "rewards/margins": 0.12233257293701172, + "rewards/rejected": 2.2175729274749756, + "step": 5156 + }, + { + "epoch": 0.84, + "learning_rate": 6.537333358871678e-06, + "logits/chosen": -1.2209970951080322, + "logits/rejected": -1.1592494249343872, + "logps/chosen": -107.86398315429688, + "logps/rejected": -109.4642333984375, + "loss": 0.7513, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.395001411437988, + "rewards/margins": 0.6071290969848633, + "rewards/rejected": 4.787872314453125, + "step": 5157 + }, + { + "epoch": 0.84, + "learning_rate": 6.536082715796125e-06, + "logits/chosen": -1.2269246578216553, + "logits/rejected": -1.2091619968414307, + "logps/chosen": -323.23968505859375, + "logps/rejected": -99.64818572998047, + "loss": 0.8931, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.01015043258667, + "rewards/margins": 2.7316691875457764, + "rewards/rejected": 2.2784812450408936, + "step": 5158 + }, + { + "epoch": 0.84, + "learning_rate": 6.534831966591791e-06, + "logits/chosen": -1.3438395261764526, + "logits/rejected": -1.3525502681732178, + "logps/chosen": -100.56956481933594, + "logps/rejected": -109.5448226928711, + "loss": 2.1722, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8972901105880737, + "rewards/margins": 0.021210551261901855, + "rewards/rejected": 1.8760795593261719, + "step": 5159 + }, + { + "epoch": 0.84, + "learning_rate": 6.533581111345091e-06, + "logits/chosen": -1.1309547424316406, + "logits/rejected": -1.0942710638046265, + "logps/chosen": -58.326698303222656, + "logps/rejected": -9.238226890563965, + "loss": 0.9254, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.06348877400159836, + "rewards/margins": -0.7804833650588989, + "rewards/rejected": 0.71699458360672, + "step": 5160 + }, + { + "epoch": 0.84, + "learning_rate": 6.532330150142448e-06, + "logits/chosen": -0.7731366157531738, + "logits/rejected": -0.6922644376754761, + "logps/chosen": -34.69053649902344, + "logps/rejected": -59.965755462646484, + "loss": 0.291, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.993021845817566, + "rewards/margins": 0.27292561531066895, + "rewards/rejected": 1.720096230506897, + "step": 5161 + }, + { + "epoch": 0.84, + "learning_rate": 6.531079083070289e-06, + "logits/chosen": -1.2830445766448975, + "logits/rejected": -1.2823566198349, + "logps/chosen": -53.69391632080078, + "logps/rejected": -73.76536560058594, + "loss": 0.1895, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.032630205154419, + "rewards/margins": 0.912068247795105, + "rewards/rejected": 1.120561957359314, + "step": 5162 + }, + { + "epoch": 0.84, + "learning_rate": 6.529827910215053e-06, + "logits/chosen": -0.9302247762680054, + "logits/rejected": -0.8723623156547546, + "logps/chosen": -54.28175354003906, + "logps/rejected": -99.12413024902344, + "loss": 0.7566, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.446933269500732, + "rewards/margins": 1.2986314296722412, + "rewards/rejected": 3.148301839828491, + "step": 5163 + }, + { + "epoch": 0.84, + "learning_rate": 6.528576631663184e-06, + "logits/chosen": -0.9511743783950806, + "logits/rejected": -0.8968651294708252, + "logps/chosen": -52.54655075073242, + "logps/rejected": -30.413984298706055, + "loss": 0.4056, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.637378454208374, + "rewards/margins": 0.7107402086257935, + "rewards/rejected": 1.9266382455825806, + "step": 5164 + }, + { + "epoch": 0.84, + "learning_rate": 6.527325247501133e-06, + "logits/chosen": -1.2446476221084595, + "logits/rejected": -1.2994885444641113, + "logps/chosen": -130.89195251464844, + "logps/rejected": -106.08040618896484, + "loss": 0.0277, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.323197841644287, + "rewards/margins": 4.065937042236328, + "rewards/rejected": 1.2572609186172485, + "step": 5165 + }, + { + "epoch": 0.84, + "learning_rate": 6.5260737578153595e-06, + "logits/chosen": -0.9351041316986084, + "logits/rejected": -0.9351041316986084, + "logps/chosen": -41.485382080078125, + "logps/rejected": -41.485382080078125, + "loss": 1.2566, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8557839393615723, + "rewards/margins": 0.0, + "rewards/rejected": 2.8557839393615723, + "step": 5166 + }, + { + "epoch": 0.84, + "learning_rate": 6.5248221626923295e-06, + "logits/chosen": -1.2998275756835938, + "logits/rejected": -1.3861377239227295, + "logps/chosen": -108.02545166015625, + "logps/rejected": -110.85372924804688, + "loss": 1.264, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9905060529708862, + "rewards/margins": -2.176217555999756, + "rewards/rejected": 4.166723728179932, + "step": 5167 + }, + { + "epoch": 0.84, + "learning_rate": 6.523570462218516e-06, + "logits/chosen": -1.516746163368225, + "logits/rejected": -1.5209059715270996, + "logps/chosen": -97.4546127319336, + "logps/rejected": -95.16215515136719, + "loss": 1.0343, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5736015439033508, + "rewards/margins": -1.9081850051879883, + "rewards/rejected": 2.4817864894866943, + "step": 5168 + }, + { + "epoch": 0.84, + "learning_rate": 6.522318656480398e-06, + "logits/chosen": -1.2814607620239258, + "logits/rejected": -1.271264672279358, + "logps/chosen": -104.8670425415039, + "logps/rejected": -146.68698120117188, + "loss": 0.4374, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0232093334198, + "rewards/margins": 1.193560004234314, + "rewards/rejected": 1.8296493291854858, + "step": 5169 + }, + { + "epoch": 0.84, + "learning_rate": 6.521066745564467e-06, + "logits/chosen": -0.7940928339958191, + "logits/rejected": -0.7903487086296082, + "logps/chosen": -4.0975117683410645, + "logps/rejected": -2.408111810684204, + "loss": 1.2762, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2740841805934906, + "rewards/margins": -0.012104511260986328, + "rewards/rejected": 0.28618869185447693, + "step": 5170 + }, + { + "epoch": 0.84, + "learning_rate": 6.519814729557217e-06, + "logits/chosen": -1.1468288898468018, + "logits/rejected": -1.0892375707626343, + "logps/chosen": -54.94463348388672, + "logps/rejected": -106.03933715820312, + "loss": 1.3748, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7856231927871704, + "rewards/margins": -2.6246261596679688, + "rewards/rejected": 4.41024923324585, + "step": 5171 + }, + { + "epoch": 0.84, + "learning_rate": 6.518562608545148e-06, + "logits/chosen": -1.0687806606292725, + "logits/rejected": -1.0534714460372925, + "logps/chosen": -35.65753936767578, + "logps/rejected": -51.94878387451172, + "loss": 0.6237, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5651803016662598, + "rewards/margins": 0.30011677742004395, + "rewards/rejected": 2.265063524246216, + "step": 5172 + }, + { + "epoch": 0.84, + "learning_rate": 6.517310382614772e-06, + "logits/chosen": -1.0815770626068115, + "logits/rejected": -1.0888341665267944, + "logps/chosen": -28.09316062927246, + "logps/rejected": -40.62525177001953, + "loss": 0.4645, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0132510662078857, + "rewards/margins": -0.14329266548156738, + "rewards/rejected": 2.156543731689453, + "step": 5173 + }, + { + "epoch": 0.84, + "learning_rate": 6.516058051852605e-06, + "logits/chosen": -0.9209417700767517, + "logits/rejected": -0.9205414056777954, + "logps/chosen": -2.2499406337738037, + "logps/rejected": -5.724109649658203, + "loss": 0.4118, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30224889516830444, + "rewards/margins": 0.20177386701107025, + "rewards/rejected": 0.10047502815723419, + "step": 5174 + }, + { + "epoch": 0.84, + "learning_rate": 6.5148056163451735e-06, + "logits/chosen": -1.2616914510726929, + "logits/rejected": -1.466826319694519, + "logps/chosen": -73.70506286621094, + "logps/rejected": -38.246341705322266, + "loss": 0.3096, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5701262950897217, + "rewards/margins": 0.7503818273544312, + "rewards/rejected": 1.8197444677352905, + "step": 5175 + }, + { + "epoch": 0.84, + "learning_rate": 6.513553076179005e-06, + "logits/chosen": -1.1242947578430176, + "logits/rejected": -1.093950867652893, + "logps/chosen": -132.65567016601562, + "logps/rejected": -102.25341796875, + "loss": 0.2079, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.236842393875122, + "rewards/margins": 0.6640312671661377, + "rewards/rejected": 1.5728111267089844, + "step": 5176 + }, + { + "epoch": 0.84, + "learning_rate": 6.512300431440642e-06, + "logits/chosen": -1.05325186252594, + "logits/rejected": -1.0428723096847534, + "logps/chosen": -84.06871032714844, + "logps/rejected": -65.64474487304688, + "loss": 1.7486, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.260141134262085, + "rewards/margins": -0.10825037956237793, + "rewards/rejected": 2.368391513824463, + "step": 5177 + }, + { + "epoch": 0.84, + "learning_rate": 6.511047682216628e-06, + "logits/chosen": -0.9598556160926819, + "logits/rejected": -0.9598556160926819, + "logps/chosen": -3.157968521118164, + "logps/rejected": -3.157968521118164, + "loss": 0.4621, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6373019218444824, + "rewards/margins": 0.0, + "rewards/rejected": 0.6373019218444824, + "step": 5178 + }, + { + "epoch": 0.84, + "learning_rate": 6.509794828593516e-06, + "logits/chosen": -1.4569767713546753, + "logits/rejected": -1.3128821849822998, + "logps/chosen": -174.7698211669922, + "logps/rejected": -203.408203125, + "loss": 1.8899, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.442430019378662, + "rewards/margins": -3.7544054985046387, + "rewards/rejected": 10.1968355178833, + "step": 5179 + }, + { + "epoch": 0.84, + "learning_rate": 6.508541870657868e-06, + "logits/chosen": -1.1849819421768188, + "logits/rejected": -1.1393018960952759, + "logps/chosen": -108.38737487792969, + "logps/rejected": -46.989288330078125, + "loss": 0.3247, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4082093238830566, + "rewards/margins": 0.7542054653167725, + "rewards/rejected": 2.654003858566284, + "step": 5180 + }, + { + "epoch": 0.84, + "learning_rate": 6.507288808496251e-06, + "logits/chosen": -1.1356436014175415, + "logits/rejected": -1.1456999778747559, + "logps/chosen": -45.17402648925781, + "logps/rejected": -48.30996322631836, + "loss": 0.263, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4630146026611328, + "rewards/margins": 0.4817500710487366, + "rewards/rejected": 0.9812645316123962, + "step": 5181 + }, + { + "epoch": 0.84, + "learning_rate": 6.506035642195239e-06, + "logits/chosen": -1.333956003189087, + "logits/rejected": -1.3079795837402344, + "logps/chosen": -148.04037475585938, + "logps/rejected": -173.01535034179688, + "loss": 0.3942, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8891632556915283, + "rewards/margins": -0.17895817756652832, + "rewards/rejected": 2.0681214332580566, + "step": 5182 + }, + { + "epoch": 0.84, + "learning_rate": 6.504782371841414e-06, + "logits/chosen": -1.4592080116271973, + "logits/rejected": -1.4450109004974365, + "logps/chosen": -179.44046020507812, + "logps/rejected": -60.24263000488281, + "loss": 0.0409, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.086158752441406, + "rewards/margins": 3.1700797080993652, + "rewards/rejected": 0.9160789847373962, + "step": 5183 + }, + { + "epoch": 0.84, + "learning_rate": 6.503528997521365e-06, + "logits/chosen": -0.964471697807312, + "logits/rejected": -0.9979105591773987, + "logps/chosen": -64.4078140258789, + "logps/rejected": -66.2230224609375, + "loss": 0.8173, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1650794744491577, + "rewards/margins": -0.3096604347229004, + "rewards/rejected": 1.474739909172058, + "step": 5184 + }, + { + "epoch": 0.84, + "learning_rate": 6.502275519321691e-06, + "logits/chosen": -0.7033200263977051, + "logits/rejected": -0.7477976679801941, + "logps/chosen": -87.6319351196289, + "logps/rejected": -75.19988250732422, + "loss": 0.1769, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.321049451828003, + "rewards/margins": 1.1150656938552856, + "rewards/rejected": 1.2059837579727173, + "step": 5185 + }, + { + "epoch": 0.84, + "learning_rate": 6.501021937328992e-06, + "logits/chosen": -1.1820898056030273, + "logits/rejected": -1.2184333801269531, + "logps/chosen": -48.80609893798828, + "logps/rejected": -70.65774536132812, + "loss": 1.4863, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.004955291748047, + "rewards/margins": 0.7666518688201904, + "rewards/rejected": 1.2383034229278564, + "step": 5186 + }, + { + "epoch": 0.84, + "learning_rate": 6.49976825162988e-06, + "logits/chosen": -1.365587830543518, + "logits/rejected": -1.2618193626403809, + "logps/chosen": -156.9661865234375, + "logps/rejected": -40.44697952270508, + "loss": 0.1497, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2300491333007812, + "rewards/margins": 2.342625379562378, + "rewards/rejected": 0.8874236941337585, + "step": 5187 + }, + { + "epoch": 0.84, + "learning_rate": 6.498514462310972e-06, + "logits/chosen": -1.1899864673614502, + "logits/rejected": -1.1150110960006714, + "logps/chosen": -57.44477844238281, + "logps/rejected": -10.062697410583496, + "loss": 0.1561, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9394387006759644, + "rewards/margins": 1.6976126432418823, + "rewards/rejected": 0.24182605743408203, + "step": 5188 + }, + { + "epoch": 0.84, + "learning_rate": 6.4972605694588945e-06, + "logits/chosen": -1.5752588510513306, + "logits/rejected": -1.5490223169326782, + "logps/chosen": -103.2615966796875, + "logps/rejected": -88.67501831054688, + "loss": 0.3783, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.328961372375488, + "rewards/margins": 1.492285966873169, + "rewards/rejected": 3.8366754055023193, + "step": 5189 + }, + { + "epoch": 0.84, + "learning_rate": 6.496006573160278e-06, + "logits/chosen": -0.7991329431533813, + "logits/rejected": -0.7991329431533813, + "logps/chosen": -57.764556884765625, + "logps/rejected": -57.764556884765625, + "loss": 0.4164, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2439601868391037, + "rewards/margins": 0.0, + "rewards/rejected": 0.2439601868391037, + "step": 5190 + }, + { + "epoch": 0.84, + "learning_rate": 6.4947524735017644e-06, + "logits/chosen": -1.215670108795166, + "logits/rejected": -1.2407861948013306, + "logps/chosen": -46.741634368896484, + "logps/rejected": -62.20024108886719, + "loss": 0.6346, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.729194402694702, + "rewards/margins": 0.8612821102142334, + "rewards/rejected": 1.8679122924804688, + "step": 5191 + }, + { + "epoch": 0.84, + "learning_rate": 6.493498270569998e-06, + "logits/chosen": -1.098272681236267, + "logits/rejected": -1.0132794380187988, + "logps/chosen": -86.87600708007812, + "logps/rejected": -90.65647888183594, + "loss": 0.5466, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9963624477386475, + "rewards/margins": 0.7648141384124756, + "rewards/rejected": 2.231548309326172, + "step": 5192 + }, + { + "epoch": 0.84, + "learning_rate": 6.492243964451632e-06, + "logits/chosen": -1.2110408544540405, + "logits/rejected": -1.1050218343734741, + "logps/chosen": -41.10015106201172, + "logps/rejected": -16.040372848510742, + "loss": 0.4817, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.297531843185425, + "rewards/margins": 1.9030723571777344, + "rewards/rejected": 0.3944595456123352, + "step": 5193 + }, + { + "epoch": 0.84, + "learning_rate": 6.490989555233328e-06, + "logits/chosen": -1.0971306562423706, + "logits/rejected": -1.155738353729248, + "logps/chosen": -79.9927749633789, + "logps/rejected": -78.33218383789062, + "loss": 0.5357, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2642745971679688, + "rewards/margins": -0.6234101057052612, + "rewards/rejected": 1.88768470287323, + "step": 5194 + }, + { + "epoch": 0.84, + "learning_rate": 6.4897350430017526e-06, + "logits/chosen": -0.9994891881942749, + "logits/rejected": -1.007105827331543, + "logps/chosen": -101.6590576171875, + "logps/rejected": -52.777652740478516, + "loss": 0.4777, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5310516357421875, + "rewards/margins": 1.5407962799072266, + "rewards/rejected": 0.9902553558349609, + "step": 5195 + }, + { + "epoch": 0.84, + "learning_rate": 6.488480427843583e-06, + "logits/chosen": -1.0329868793487549, + "logits/rejected": -1.0405645370483398, + "logps/chosen": -75.01507568359375, + "logps/rejected": -109.68666076660156, + "loss": 2.08, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.40751051902771, + "rewards/margins": -3.542036294937134, + "rewards/rejected": 5.949546813964844, + "step": 5196 + }, + { + "epoch": 0.84, + "learning_rate": 6.487225709845499e-06, + "logits/chosen": -1.0663871765136719, + "logits/rejected": -1.0652836561203003, + "logps/chosen": -55.235687255859375, + "logps/rejected": -48.91253662109375, + "loss": 0.5383, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5484893321990967, + "rewards/margins": 0.09871673583984375, + "rewards/rejected": 2.449772596359253, + "step": 5197 + }, + { + "epoch": 0.84, + "learning_rate": 6.485970889094192e-06, + "logits/chosen": -1.2435040473937988, + "logits/rejected": -1.084845781326294, + "logps/chosen": -82.08317565917969, + "logps/rejected": -28.365034103393555, + "loss": 0.3309, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.253652334213257, + "rewards/margins": 1.030595302581787, + "rewards/rejected": 1.2230570316314697, + "step": 5198 + }, + { + "epoch": 0.84, + "learning_rate": 6.4847159656763585e-06, + "logits/chosen": -1.1040377616882324, + "logits/rejected": -1.1753525733947754, + "logps/chosen": -73.75424194335938, + "logps/rejected": -72.9082260131836, + "loss": 0.6521, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.99932861328125, + "rewards/margins": -0.9849021434783936, + "rewards/rejected": 2.9842307567596436, + "step": 5199 + }, + { + "epoch": 0.84, + "learning_rate": 6.4834609396786976e-06, + "logits/chosen": -1.2913720607757568, + "logits/rejected": -1.1267590522766113, + "logps/chosen": -105.74634552001953, + "logps/rejected": -21.83361053466797, + "loss": 0.2151, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.782787322998047, + "rewards/margins": 5.57684850692749, + "rewards/rejected": 1.205938696861267, + "step": 5200 + }, + { + "epoch": 0.84, + "learning_rate": 6.482205811187922e-06, + "logits/chosen": -1.080482840538025, + "logits/rejected": -1.0460081100463867, + "logps/chosen": -96.41610717773438, + "logps/rejected": -57.51205825805664, + "loss": 2.0248, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2298660278320312, + "rewards/margins": -0.5579433441162109, + "rewards/rejected": 1.7878093719482422, + "step": 5201 + }, + { + "epoch": 0.84, + "learning_rate": 6.480950580290751e-06, + "logits/chosen": -1.2318334579467773, + "logits/rejected": -1.2318334579467773, + "logps/chosen": -45.494590759277344, + "logps/rejected": -45.494590759277344, + "loss": 0.3558, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8575751781463623, + "rewards/margins": 0.0, + "rewards/rejected": 2.8575751781463623, + "step": 5202 + }, + { + "epoch": 0.84, + "learning_rate": 6.479695247073907e-06, + "logits/chosen": -0.7400863170623779, + "logits/rejected": -0.7444899082183838, + "logps/chosen": -37.294097900390625, + "logps/rejected": -36.88862991333008, + "loss": 0.4023, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.098019003868103, + "rewards/margins": 0.04833030700683594, + "rewards/rejected": 1.049688696861267, + "step": 5203 + }, + { + "epoch": 0.84, + "learning_rate": 6.478439811624123e-06, + "logits/chosen": -1.0524247884750366, + "logits/rejected": -1.190009355545044, + "logps/chosen": -73.39117431640625, + "logps/rejected": -142.73379516601562, + "loss": 3.3702, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1473946571350098, + "rewards/margins": -6.69926118850708, + "rewards/rejected": 8.84665584564209, + "step": 5204 + }, + { + "epoch": 0.84, + "learning_rate": 6.477184274028137e-06, + "logits/chosen": -1.2615652084350586, + "logits/rejected": -1.3119512796401978, + "logps/chosen": -62.70228576660156, + "logps/rejected": -85.16999053955078, + "loss": 1.1665, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7782013416290283, + "rewards/margins": -1.4694654941558838, + "rewards/rejected": 4.247666835784912, + "step": 5205 + }, + { + "epoch": 0.84, + "learning_rate": 6.475928634372695e-06, + "logits/chosen": -1.4499984979629517, + "logits/rejected": -1.3059054613113403, + "logps/chosen": -154.87942504882812, + "logps/rejected": -58.94486618041992, + "loss": 0.0685, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.693609714508057, + "rewards/margins": 2.212519407272339, + "rewards/rejected": 2.4810903072357178, + "step": 5206 + }, + { + "epoch": 0.85, + "learning_rate": 6.474672892744549e-06, + "logits/chosen": -1.0227245092391968, + "logits/rejected": -1.1865087747573853, + "logps/chosen": -82.59856414794922, + "logps/rejected": -185.2985076904297, + "loss": 2.2748, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.126000165939331, + "rewards/margins": -4.150179862976074, + "rewards/rejected": 7.276179790496826, + "step": 5207 + }, + { + "epoch": 0.85, + "learning_rate": 6.473417049230459e-06, + "logits/chosen": -1.2060091495513916, + "logits/rejected": -0.9005882143974304, + "logps/chosen": -124.27536010742188, + "logps/rejected": -55.10676574707031, + "loss": 0.0213, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0835723876953125, + "rewards/margins": 3.3074119091033936, + "rewards/rejected": -0.22383956611156464, + "step": 5208 + }, + { + "epoch": 0.85, + "learning_rate": 6.472161103917194e-06, + "logits/chosen": -1.589542269706726, + "logits/rejected": -1.4860581159591675, + "logps/chosen": -126.56336975097656, + "logps/rejected": -95.08245849609375, + "loss": 0.2302, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.368846416473389, + "rewards/margins": 2.9493212699890137, + "rewards/rejected": 4.419525146484375, + "step": 5209 + }, + { + "epoch": 0.85, + "learning_rate": 6.470905056891526e-06, + "logits/chosen": -1.0622464418411255, + "logits/rejected": -1.1462241411209106, + "logps/chosen": -74.35452270507812, + "logps/rejected": -119.07046508789062, + "loss": 1.297, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.757748603820801, + "rewards/margins": -2.12656831741333, + "rewards/rejected": 7.884316921234131, + "step": 5210 + }, + { + "epoch": 0.85, + "learning_rate": 6.469648908240236e-06, + "logits/chosen": -1.27302086353302, + "logits/rejected": -1.334897518157959, + "logps/chosen": -64.92160034179688, + "logps/rejected": -97.91424560546875, + "loss": 1.8262, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9375641345977783, + "rewards/margins": -3.4025604724884033, + "rewards/rejected": 5.340124607086182, + "step": 5211 + }, + { + "epoch": 0.85, + "learning_rate": 6.468392658050113e-06, + "logits/chosen": -0.9328949451446533, + "logits/rejected": -0.728286623954773, + "logps/chosen": -101.685302734375, + "logps/rejected": -13.883879661560059, + "loss": 0.2431, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.15401029586792, + "rewards/margins": 6.951735496520996, + "rewards/rejected": 0.20227490365505219, + "step": 5212 + }, + { + "epoch": 0.85, + "learning_rate": 6.467136306407951e-06, + "logits/chosen": -1.209781289100647, + "logits/rejected": -1.2339726686477661, + "logps/chosen": -67.19385528564453, + "logps/rejected": -54.70643615722656, + "loss": 0.8117, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.633896589279175, + "rewards/margins": -0.5464699268341064, + "rewards/rejected": 3.1803665161132812, + "step": 5213 + }, + { + "epoch": 0.85, + "learning_rate": 6.465879853400553e-06, + "logits/chosen": -1.3314341306686401, + "logits/rejected": -1.2688452005386353, + "logps/chosen": -43.54499053955078, + "logps/rejected": -32.27704620361328, + "loss": 0.341, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.671132802963257, + "rewards/margins": 0.6825673580169678, + "rewards/rejected": 1.988565444946289, + "step": 5214 + }, + { + "epoch": 0.85, + "learning_rate": 6.464623299114727e-06, + "logits/chosen": -1.449874997138977, + "logits/rejected": -1.4335792064666748, + "logps/chosen": -60.49948501586914, + "logps/rejected": -63.23837661743164, + "loss": 1.1, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.578943967819214, + "rewards/margins": -1.1860628128051758, + "rewards/rejected": 3.7650067806243896, + "step": 5215 + }, + { + "epoch": 0.85, + "learning_rate": 6.46336664363729e-06, + "logits/chosen": -1.3011566400527954, + "logits/rejected": -1.4925498962402344, + "logps/chosen": -82.68132019042969, + "logps/rejected": -35.846107482910156, + "loss": 0.0444, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.120344638824463, + "rewards/margins": 2.8243255615234375, + "rewards/rejected": 0.2960189878940582, + "step": 5216 + }, + { + "epoch": 0.85, + "learning_rate": 6.462109887055063e-06, + "logits/chosen": -1.0504193305969238, + "logits/rejected": -1.0504193305969238, + "logps/chosen": -58.55595397949219, + "logps/rejected": -58.55595397949219, + "loss": 0.5018, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7449356317520142, + "rewards/margins": 0.0, + "rewards/rejected": 1.7449356317520142, + "step": 5217 + }, + { + "epoch": 0.85, + "learning_rate": 6.460853029454879e-06, + "logits/chosen": -1.173231840133667, + "logits/rejected": -1.1363157033920288, + "logps/chosen": -27.582435607910156, + "logps/rejected": -61.520111083984375, + "loss": 0.6634, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7283989191055298, + "rewards/margins": -0.7885903120040894, + "rewards/rejected": 2.516989231109619, + "step": 5218 + }, + { + "epoch": 0.85, + "learning_rate": 6.459596070923573e-06, + "logits/chosen": -0.6229451298713684, + "logits/rejected": -0.628544270992279, + "logps/chosen": -6.8990912437438965, + "logps/rejected": -3.059990644454956, + "loss": 0.706, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.23818126320838928, + "rewards/margins": -0.15074288845062256, + "rewards/rejected": 0.38892415165901184, + "step": 5219 + }, + { + "epoch": 0.85, + "learning_rate": 6.458339011547989e-06, + "logits/chosen": -1.2367944717407227, + "logits/rejected": -1.0833594799041748, + "logps/chosen": -60.75593185424805, + "logps/rejected": -13.119363784790039, + "loss": 0.246, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4840619564056396, + "rewards/margins": 1.6407452821731567, + "rewards/rejected": 0.8433166742324829, + "step": 5220 + }, + { + "epoch": 0.85, + "learning_rate": 6.457081851414977e-06, + "logits/chosen": -1.226859450340271, + "logits/rejected": -1.1714509725570679, + "logps/chosen": -122.38517761230469, + "logps/rejected": -155.93963623046875, + "loss": 1.8082, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.035022020339966, + "rewards/margins": -3.533099412918091, + "rewards/rejected": 5.568121433258057, + "step": 5221 + }, + { + "epoch": 0.85, + "learning_rate": 6.455824590611398e-06, + "logits/chosen": -1.0269598960876465, + "logits/rejected": -1.0214061737060547, + "logps/chosen": -53.6867790222168, + "logps/rejected": -74.44107055664062, + "loss": 0.9088, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8591159582138062, + "rewards/margins": -1.014248251914978, + "rewards/rejected": 2.873364210128784, + "step": 5222 + }, + { + "epoch": 0.85, + "learning_rate": 6.4545672292241135e-06, + "logits/chosen": -1.1136735677719116, + "logits/rejected": -1.0697675943374634, + "logps/chosen": -78.7869873046875, + "logps/rejected": -88.58201599121094, + "loss": 0.3733, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.701489210128784, + "rewards/margins": -0.09108448028564453, + "rewards/rejected": 2.7925736904144287, + "step": 5223 + }, + { + "epoch": 0.85, + "learning_rate": 6.453309767339998e-06, + "logits/chosen": -1.4853078126907349, + "logits/rejected": -1.4695675373077393, + "logps/chosen": -41.868736267089844, + "logps/rejected": -108.039306640625, + "loss": 0.6865, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0976784229278564, + "rewards/margins": -0.8649895191192627, + "rewards/rejected": 2.962667942047119, + "step": 5224 + }, + { + "epoch": 0.85, + "learning_rate": 6.452052205045929e-06, + "logits/chosen": -1.1906731128692627, + "logits/rejected": -1.1205034255981445, + "logps/chosen": -89.73831939697266, + "logps/rejected": -55.535396575927734, + "loss": 0.278, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.655886173248291, + "rewards/margins": 0.7423038482666016, + "rewards/rejected": 2.9135823249816895, + "step": 5225 + }, + { + "epoch": 0.85, + "learning_rate": 6.450794542428791e-06, + "logits/chosen": -1.1868034601211548, + "logits/rejected": -1.0853753089904785, + "logps/chosen": -69.2242431640625, + "logps/rejected": -25.318519592285156, + "loss": 1.6078, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6481834650039673, + "rewards/margins": 1.849169373512268, + "rewards/rejected": -0.20098590850830078, + "step": 5226 + }, + { + "epoch": 0.85, + "learning_rate": 6.449536779575478e-06, + "logits/chosen": -0.7682556509971619, + "logits/rejected": -0.7354788780212402, + "logps/chosen": -95.19584655761719, + "logps/rejected": -60.43028259277344, + "loss": 0.4293, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.555883765220642, + "rewards/margins": -0.04684150218963623, + "rewards/rejected": 1.6027252674102783, + "step": 5227 + }, + { + "epoch": 0.85, + "learning_rate": 6.44827891657289e-06, + "logits/chosen": -0.9811496734619141, + "logits/rejected": -0.9270270466804504, + "logps/chosen": -75.70064544677734, + "logps/rejected": -82.56587982177734, + "loss": 1.6786, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6158134937286377, + "rewards/margins": 0.06914222240447998, + "rewards/rejected": 1.5466712713241577, + "step": 5228 + }, + { + "epoch": 0.85, + "learning_rate": 6.447020953507932e-06, + "logits/chosen": -1.0625789165496826, + "logits/rejected": -1.1068730354309082, + "logps/chosen": -73.03394317626953, + "logps/rejected": -89.5477294921875, + "loss": 2.019, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0260276794433594, + "rewards/margins": 0.8578453063964844, + "rewards/rejected": 1.168182373046875, + "step": 5229 + }, + { + "epoch": 0.85, + "learning_rate": 6.445762890467517e-06, + "logits/chosen": -1.010141372680664, + "logits/rejected": -1.010141372680664, + "logps/chosen": -22.67233657836914, + "logps/rejected": -22.67233657836914, + "loss": 0.7295, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8230869770050049, + "rewards/margins": 0.0, + "rewards/rejected": 1.8230869770050049, + "step": 5230 + }, + { + "epoch": 0.85, + "learning_rate": 6.444504727538568e-06, + "logits/chosen": -0.8978091478347778, + "logits/rejected": -0.7741901874542236, + "logps/chosen": -44.57429504394531, + "logps/rejected": -40.19021224975586, + "loss": 0.3067, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.973248243331909, + "rewards/margins": 1.1226089000701904, + "rewards/rejected": 1.8506393432617188, + "step": 5231 + }, + { + "epoch": 0.85, + "learning_rate": 6.443246464808008e-06, + "logits/chosen": -1.034217357635498, + "logits/rejected": -1.1748663187026978, + "logps/chosen": -82.77120971679688, + "logps/rejected": -105.16117858886719, + "loss": 2.6365, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.308685302734375, + "rewards/margins": -3.547694683074951, + "rewards/rejected": 5.856379985809326, + "step": 5232 + }, + { + "epoch": 0.85, + "learning_rate": 6.4419881023627755e-06, + "logits/chosen": -1.3866482973098755, + "logits/rejected": -1.449708104133606, + "logps/chosen": -84.04573059082031, + "logps/rejected": -99.7091293334961, + "loss": 0.4696, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3318793773651123, + "rewards/margins": -0.1103973388671875, + "rewards/rejected": 3.4422767162323, + "step": 5233 + }, + { + "epoch": 0.85, + "learning_rate": 6.440729640289809e-06, + "logits/chosen": -1.4169971942901611, + "logits/rejected": -1.5072979927062988, + "logps/chosen": -94.76637268066406, + "logps/rejected": -142.9188995361328, + "loss": 1.9638, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4380600452423096, + "rewards/margins": -3.866986036300659, + "rewards/rejected": 6.305046081542969, + "step": 5234 + }, + { + "epoch": 0.85, + "learning_rate": 6.439471078676056e-06, + "logits/chosen": -1.3825913667678833, + "logits/rejected": -1.0993443727493286, + "logps/chosen": -281.0843505859375, + "logps/rejected": -85.64273834228516, + "loss": 0.3494, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.70955228805542, + "rewards/margins": 5.168184280395508, + "rewards/rejected": 1.5413681268692017, + "step": 5235 + }, + { + "epoch": 0.85, + "learning_rate": 6.438212417608473e-06, + "logits/chosen": -1.2047096490859985, + "logits/rejected": -1.154983401298523, + "logps/chosen": -82.45402526855469, + "logps/rejected": -35.80055236816406, + "loss": 0.1091, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.470947265625, + "rewards/margins": 1.4574699401855469, + "rewards/rejected": 0.013477325439453125, + "step": 5236 + }, + { + "epoch": 0.85, + "learning_rate": 6.436953657174019e-06, + "logits/chosen": -1.4332482814788818, + "logits/rejected": -1.4036540985107422, + "logps/chosen": -118.99807739257812, + "logps/rejected": -122.77011108398438, + "loss": 0.9806, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.228930950164795, + "rewards/margins": -0.45339012145996094, + "rewards/rejected": 7.682321071624756, + "step": 5237 + }, + { + "epoch": 0.85, + "learning_rate": 6.435694797459664e-06, + "logits/chosen": -1.2233593463897705, + "logits/rejected": -1.1281548738479614, + "logps/chosen": -108.02931213378906, + "logps/rejected": -67.92997741699219, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.419661045074463, + "rewards/margins": 3.990771532058716, + "rewards/rejected": 1.428889513015747, + "step": 5238 + }, + { + "epoch": 0.85, + "learning_rate": 6.4344358385523844e-06, + "logits/chosen": -1.0659193992614746, + "logits/rejected": -1.0655635595321655, + "logps/chosen": -109.64985656738281, + "logps/rejected": -176.16571044921875, + "loss": 0.3499, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7734649181365967, + "rewards/margins": 1.2250548601150513, + "rewards/rejected": 1.5484100580215454, + "step": 5239 + }, + { + "epoch": 0.85, + "learning_rate": 6.433176780539161e-06, + "logits/chosen": -1.036879539489746, + "logits/rejected": -0.9272069931030273, + "logps/chosen": -57.32680130004883, + "logps/rejected": -7.279468059539795, + "loss": 0.4782, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.26023530960083, + "rewards/margins": 4.450326442718506, + "rewards/rejected": 0.8099090456962585, + "step": 5240 + }, + { + "epoch": 0.85, + "learning_rate": 6.431917623506982e-06, + "logits/chosen": -1.0953638553619385, + "logits/rejected": -1.0900309085845947, + "logps/chosen": -106.56036376953125, + "logps/rejected": -108.04154968261719, + "loss": 1.0651, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6233842372894287, + "rewards/margins": -1.9658277034759521, + "rewards/rejected": 5.589211940765381, + "step": 5241 + }, + { + "epoch": 0.85, + "learning_rate": 6.4306583675428435e-06, + "logits/chosen": -1.4186348915100098, + "logits/rejected": -1.4186348915100098, + "logps/chosen": -75.93772888183594, + "logps/rejected": -75.93772888183594, + "loss": 0.3715, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.335899353027344, + "rewards/margins": 0.0, + "rewards/rejected": 4.335899353027344, + "step": 5242 + }, + { + "epoch": 0.85, + "learning_rate": 6.42939901273375e-06, + "logits/chosen": -0.7877815961837769, + "logits/rejected": -0.6159659028053284, + "logps/chosen": -50.99781799316406, + "logps/rejected": -6.200814723968506, + "loss": 1.6278, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8573296070098877, + "rewards/margins": 1.1544873714447021, + "rewards/rejected": 0.7028421759605408, + "step": 5243 + }, + { + "epoch": 0.85, + "learning_rate": 6.428139559166708e-06, + "logits/chosen": -1.6137384176254272, + "logits/rejected": -1.547984004020691, + "logps/chosen": -130.1259765625, + "logps/rejected": -62.302974700927734, + "loss": 0.508, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.363951206207275, + "rewards/margins": 2.247819185256958, + "rewards/rejected": 3.1161320209503174, + "step": 5244 + }, + { + "epoch": 0.85, + "learning_rate": 6.4268800069287385e-06, + "logits/chosen": -1.1156904697418213, + "logits/rejected": -1.1991844177246094, + "logps/chosen": -49.298004150390625, + "logps/rejected": -115.3214111328125, + "loss": 0.7847, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6977698802948, + "rewards/margins": -1.2178001403808594, + "rewards/rejected": 3.915570020675659, + "step": 5245 + }, + { + "epoch": 0.85, + "learning_rate": 6.4256203561068605e-06, + "logits/chosen": -1.1999093294143677, + "logits/rejected": -1.1990052461624146, + "logps/chosen": -172.57015991210938, + "logps/rejected": -189.78492736816406, + "loss": 3.8642, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.694116115570068, + "rewards/margins": -4.485890865325928, + "rewards/rejected": 10.180006980895996, + "step": 5246 + }, + { + "epoch": 0.85, + "learning_rate": 6.424360606788105e-06, + "logits/chosen": -0.9001337289810181, + "logits/rejected": -0.9071294665336609, + "logps/chosen": -37.388893127441406, + "logps/rejected": -46.677947998046875, + "loss": 0.8743, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.415098190307617, + "rewards/margins": -0.03247714042663574, + "rewards/rejected": 2.447575330734253, + "step": 5247 + }, + { + "epoch": 0.85, + "learning_rate": 6.4231007590595094e-06, + "logits/chosen": -0.6284477114677429, + "logits/rejected": -0.6294052600860596, + "logps/chosen": -1.7852370738983154, + "logps/rejected": -1.2275102138519287, + "loss": 0.6377, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40631207823753357, + "rewards/margins": 0.09703952074050903, + "rewards/rejected": 0.30927255749702454, + "step": 5248 + }, + { + "epoch": 0.85, + "learning_rate": 6.421840813008117e-06, + "logits/chosen": -1.362040638923645, + "logits/rejected": -1.255941390991211, + "logps/chosen": -66.05001068115234, + "logps/rejected": -16.0924129486084, + "loss": 0.1574, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.606173038482666, + "rewards/margins": 2.584500789642334, + "rewards/rejected": 0.02167224884033203, + "step": 5249 + }, + { + "epoch": 0.85, + "learning_rate": 6.420580768720977e-06, + "logits/chosen": -0.9738302826881409, + "logits/rejected": -1.0430586338043213, + "logps/chosen": -61.7352294921875, + "logps/rejected": -45.41735076904297, + "loss": 0.6469, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5048844814300537, + "rewards/margins": -0.37890148162841797, + "rewards/rejected": 2.8837859630584717, + "step": 5250 + }, + { + "epoch": 0.85, + "learning_rate": 6.419320626285148e-06, + "logits/chosen": -1.4027423858642578, + "logits/rejected": -1.3555220365524292, + "logps/chosen": -75.82173919677734, + "logps/rejected": -51.11558151245117, + "loss": 0.4799, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5016372203826904, + "rewards/margins": -0.15578055381774902, + "rewards/rejected": 2.6574177742004395, + "step": 5251 + }, + { + "epoch": 0.85, + "learning_rate": 6.418060385787694e-06, + "logits/chosen": -1.045793056488037, + "logits/rejected": -0.6928987503051758, + "logps/chosen": -118.45030212402344, + "logps/rejected": -26.339982986450195, + "loss": 0.1801, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.195059299468994, + "rewards/margins": 3.003636360168457, + "rewards/rejected": 1.1914228200912476, + "step": 5252 + }, + { + "epoch": 0.85, + "learning_rate": 6.416800047315687e-06, + "logits/chosen": -1.4825373888015747, + "logits/rejected": -1.4044915437698364, + "logps/chosen": -49.87547302246094, + "logps/rejected": -28.859622955322266, + "loss": 0.0681, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.903344750404358, + "rewards/margins": 2.4833896160125732, + "rewards/rejected": -0.5800449252128601, + "step": 5253 + }, + { + "epoch": 0.85, + "learning_rate": 6.4155396109561995e-06, + "logits/chosen": -1.042426347732544, + "logits/rejected": -0.9955582022666931, + "logps/chosen": -56.650447845458984, + "logps/rejected": -38.970909118652344, + "loss": 0.3044, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9091861844062805, + "rewards/margins": 0.2232772707939148, + "rewards/rejected": 0.6859089136123657, + "step": 5254 + }, + { + "epoch": 0.85, + "learning_rate": 6.414279076796321e-06, + "logits/chosen": -1.0237966775894165, + "logits/rejected": -0.9432703256607056, + "logps/chosen": -44.49602127075195, + "logps/rejected": -12.460094451904297, + "loss": 0.706, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.895638644695282, + "rewards/margins": -0.08705025911331177, + "rewards/rejected": 0.9826889038085938, + "step": 5255 + }, + { + "epoch": 0.85, + "learning_rate": 6.413018444923138e-06, + "logits/chosen": -1.2323845624923706, + "logits/rejected": -1.148719310760498, + "logps/chosen": -94.99988555908203, + "logps/rejected": -23.80733871459961, + "loss": 0.1084, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6273170709609985, + "rewards/margins": 1.8325538635253906, + "rewards/rejected": -0.20523682236671448, + "step": 5256 + }, + { + "epoch": 0.85, + "learning_rate": 6.411757715423752e-06, + "logits/chosen": -1.0831748247146606, + "logits/rejected": -0.8505271077156067, + "logps/chosen": -170.3933868408203, + "logps/rejected": -82.69429779052734, + "loss": 0.2174, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.866502285003662, + "rewards/margins": 0.7001585960388184, + "rewards/rejected": 4.166343688964844, + "step": 5257 + }, + { + "epoch": 0.85, + "learning_rate": 6.410496888385266e-06, + "logits/chosen": -0.8626815676689148, + "logits/rejected": -0.9257334470748901, + "logps/chosen": -84.82589721679688, + "logps/rejected": -88.4364013671875, + "loss": 0.9049, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4375648498535156, + "rewards/margins": -0.09706199169158936, + "rewards/rejected": 1.534626841545105, + "step": 5258 + }, + { + "epoch": 0.85, + "learning_rate": 6.409235963894791e-06, + "logits/chosen": -1.3401033878326416, + "logits/rejected": -1.3398743867874146, + "logps/chosen": -95.00121307373047, + "logps/rejected": -89.08103942871094, + "loss": 0.2691, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6794145107269287, + "rewards/margins": 0.3475837707519531, + "rewards/rejected": 2.3318307399749756, + "step": 5259 + }, + { + "epoch": 0.85, + "learning_rate": 6.407974942039446e-06, + "logits/chosen": -1.0228804349899292, + "logits/rejected": -0.8406051993370056, + "logps/chosen": -56.747947692871094, + "logps/rejected": -7.8007707595825195, + "loss": 0.0877, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1604249477386475, + "rewards/margins": 2.730121374130249, + "rewards/rejected": 0.4303034842014313, + "step": 5260 + }, + { + "epoch": 0.85, + "learning_rate": 6.406713822906353e-06, + "logits/chosen": -1.066670536994934, + "logits/rejected": -1.106368899345398, + "logps/chosen": -40.970848083496094, + "logps/rejected": -63.07408905029297, + "loss": 0.448, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.118283987045288, + "rewards/margins": 0.8451173305511475, + "rewards/rejected": 2.2731666564941406, + "step": 5261 + }, + { + "epoch": 0.85, + "learning_rate": 6.405452606582647e-06, + "logits/chosen": -1.1005200147628784, + "logits/rejected": -1.094389796257019, + "logps/chosen": -42.77628707885742, + "logps/rejected": -142.65673828125, + "loss": 0.4325, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2266972064971924, + "rewards/margins": 1.744479775428772, + "rewards/rejected": 0.482217401266098, + "step": 5262 + }, + { + "epoch": 0.85, + "learning_rate": 6.404191293155464e-06, + "logits/chosen": -1.2437225580215454, + "logits/rejected": -1.284029245376587, + "logps/chosen": -84.26420593261719, + "logps/rejected": -103.8705062866211, + "loss": 0.8384, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.599801778793335, + "rewards/margins": 0.18082809448242188, + "rewards/rejected": 2.418973684310913, + "step": 5263 + }, + { + "epoch": 0.85, + "learning_rate": 6.402929882711948e-06, + "logits/chosen": -0.89109206199646, + "logits/rejected": -0.9472516775131226, + "logps/chosen": -70.85006713867188, + "logps/rejected": -49.83479690551758, + "loss": 1.3537, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9778663516044617, + "rewards/margins": -2.4536380767822266, + "rewards/rejected": 3.431504487991333, + "step": 5264 + }, + { + "epoch": 0.85, + "learning_rate": 6.401668375339254e-06, + "logits/chosen": -1.229385256767273, + "logits/rejected": -1.1512153148651123, + "logps/chosen": -61.141876220703125, + "logps/rejected": -37.72104263305664, + "loss": 0.5995, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9335793256759644, + "rewards/margins": -0.8209766149520874, + "rewards/rejected": 2.7545559406280518, + "step": 5265 + }, + { + "epoch": 0.85, + "learning_rate": 6.4004067711245366e-06, + "logits/chosen": -1.3682819604873657, + "logits/rejected": -1.3862712383270264, + "logps/chosen": -65.2732162475586, + "logps/rejected": -83.13369750976562, + "loss": 0.7167, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3887977600097656, + "rewards/margins": 0.8893852233886719, + "rewards/rejected": 2.4994125366210938, + "step": 5266 + }, + { + "epoch": 0.85, + "learning_rate": 6.399145070154962e-06, + "logits/chosen": -0.8011634945869446, + "logits/rejected": -0.7603450417518616, + "logps/chosen": -47.020503997802734, + "logps/rejected": -14.028428077697754, + "loss": 0.533, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4902767241001129, + "rewards/margins": -0.5423905849456787, + "rewards/rejected": 1.0326672792434692, + "step": 5267 + }, + { + "epoch": 0.86, + "learning_rate": 6.397883272517702e-06, + "logits/chosen": -1.017651915550232, + "logits/rejected": -0.9593669772148132, + "logps/chosen": -62.34687042236328, + "logps/rejected": -77.77042388916016, + "loss": 1.1951, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4392082691192627, + "rewards/margins": -0.9700310230255127, + "rewards/rejected": 3.4092392921447754, + "step": 5268 + }, + { + "epoch": 0.86, + "learning_rate": 6.3966213782999345e-06, + "logits/chosen": -1.0139195919036865, + "logits/rejected": -0.832181453704834, + "logps/chosen": -98.64863586425781, + "logps/rejected": -62.541259765625, + "loss": 0.0405, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.126997470855713, + "rewards/margins": 2.490374803543091, + "rewards/rejected": 2.636622667312622, + "step": 5269 + }, + { + "epoch": 0.86, + "learning_rate": 6.395359387588845e-06, + "logits/chosen": -1.0370266437530518, + "logits/rejected": -1.0577938556671143, + "logps/chosen": -111.63232421875, + "logps/rejected": -86.71463012695312, + "loss": 0.9659, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.951055884361267, + "rewards/margins": -0.7719749212265015, + "rewards/rejected": 2.7230308055877686, + "step": 5270 + }, + { + "epoch": 0.86, + "learning_rate": 6.394097300471626e-06, + "logits/chosen": -1.1295561790466309, + "logits/rejected": -1.1208258867263794, + "logps/chosen": -69.96910858154297, + "logps/rejected": -120.74394226074219, + "loss": 0.3194, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6425727605819702, + "rewards/margins": 0.4636573791503906, + "rewards/rejected": 1.1789153814315796, + "step": 5271 + }, + { + "epoch": 0.86, + "learning_rate": 6.392835117035472e-06, + "logits/chosen": -1.3161238431930542, + "logits/rejected": -1.261474370956421, + "logps/chosen": -74.77200317382812, + "logps/rejected": -81.94447326660156, + "loss": 0.2194, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5980675220489502, + "rewards/margins": 0.5981010794639587, + "rewards/rejected": 0.9999664425849915, + "step": 5272 + }, + { + "epoch": 0.86, + "learning_rate": 6.391572837367592e-06, + "logits/chosen": -1.27008855342865, + "logits/rejected": -1.4605334997177124, + "logps/chosen": -69.24388122558594, + "logps/rejected": -143.6961669921875, + "loss": 3.6116, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.989933729171753, + "rewards/margins": -6.700940132141113, + "rewards/rejected": 9.690874099731445, + "step": 5273 + }, + { + "epoch": 0.86, + "learning_rate": 6.3903104615551956e-06, + "logits/chosen": -1.4106223583221436, + "logits/rejected": -1.497868537902832, + "logps/chosen": -99.81698608398438, + "logps/rejected": -93.22279357910156, + "loss": 3.5238, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2739410400390625, + "rewards/margins": -5.3033037185668945, + "rewards/rejected": 8.577244758605957, + "step": 5274 + }, + { + "epoch": 0.86, + "learning_rate": 6.3890479896855026e-06, + "logits/chosen": -1.2838404178619385, + "logits/rejected": -1.4190480709075928, + "logps/chosen": -173.98483276367188, + "logps/rejected": -184.14529418945312, + "loss": 0.4057, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.74043607711792, + "rewards/margins": 0.21202564239501953, + "rewards/rejected": 6.5284104347229, + "step": 5275 + }, + { + "epoch": 0.86, + "learning_rate": 6.387785421845736e-06, + "logits/chosen": -1.1230169534683228, + "logits/rejected": -1.1746097803115845, + "logps/chosen": -136.1503448486328, + "logps/rejected": -119.13427734375, + "loss": 0.1134, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.430418491363525, + "rewards/margins": 2.360530138015747, + "rewards/rejected": 2.0698883533477783, + "step": 5276 + }, + { + "epoch": 0.86, + "learning_rate": 6.3865227581231285e-06, + "logits/chosen": -1.055612564086914, + "logits/rejected": -0.988349974155426, + "logps/chosen": -37.41050720214844, + "logps/rejected": -10.461536407470703, + "loss": 0.1047, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1926324367523193, + "rewards/margins": 1.4767353534698486, + "rewards/rejected": 0.7158970236778259, + "step": 5277 + }, + { + "epoch": 0.86, + "learning_rate": 6.385259998604918e-06, + "logits/chosen": -1.3112653493881226, + "logits/rejected": -1.2811602354049683, + "logps/chosen": -36.47520446777344, + "logps/rejected": -15.249138832092285, + "loss": 0.512, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.746712565422058, + "rewards/margins": 0.8928658962249756, + "rewards/rejected": 0.8538466691970825, + "step": 5278 + }, + { + "epoch": 0.86, + "learning_rate": 6.383997143378349e-06, + "logits/chosen": -0.9833205342292786, + "logits/rejected": -1.0014927387237549, + "logps/chosen": -53.586692810058594, + "logps/rejected": -36.48931121826172, + "loss": 0.5924, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1547935009002686, + "rewards/margins": -0.7634592056274414, + "rewards/rejected": 2.91825270652771, + "step": 5279 + }, + { + "epoch": 0.86, + "learning_rate": 6.382734192530673e-06, + "logits/chosen": -1.176060676574707, + "logits/rejected": -1.190112829208374, + "logps/chosen": -91.013427734375, + "logps/rejected": -56.406864166259766, + "loss": 0.1334, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.607344150543213, + "rewards/margins": 1.1877567768096924, + "rewards/rejected": 1.4195873737335205, + "step": 5280 + }, + { + "epoch": 0.86, + "learning_rate": 6.381471146149147e-06, + "logits/chosen": -1.2322745323181152, + "logits/rejected": -1.1731237173080444, + "logps/chosen": -56.85392379760742, + "logps/rejected": -30.11163330078125, + "loss": 1.3198, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3167980909347534, + "rewards/margins": -0.6054446697235107, + "rewards/rejected": 1.9222427606582642, + "step": 5281 + }, + { + "epoch": 0.86, + "learning_rate": 6.380208004321037e-06, + "logits/chosen": -0.8237420320510864, + "logits/rejected": -0.8147932887077332, + "logps/chosen": -14.587135314941406, + "logps/rejected": -13.576107025146484, + "loss": 0.4015, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6497982144355774, + "rewards/margins": -0.12745189666748047, + "rewards/rejected": 0.7772501111030579, + "step": 5282 + }, + { + "epoch": 0.86, + "learning_rate": 6.378944767133614e-06, + "logits/chosen": -1.1112840175628662, + "logits/rejected": -1.088098168373108, + "logps/chosen": -2.408637285232544, + "logps/rejected": -31.52692985534668, + "loss": 0.7669, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.48357972502708435, + "rewards/margins": -0.37527552247047424, + "rewards/rejected": 0.8588552474975586, + "step": 5283 + }, + { + "epoch": 0.86, + "learning_rate": 6.3776814346741545e-06, + "logits/chosen": -0.9154929518699646, + "logits/rejected": -0.9445756673812866, + "logps/chosen": -45.7641716003418, + "logps/rejected": -37.317325592041016, + "loss": 0.9737, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6548770666122437, + "rewards/margins": 1.440826416015625, + "rewards/rejected": 0.21405068039894104, + "step": 5284 + }, + { + "epoch": 0.86, + "learning_rate": 6.3764180070299446e-06, + "logits/chosen": -0.6826890110969543, + "logits/rejected": -0.6826890110969543, + "logps/chosen": -70.98184204101562, + "logps/rejected": -70.98184204101562, + "loss": 0.8273, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7398826479911804, + "rewards/margins": 0.0, + "rewards/rejected": 0.7398826479911804, + "step": 5285 + }, + { + "epoch": 0.86, + "learning_rate": 6.375154484288273e-06, + "logits/chosen": -0.8297979831695557, + "logits/rejected": -0.8976820707321167, + "logps/chosen": -106.71295928955078, + "logps/rejected": -80.52531433105469, + "loss": 0.8541, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.694748640060425, + "rewards/margins": 1.3717970848083496, + "rewards/rejected": 1.3229515552520752, + "step": 5286 + }, + { + "epoch": 0.86, + "learning_rate": 6.3738908665364384e-06, + "logits/chosen": -1.2399650812149048, + "logits/rejected": -1.270485281944275, + "logps/chosen": -63.44804763793945, + "logps/rejected": -107.75227355957031, + "loss": 1.0481, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.168795347213745, + "rewards/margins": -0.5375814437866211, + "rewards/rejected": 2.706376791000366, + "step": 5287 + }, + { + "epoch": 0.86, + "learning_rate": 6.372627153861745e-06, + "logits/chosen": -1.1734232902526855, + "logits/rejected": -1.4549375772476196, + "logps/chosen": -64.38557434082031, + "logps/rejected": -36.65400695800781, + "loss": 0.3797, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9470573663711548, + "rewards/margins": 1.6443675756454468, + "rewards/rejected": 0.3026897609233856, + "step": 5288 + }, + { + "epoch": 0.86, + "learning_rate": 6.371363346351505e-06, + "logits/chosen": -1.0094808340072632, + "logits/rejected": -1.0094808340072632, + "logps/chosen": -69.13977813720703, + "logps/rejected": -69.13977813720703, + "loss": 0.5633, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9533119201660156, + "rewards/margins": 0.0, + "rewards/rejected": 2.9533119201660156, + "step": 5289 + }, + { + "epoch": 0.86, + "learning_rate": 6.370099444093032e-06, + "logits/chosen": -1.1431514024734497, + "logits/rejected": -1.0080357789993286, + "logps/chosen": -74.18888854980469, + "logps/rejected": -43.553409576416016, + "loss": 0.1882, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8267905712127686, + "rewards/margins": 1.7317988872528076, + "rewards/rejected": 1.094991683959961, + "step": 5290 + }, + { + "epoch": 0.86, + "learning_rate": 6.368835447173651e-06, + "logits/chosen": -1.2884348630905151, + "logits/rejected": -1.4947943687438965, + "logps/chosen": -85.2777099609375, + "logps/rejected": -34.734683990478516, + "loss": 0.5393, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.872755527496338, + "rewards/margins": 2.7399492263793945, + "rewards/rejected": 0.13280640542507172, + "step": 5291 + }, + { + "epoch": 0.86, + "learning_rate": 6.367571355680693e-06, + "logits/chosen": -1.176134467124939, + "logits/rejected": -1.1562206745147705, + "logps/chosen": -72.48863220214844, + "logps/rejected": -85.2198486328125, + "loss": 0.5041, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3916375637054443, + "rewards/margins": 0.4617166519165039, + "rewards/rejected": 2.9299209117889404, + "step": 5292 + }, + { + "epoch": 0.86, + "learning_rate": 6.366307169701495e-06, + "logits/chosen": -1.1081725358963013, + "logits/rejected": -1.1081725358963013, + "logps/chosen": -56.06924057006836, + "logps/rejected": -56.06924057006836, + "loss": 0.3807, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.243875503540039, + "rewards/margins": 0.0, + "rewards/rejected": 3.243875503540039, + "step": 5293 + }, + { + "epoch": 0.86, + "learning_rate": 6.3650428893234e-06, + "logits/chosen": -1.2175413370132446, + "logits/rejected": -1.1334936618804932, + "logps/chosen": -157.22987365722656, + "logps/rejected": -75.32099151611328, + "loss": 0.4056, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.656808376312256, + "rewards/margins": -0.1757822036743164, + "rewards/rejected": 6.832590579986572, + "step": 5294 + }, + { + "epoch": 0.86, + "learning_rate": 6.363778514633756e-06, + "logits/chosen": -1.4156008958816528, + "logits/rejected": -1.2299706935882568, + "logps/chosen": -140.3751678466797, + "logps/rejected": -75.41738891601562, + "loss": 0.0958, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.147438049316406, + "rewards/margins": 1.8506827354431152, + "rewards/rejected": 4.296755313873291, + "step": 5295 + }, + { + "epoch": 0.86, + "learning_rate": 6.362514045719922e-06, + "logits/chosen": -1.229583501815796, + "logits/rejected": -1.1805016994476318, + "logps/chosen": -146.1354522705078, + "logps/rejected": -70.15214538574219, + "loss": 0.2833, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.190582275390625, + "rewards/margins": 0.5281738042831421, + "rewards/rejected": 0.6624084711074829, + "step": 5296 + }, + { + "epoch": 0.86, + "learning_rate": 6.361249482669259e-06, + "logits/chosen": -1.4320530891418457, + "logits/rejected": -1.4320530891418457, + "logps/chosen": -56.027740478515625, + "logps/rejected": -56.027740478515625, + "loss": 1.837, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.766017198562622, + "rewards/margins": 0.0, + "rewards/rejected": 2.766017198562622, + "step": 5297 + }, + { + "epoch": 0.86, + "learning_rate": 6.359984825569138e-06, + "logits/chosen": -1.183762788772583, + "logits/rejected": -1.1737900972366333, + "logps/chosen": -54.260440826416016, + "logps/rejected": -91.89999389648438, + "loss": 0.6662, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7322720289230347, + "rewards/margins": 0.41475343704223633, + "rewards/rejected": 1.3175185918807983, + "step": 5298 + }, + { + "epoch": 0.86, + "learning_rate": 6.358720074506932e-06, + "logits/chosen": -1.4390567541122437, + "logits/rejected": -1.0646525621414185, + "logps/chosen": -50.412811279296875, + "logps/rejected": -47.172691345214844, + "loss": 1.9857, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.497335195541382, + "rewards/margins": -3.825227975845337, + "rewards/rejected": 6.322563171386719, + "step": 5299 + }, + { + "epoch": 0.86, + "learning_rate": 6.357455229570027e-06, + "logits/chosen": -1.2530694007873535, + "logits/rejected": -1.1525864601135254, + "logps/chosen": -120.96464538574219, + "logps/rejected": -41.77571487426758, + "loss": 0.032, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.568568468093872, + "rewards/margins": 3.016894817352295, + "rewards/rejected": 0.5516735315322876, + "step": 5300 + }, + { + "epoch": 0.86, + "learning_rate": 6.356190290845809e-06, + "logits/chosen": -0.956186830997467, + "logits/rejected": -0.962674081325531, + "logps/chosen": -54.846839904785156, + "logps/rejected": -95.18804931640625, + "loss": 0.4803, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2752236127853394, + "rewards/margins": -0.4774589538574219, + "rewards/rejected": 1.7526825666427612, + "step": 5301 + }, + { + "epoch": 0.86, + "learning_rate": 6.354925258421676e-06, + "logits/chosen": -1.2051812410354614, + "logits/rejected": -1.2098948955535889, + "logps/chosen": -5.8338470458984375, + "logps/rejected": -1.8144696950912476, + "loss": 0.7192, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.09303941577672958, + "rewards/margins": -0.26890724897384644, + "rewards/rejected": 0.3619466722011566, + "step": 5302 + }, + { + "epoch": 0.86, + "learning_rate": 6.353660132385027e-06, + "logits/chosen": -1.2379496097564697, + "logits/rejected": -1.2078324556350708, + "logps/chosen": -75.56930541992188, + "logps/rejected": -76.6963119506836, + "loss": 1.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7549331188201904, + "rewards/margins": 0.0664205551147461, + "rewards/rejected": 2.6885125637054443, + "step": 5303 + }, + { + "epoch": 0.86, + "learning_rate": 6.35239491282327e-06, + "logits/chosen": -0.9973737001419067, + "logits/rejected": -0.9773011207580566, + "logps/chosen": -8.423282623291016, + "logps/rejected": -13.27958869934082, + "loss": 0.437, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0130102634429932, + "rewards/margins": 0.08473283052444458, + "rewards/rejected": 0.9282774329185486, + "step": 5304 + }, + { + "epoch": 0.86, + "learning_rate": 6.351129599823822e-06, + "logits/chosen": -1.3262608051300049, + "logits/rejected": -1.2704137563705444, + "logps/chosen": -118.60123443603516, + "logps/rejected": -54.96045684814453, + "loss": 0.3126, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.530019521713257, + "rewards/margins": 0.14160537719726562, + "rewards/rejected": 2.388414144515991, + "step": 5305 + }, + { + "epoch": 0.86, + "learning_rate": 6.349864193474104e-06, + "logits/chosen": -1.3246650695800781, + "logits/rejected": -1.2018475532531738, + "logps/chosen": -77.45445251464844, + "logps/rejected": -33.055625915527344, + "loss": 0.1604, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.228048801422119, + "rewards/margins": 1.000803828239441, + "rewards/rejected": 1.2272449731826782, + "step": 5306 + }, + { + "epoch": 0.86, + "learning_rate": 6.348598693861541e-06, + "logits/chosen": -1.506663203239441, + "logits/rejected": -1.4884178638458252, + "logps/chosen": -89.82389831542969, + "logps/rejected": -97.82463073730469, + "loss": 0.0985, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.594334363937378, + "rewards/margins": 1.526371717453003, + "rewards/rejected": 1.067962646484375, + "step": 5307 + }, + { + "epoch": 0.86, + "learning_rate": 6.34733310107357e-06, + "logits/chosen": -1.304466962814331, + "logits/rejected": -1.3469661474227905, + "logps/chosen": -75.90240478515625, + "logps/rejected": -88.87742614746094, + "loss": 1.4835, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.218644857406616, + "rewards/margins": -2.331063985824585, + "rewards/rejected": 5.549708843231201, + "step": 5308 + }, + { + "epoch": 0.86, + "learning_rate": 6.34606741519763e-06, + "logits/chosen": -2.044374465942383, + "logits/rejected": -1.9424301385879517, + "logps/chosen": -68.84913635253906, + "logps/rejected": -166.3999481201172, + "loss": 2.0577, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.206234931945801, + "rewards/margins": -3.4901061058044434, + "rewards/rejected": 7.696341037750244, + "step": 5309 + }, + { + "epoch": 0.86, + "learning_rate": 6.344801636321168e-06, + "logits/chosen": -1.137794852256775, + "logits/rejected": -1.0521546602249146, + "logps/chosen": -90.83964538574219, + "logps/rejected": -62.1816520690918, + "loss": 0.2472, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.770585775375366, + "rewards/margins": 0.8337277173995972, + "rewards/rejected": 1.936858057975769, + "step": 5310 + }, + { + "epoch": 0.86, + "learning_rate": 6.343535764531639e-06, + "logits/chosen": -1.195733666419983, + "logits/rejected": -1.1604321002960205, + "logps/chosen": -84.189697265625, + "logps/rejected": -59.4857177734375, + "loss": 1.9991, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.234610080718994, + "rewards/margins": -0.57454514503479, + "rewards/rejected": 2.809155225753784, + "step": 5311 + }, + { + "epoch": 0.86, + "learning_rate": 6.3422697999164995e-06, + "logits/chosen": -1.5081268548965454, + "logits/rejected": -1.4902805089950562, + "logps/chosen": -45.49468231201172, + "logps/rejected": -59.470890045166016, + "loss": 0.6601, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1781349182128906, + "rewards/margins": -0.3160405158996582, + "rewards/rejected": 3.494175434112549, + "step": 5312 + }, + { + "epoch": 0.86, + "learning_rate": 6.341003742563219e-06, + "logits/chosen": -1.1482614278793335, + "logits/rejected": -1.162571668624878, + "logps/chosen": -73.9828109741211, + "logps/rejected": -53.788330078125, + "loss": 0.3501, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.481093883514404, + "rewards/margins": 0.037994384765625, + "rewards/rejected": 4.443099498748779, + "step": 5313 + }, + { + "epoch": 0.86, + "learning_rate": 6.3397375925592675e-06, + "logits/chosen": -1.1088896989822388, + "logits/rejected": -1.0932154655456543, + "logps/chosen": -110.29583740234375, + "logps/rejected": -50.44934844970703, + "loss": 0.1779, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4847412109375, + "rewards/margins": 1.0467567443847656, + "rewards/rejected": 1.4379844665527344, + "step": 5314 + }, + { + "epoch": 0.86, + "learning_rate": 6.338471349992124e-06, + "logits/chosen": -0.9859042763710022, + "logits/rejected": -0.8937928080558777, + "logps/chosen": -274.8603515625, + "logps/rejected": -44.8565673828125, + "loss": 0.0761, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.997021675109863, + "rewards/margins": 3.8833823204040527, + "rewards/rejected": 1.1136394739151, + "step": 5315 + }, + { + "epoch": 0.86, + "learning_rate": 6.337205014949277e-06, + "logits/chosen": -1.179707646369934, + "logits/rejected": -1.1264936923980713, + "logps/chosen": -64.03551483154297, + "logps/rejected": -19.176353454589844, + "loss": 0.0611, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5259385108947754, + "rewards/margins": 2.455594778060913, + "rewards/rejected": 0.07034378498792648, + "step": 5316 + }, + { + "epoch": 0.86, + "learning_rate": 6.335938587518216e-06, + "logits/chosen": -1.2190251350402832, + "logits/rejected": -1.1931120157241821, + "logps/chosen": -94.76057434082031, + "logps/rejected": -117.98422241210938, + "loss": 0.0775, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.548016309738159, + "rewards/margins": 3.1552581787109375, + "rewards/rejected": -0.6072418093681335, + "step": 5317 + }, + { + "epoch": 0.86, + "learning_rate": 6.334672067786438e-06, + "logits/chosen": -1.1193550825119019, + "logits/rejected": -1.0969749689102173, + "logps/chosen": -76.23500061035156, + "logps/rejected": -139.1956787109375, + "loss": 0.9839, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.994677782058716, + "rewards/margins": -1.7262651920318604, + "rewards/rejected": 4.720942974090576, + "step": 5318 + }, + { + "epoch": 0.86, + "learning_rate": 6.33340545584145e-06, + "logits/chosen": -1.2014471292495728, + "logits/rejected": -1.234132170677185, + "logps/chosen": -87.15731811523438, + "logps/rejected": -106.32608795166016, + "loss": 0.3586, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5800498723983765, + "rewards/margins": 1.2042754888534546, + "rewards/rejected": 0.3757743835449219, + "step": 5319 + }, + { + "epoch": 0.86, + "learning_rate": 6.332138751770762e-06, + "logits/chosen": -1.353322148323059, + "logits/rejected": -1.3153674602508545, + "logps/chosen": -94.66175842285156, + "logps/rejected": -75.71681213378906, + "loss": 0.7548, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8170883655548096, + "rewards/margins": 0.9674637317657471, + "rewards/rejected": 2.8496246337890625, + "step": 5320 + }, + { + "epoch": 0.86, + "learning_rate": 6.33087195566189e-06, + "logits/chosen": -1.034366488456726, + "logits/rejected": -1.0441861152648926, + "logps/chosen": -23.87784194946289, + "logps/rejected": -24.19474983215332, + "loss": 0.6656, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6036630868911743, + "rewards/margins": -0.08315503597259521, + "rewards/rejected": 0.6868181228637695, + "step": 5321 + }, + { + "epoch": 0.86, + "learning_rate": 6.32960506760236e-06, + "logits/chosen": -1.1818033456802368, + "logits/rejected": -1.1866036653518677, + "logps/chosen": -118.36894989013672, + "logps/rejected": -149.98171997070312, + "loss": 0.2948, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.983100414276123, + "rewards/margins": 3.3089470863342285, + "rewards/rejected": 1.674153208732605, + "step": 5322 + }, + { + "epoch": 0.86, + "learning_rate": 6.3283380876797e-06, + "logits/chosen": -0.5078516006469727, + "logits/rejected": -0.5078516006469727, + "logps/chosen": -6.46918249130249, + "logps/rejected": -6.46918249130249, + "loss": 0.572, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1534780114889145, + "rewards/margins": 0.0, + "rewards/rejected": 0.1534780114889145, + "step": 5323 + }, + { + "epoch": 0.86, + "learning_rate": 6.327071015981447e-06, + "logits/chosen": -1.3396921157836914, + "logits/rejected": -1.3416426181793213, + "logps/chosen": -106.99176788330078, + "logps/rejected": -202.735595703125, + "loss": 0.8861, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.349350929260254, + "rewards/margins": -1.323540210723877, + "rewards/rejected": 6.672891139984131, + "step": 5324 + }, + { + "epoch": 0.86, + "learning_rate": 6.325803852595144e-06, + "logits/chosen": -1.1736290454864502, + "logits/rejected": -1.1210333108901978, + "logps/chosen": -38.770347595214844, + "logps/rejected": -14.16828727722168, + "loss": 0.6704, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8229705691337585, + "rewards/margins": 0.09328246116638184, + "rewards/rejected": 0.7296881079673767, + "step": 5325 + }, + { + "epoch": 0.86, + "learning_rate": 6.32453659760834e-06, + "logits/chosen": -0.9917718172073364, + "logits/rejected": -0.9587726593017578, + "logps/chosen": -52.24079895019531, + "logps/rejected": -139.24343872070312, + "loss": 0.2615, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2867554426193237, + "rewards/margins": 0.3781784772872925, + "rewards/rejected": 0.9085769653320312, + "step": 5326 + }, + { + "epoch": 0.86, + "learning_rate": 6.323269251108588e-06, + "logits/chosen": -1.1299363374710083, + "logits/rejected": -1.1225378513336182, + "logps/chosen": -37.75016784667969, + "logps/rejected": -91.76384735107422, + "loss": 0.6394, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.616529941558838, + "rewards/margins": -0.520179033279419, + "rewards/rejected": 3.136708974838257, + "step": 5327 + }, + { + "epoch": 0.86, + "learning_rate": 6.322001813183454e-06, + "logits/chosen": -1.1272461414337158, + "logits/rejected": -1.0478007793426514, + "logps/chosen": -79.68853759765625, + "logps/rejected": -37.43874740600586, + "loss": 0.1702, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.14350962638855, + "rewards/margins": 1.0574740171432495, + "rewards/rejected": 1.0860356092453003, + "step": 5328 + }, + { + "epoch": 0.86, + "learning_rate": 6.320734283920502e-06, + "logits/chosen": -1.104312539100647, + "logits/rejected": -1.1523700952529907, + "logps/chosen": -151.5079803466797, + "logps/rejected": -131.34381103515625, + "loss": 2.1027, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.564113140106201, + "rewards/margins": -3.4525084495544434, + "rewards/rejected": 8.016621589660645, + "step": 5329 + }, + { + "epoch": 0.87, + "learning_rate": 6.319466663407309e-06, + "logits/chosen": -1.3616656064987183, + "logits/rejected": -1.2764863967895508, + "logps/chosen": -103.08058166503906, + "logps/rejected": -68.0003890991211, + "loss": 0.781, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6960693597793579, + "rewards/margins": -0.6482284069061279, + "rewards/rejected": 1.3442977666854858, + "step": 5330 + }, + { + "epoch": 0.87, + "learning_rate": 6.318198951731454e-06, + "logits/chosen": -1.349355697631836, + "logits/rejected": -1.3142008781433105, + "logps/chosen": -81.88652038574219, + "logps/rejected": -91.40577697753906, + "loss": 0.2358, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.301311016082764, + "rewards/margins": 0.5156984329223633, + "rewards/rejected": 3.7856125831604004, + "step": 5331 + }, + { + "epoch": 0.87, + "learning_rate": 6.316931148980523e-06, + "logits/chosen": -1.213155746459961, + "logits/rejected": -1.304892897605896, + "logps/chosen": -74.646240234375, + "logps/rejected": -99.89664459228516, + "loss": 2.0031, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1821229457855225, + "rewards/margins": -3.030733346939087, + "rewards/rejected": 6.212856292724609, + "step": 5332 + }, + { + "epoch": 0.87, + "learning_rate": 6.315663255242112e-06, + "logits/chosen": -1.126015305519104, + "logits/rejected": -0.9895181655883789, + "logps/chosen": -89.98202514648438, + "logps/rejected": -79.20661926269531, + "loss": 0.6139, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4574600458145142, + "rewards/margins": -0.3022186756134033, + "rewards/rejected": 1.7596787214279175, + "step": 5333 + }, + { + "epoch": 0.87, + "learning_rate": 6.314395270603819e-06, + "logits/chosen": -0.8635444045066833, + "logits/rejected": -0.9313715100288391, + "logps/chosen": -66.28054809570312, + "logps/rejected": -139.6968994140625, + "loss": 1.0735, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2466217279434204, + "rewards/margins": -0.6812041997909546, + "rewards/rejected": 1.927825927734375, + "step": 5334 + }, + { + "epoch": 0.87, + "learning_rate": 6.313127195153248e-06, + "logits/chosen": -1.3211792707443237, + "logits/rejected": -1.414110541343689, + "logps/chosen": -149.9324951171875, + "logps/rejected": -95.89810180664062, + "loss": 0.5944, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.813321113586426, + "rewards/margins": -0.21203279495239258, + "rewards/rejected": 5.025353908538818, + "step": 5335 + }, + { + "epoch": 0.87, + "learning_rate": 6.311859028978014e-06, + "logits/chosen": -0.5848618745803833, + "logits/rejected": -0.6386849880218506, + "logps/chosen": -56.114871978759766, + "logps/rejected": -107.49346160888672, + "loss": 0.6494, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5133999586105347, + "rewards/margins": 0.6492230296134949, + "rewards/rejected": 0.8641769289970398, + "step": 5336 + }, + { + "epoch": 0.87, + "learning_rate": 6.310590772165735e-06, + "logits/chosen": -1.0049335956573486, + "logits/rejected": -0.9824373722076416, + "logps/chosen": -103.21630859375, + "logps/rejected": -62.68384552001953, + "loss": 0.7456, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7001144289970398, + "rewards/margins": -1.156315565109253, + "rewards/rejected": 1.8564300537109375, + "step": 5337 + }, + { + "epoch": 0.87, + "learning_rate": 6.309322424804034e-06, + "logits/chosen": -0.7267642617225647, + "logits/rejected": -0.7048629522323608, + "logps/chosen": -63.347259521484375, + "logps/rejected": -62.43388748168945, + "loss": 0.9312, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9284194707870483, + "rewards/margins": -0.3619626760482788, + "rewards/rejected": 2.290382146835327, + "step": 5338 + }, + { + "epoch": 0.87, + "learning_rate": 6.308053986980543e-06, + "logits/chosen": -0.971099317073822, + "logits/rejected": -0.882499635219574, + "logps/chosen": -79.13131713867188, + "logps/rejected": -67.26563262939453, + "loss": 0.1756, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5295883417129517, + "rewards/margins": 0.8784691095352173, + "rewards/rejected": 0.6511192321777344, + "step": 5339 + }, + { + "epoch": 0.87, + "learning_rate": 6.306785458782897e-06, + "logits/chosen": -1.058092713356018, + "logits/rejected": -1.021157145500183, + "logps/chosen": -54.83894348144531, + "logps/rejected": -60.483909606933594, + "loss": 0.5991, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.254364013671875, + "rewards/margins": -0.2648826837539673, + "rewards/rejected": 1.5192466974258423, + "step": 5340 + }, + { + "epoch": 0.87, + "learning_rate": 6.305516840298741e-06, + "logits/chosen": -1.0506898164749146, + "logits/rejected": -1.0290241241455078, + "logps/chosen": -92.87345123291016, + "logps/rejected": -120.95117950439453, + "loss": 0.3203, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5859017372131348, + "rewards/margins": 2.2212204933166504, + "rewards/rejected": 0.3646812438964844, + "step": 5341 + }, + { + "epoch": 0.87, + "learning_rate": 6.304248131615724e-06, + "logits/chosen": -0.5239275097846985, + "logits/rejected": -0.530059814453125, + "logps/chosen": -5.217805862426758, + "logps/rejected": -1.215441346168518, + "loss": 1.2045, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.23691320419311523, + "rewards/margins": -0.009327933192253113, + "rewards/rejected": 0.24624113738536835, + "step": 5342 + }, + { + "epoch": 0.87, + "learning_rate": 6.302979332821504e-06, + "logits/chosen": -1.0760352611541748, + "logits/rejected": -0.6105297803878784, + "logps/chosen": -92.43316650390625, + "logps/rejected": -70.99237823486328, + "loss": 0.975, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8252975940704346, + "rewards/margins": -1.5098671913146973, + "rewards/rejected": 3.335164785385132, + "step": 5343 + }, + { + "epoch": 0.87, + "learning_rate": 6.301710444003739e-06, + "logits/chosen": -1.095595121383667, + "logits/rejected": -1.0681838989257812, + "logps/chosen": -118.05743408203125, + "logps/rejected": -70.97186279296875, + "loss": 1.7796, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.401648044586182, + "rewards/margins": 2.250257968902588, + "rewards/rejected": 3.1513900756835938, + "step": 5344 + }, + { + "epoch": 0.87, + "learning_rate": 6.3004414652501e-06, + "logits/chosen": -0.9776837229728699, + "logits/rejected": -0.9108090400695801, + "logps/chosen": -56.435325622558594, + "logps/rejected": -52.02642822265625, + "loss": 1.1073, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.238813042640686, + "rewards/margins": 0.04394030570983887, + "rewards/rejected": 1.1948727369308472, + "step": 5345 + }, + { + "epoch": 0.87, + "learning_rate": 6.29917239664826e-06, + "logits/chosen": -1.3323765993118286, + "logits/rejected": -1.289103627204895, + "logps/chosen": -64.0053482055664, + "logps/rejected": -80.89520263671875, + "loss": 0.7521, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.767622470855713, + "rewards/margins": -0.22320246696472168, + "rewards/rejected": 2.9908249378204346, + "step": 5346 + }, + { + "epoch": 0.87, + "learning_rate": 6.2979032382859e-06, + "logits/chosen": -1.4013481140136719, + "logits/rejected": -1.3905930519104004, + "logps/chosen": -55.20136260986328, + "logps/rejected": -55.994503021240234, + "loss": 0.4288, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6807419061660767, + "rewards/margins": -0.09040218591690063, + "rewards/rejected": 0.7711440920829773, + "step": 5347 + }, + { + "epoch": 0.87, + "learning_rate": 6.296633990250709e-06, + "logits/chosen": -1.1882623434066772, + "logits/rejected": -1.3004792928695679, + "logps/chosen": -34.422584533691406, + "logps/rejected": -126.87286376953125, + "loss": 2.1189, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2442986965179443, + "rewards/margins": -4.17777156829834, + "rewards/rejected": 6.422070503234863, + "step": 5348 + }, + { + "epoch": 0.87, + "learning_rate": 6.295364652630377e-06, + "logits/chosen": -0.9390665292739868, + "logits/rejected": -0.9214438199996948, + "logps/chosen": -64.43314361572266, + "logps/rejected": -75.87281799316406, + "loss": 0.2412, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9343757629394531, + "rewards/margins": 0.49608612060546875, + "rewards/rejected": 1.4382896423339844, + "step": 5349 + }, + { + "epoch": 0.87, + "learning_rate": 6.294095225512604e-06, + "logits/chosen": -1.4790235757827759, + "logits/rejected": -1.449308156967163, + "logps/chosen": -71.21417236328125, + "logps/rejected": -82.6268310546875, + "loss": 1.1624, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.468364238739014, + "rewards/margins": -1.954540729522705, + "rewards/rejected": 6.422904968261719, + "step": 5350 + }, + { + "epoch": 0.87, + "learning_rate": 6.2928257089850966e-06, + "logits/chosen": -1.0979273319244385, + "logits/rejected": -1.0530757904052734, + "logps/chosen": -72.12040710449219, + "logps/rejected": -60.52626419067383, + "loss": 0.2972, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4921271800994873, + "rewards/margins": 0.29592323303222656, + "rewards/rejected": 2.1962039470672607, + "step": 5351 + }, + { + "epoch": 0.87, + "learning_rate": 6.291556103135566e-06, + "logits/chosen": -1.0557317733764648, + "logits/rejected": -1.0147478580474854, + "logps/chosen": -91.3665542602539, + "logps/rejected": -82.43081665039062, + "loss": 0.9333, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.075695037841797, + "rewards/margins": -0.15535211563110352, + "rewards/rejected": 4.2310471534729, + "step": 5352 + }, + { + "epoch": 0.87, + "learning_rate": 6.290286408051727e-06, + "logits/chosen": -1.2489349842071533, + "logits/rejected": -1.1925736665725708, + "logps/chosen": -58.10605239868164, + "logps/rejected": -24.518312454223633, + "loss": 0.3255, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5523784756660461, + "rewards/margins": 0.1779354214668274, + "rewards/rejected": 0.37444305419921875, + "step": 5353 + }, + { + "epoch": 0.87, + "learning_rate": 6.289016623821308e-06, + "logits/chosen": -1.2751927375793457, + "logits/rejected": -1.124765157699585, + "logps/chosen": -81.58016204833984, + "logps/rejected": -72.39035034179688, + "loss": 0.5134, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9799880981445312, + "rewards/margins": 1.1966956853866577, + "rewards/rejected": 1.7832924127578735, + "step": 5354 + }, + { + "epoch": 0.87, + "learning_rate": 6.287746750532037e-06, + "logits/chosen": -0.6343741416931152, + "logits/rejected": -0.5551007390022278, + "logps/chosen": -36.12485885620117, + "logps/rejected": -13.543915748596191, + "loss": 0.4925, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.098043441772461, + "rewards/margins": 1.5058975219726562, + "rewards/rejected": 0.5921458601951599, + "step": 5355 + }, + { + "epoch": 0.87, + "learning_rate": 6.286476788271649e-06, + "logits/chosen": -1.0919846296310425, + "logits/rejected": -1.0808861255645752, + "logps/chosen": -122.9798583984375, + "logps/rejected": -98.21858215332031, + "loss": 1.6107, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.141444444656372, + "rewards/margins": -3.1711394786834717, + "rewards/rejected": 6.312583923339844, + "step": 5356 + }, + { + "epoch": 0.87, + "learning_rate": 6.285206737127889e-06, + "logits/chosen": -1.4342788457870483, + "logits/rejected": -1.4167166948318481, + "logps/chosen": -93.25578308105469, + "logps/rejected": -116.31788635253906, + "loss": 0.5078, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4583930969238281, + "rewards/margins": -0.3614799976348877, + "rewards/rejected": 1.8198730945587158, + "step": 5357 + }, + { + "epoch": 0.87, + "learning_rate": 6.2839365971885036e-06, + "logits/chosen": -1.1912777423858643, + "logits/rejected": -1.1768571138381958, + "logps/chosen": -135.87681579589844, + "logps/rejected": -96.18290710449219, + "loss": 1.1097, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.424656629562378, + "rewards/margins": 1.0849037170410156, + "rewards/rejected": 2.3397529125213623, + "step": 5358 + }, + { + "epoch": 0.87, + "learning_rate": 6.282666368541251e-06, + "logits/chosen": -0.5848726034164429, + "logits/rejected": -0.5973520278930664, + "logps/chosen": -2.638118267059326, + "logps/rejected": -24.57818031311035, + "loss": 0.6589, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.11381568759679794, + "rewards/margins": -0.928936243057251, + "rewards/rejected": 1.0427519083023071, + "step": 5359 + }, + { + "epoch": 0.87, + "learning_rate": 6.281396051273885e-06, + "logits/chosen": -0.805728554725647, + "logits/rejected": -0.805728554725647, + "logps/chosen": -23.791322708129883, + "logps/rejected": -23.791322708129883, + "loss": 2.1907, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.215341091156006, + "rewards/margins": 0.0, + "rewards/rejected": 2.215341091156006, + "step": 5360 + }, + { + "epoch": 0.87, + "learning_rate": 6.2801256454741775e-06, + "logits/chosen": -1.1568769216537476, + "logits/rejected": -1.1632845401763916, + "logps/chosen": -57.88585662841797, + "logps/rejected": -90.92868041992188, + "loss": 3.4573, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.818838596343994, + "rewards/margins": -0.5993285179138184, + "rewards/rejected": 3.4181671142578125, + "step": 5361 + }, + { + "epoch": 0.87, + "learning_rate": 6.2788551512299014e-06, + "logits/chosen": -1.2606006860733032, + "logits/rejected": -1.106482982635498, + "logps/chosen": -187.77487182617188, + "logps/rejected": -106.48162078857422, + "loss": 0.2588, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.830807685852051, + "rewards/margins": 2.4219460487365723, + "rewards/rejected": 5.4088616371154785, + "step": 5362 + }, + { + "epoch": 0.87, + "learning_rate": 6.277584568628834e-06, + "logits/chosen": -1.1228636503219604, + "logits/rejected": -1.107581615447998, + "logps/chosen": -75.45323944091797, + "logps/rejected": -35.82872009277344, + "loss": 0.6578, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.900134325027466, + "rewards/margins": -0.8009850978851318, + "rewards/rejected": 4.701119422912598, + "step": 5363 + }, + { + "epoch": 0.87, + "learning_rate": 6.276313897758761e-06, + "logits/chosen": -1.1731631755828857, + "logits/rejected": -1.099088191986084, + "logps/chosen": -100.09587860107422, + "logps/rejected": -161.4637451171875, + "loss": 0.7165, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5691070556640625, + "rewards/margins": -1.0150115489959717, + "rewards/rejected": 2.584118604660034, + "step": 5364 + }, + { + "epoch": 0.87, + "learning_rate": 6.2750431387074754e-06, + "logits/chosen": -0.7922354340553284, + "logits/rejected": -0.9775245785713196, + "logps/chosen": -57.074485778808594, + "logps/rejected": -87.03672790527344, + "loss": 2.3781, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9407700300216675, + "rewards/margins": -3.2006421089172363, + "rewards/rejected": 5.141412258148193, + "step": 5365 + }, + { + "epoch": 0.87, + "learning_rate": 6.273772291562774e-06, + "logits/chosen": -1.186340570449829, + "logits/rejected": -1.1755677461624146, + "logps/chosen": -83.07164001464844, + "logps/rejected": -54.818641662597656, + "loss": 0.6095, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5571649074554443, + "rewards/margins": 0.05226898193359375, + "rewards/rejected": 2.5048959255218506, + "step": 5366 + }, + { + "epoch": 0.87, + "learning_rate": 6.272501356412459e-06, + "logits/chosen": -1.1991673707962036, + "logits/rejected": -1.0836466550827026, + "logps/chosen": -86.93949890136719, + "logps/rejected": -59.262813568115234, + "loss": 0.0426, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.5471649169921875, + "rewards/margins": 4.054214000701904, + "rewards/rejected": 1.4929507970809937, + "step": 5367 + }, + { + "epoch": 0.87, + "learning_rate": 6.27123033334434e-06, + "logits/chosen": -1.1955609321594238, + "logits/rejected": -1.2144365310668945, + "logps/chosen": -77.69340515136719, + "logps/rejected": -51.440826416015625, + "loss": 0.6167, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.661872148513794, + "rewards/margins": -0.28813397884368896, + "rewards/rejected": 1.950006127357483, + "step": 5368 + }, + { + "epoch": 0.87, + "learning_rate": 6.269959222446235e-06, + "logits/chosen": -0.8990032076835632, + "logits/rejected": -0.8990032076835632, + "logps/chosen": -27.238618850708008, + "logps/rejected": -27.238618850708008, + "loss": 0.8529, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7150697708129883, + "rewards/margins": 0.0, + "rewards/rejected": 1.7150697708129883, + "step": 5369 + }, + { + "epoch": 0.87, + "learning_rate": 6.268688023805965e-06, + "logits/chosen": -1.3302854299545288, + "logits/rejected": -1.3240457773208618, + "logps/chosen": -188.62518310546875, + "logps/rejected": -45.542388916015625, + "loss": 0.2083, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.5274553298950195, + "rewards/margins": 3.488424777984619, + "rewards/rejected": 3.0390305519104004, + "step": 5370 + }, + { + "epoch": 0.87, + "learning_rate": 6.267416737511355e-06, + "logits/chosen": -0.995670735836029, + "logits/rejected": -0.9154911637306213, + "logps/chosen": -56.14313507080078, + "logps/rejected": -46.94940948486328, + "loss": 0.4588, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6060676574707031, + "rewards/margins": -0.3482459783554077, + "rewards/rejected": 1.9543136358261108, + "step": 5371 + }, + { + "epoch": 0.87, + "learning_rate": 6.266145363650241e-06, + "logits/chosen": -1.2558696269989014, + "logits/rejected": -1.2435306310653687, + "logps/chosen": -52.01869201660156, + "logps/rejected": -102.3635025024414, + "loss": 0.0954, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1210923194885254, + "rewards/margins": 1.5731209516525269, + "rewards/rejected": 0.5479713678359985, + "step": 5372 + }, + { + "epoch": 0.87, + "learning_rate": 6.264873902310463e-06, + "logits/chosen": -1.1452146768569946, + "logits/rejected": -1.0800385475158691, + "logps/chosen": -111.50588989257812, + "logps/rejected": -68.32432556152344, + "loss": 0.171, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.337139844894409, + "rewards/margins": 1.3895224332809448, + "rewards/rejected": 1.9476174116134644, + "step": 5373 + }, + { + "epoch": 0.87, + "learning_rate": 6.263602353579868e-06, + "logits/chosen": -0.6128652691841125, + "logits/rejected": -0.5871230959892273, + "logps/chosen": -27.44947624206543, + "logps/rejected": -20.43069839477539, + "loss": 0.3303, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.40957772731781, + "rewards/margins": 1.7045984268188477, + "rewards/rejected": -0.2950206696987152, + "step": 5374 + }, + { + "epoch": 0.87, + "learning_rate": 6.262330717546305e-06, + "logits/chosen": -1.0064767599105835, + "logits/rejected": -0.9023069739341736, + "logps/chosen": -73.70852661132812, + "logps/rejected": -28.668048858642578, + "loss": 0.2816, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5031068325042725, + "rewards/margins": 1.24650239944458, + "rewards/rejected": 1.2566044330596924, + "step": 5375 + }, + { + "epoch": 0.87, + "learning_rate": 6.261058994297634e-06, + "logits/chosen": -1.2234790325164795, + "logits/rejected": -1.2081596851348877, + "logps/chosen": -42.42718505859375, + "logps/rejected": -103.70808410644531, + "loss": 0.6113, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.714141845703125, + "rewards/margins": -0.1802995204925537, + "rewards/rejected": 2.8944413661956787, + "step": 5376 + }, + { + "epoch": 0.87, + "learning_rate": 6.25978718392172e-06, + "logits/chosen": -1.1132349967956543, + "logits/rejected": -1.145544409751892, + "logps/chosen": -91.59808349609375, + "logps/rejected": -111.59005737304688, + "loss": 1.3403, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7176742553710938, + "rewards/margins": -2.3931198120117188, + "rewards/rejected": 4.1107940673828125, + "step": 5377 + }, + { + "epoch": 0.87, + "learning_rate": 6.25851528650643e-06, + "logits/chosen": -0.923190712928772, + "logits/rejected": -0.923190712928772, + "logps/chosen": -9.33716869354248, + "logps/rejected": -9.33716869354248, + "loss": 0.35, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8689813017845154, + "rewards/margins": 0.0, + "rewards/rejected": 0.8689813017845154, + "step": 5378 + }, + { + "epoch": 0.87, + "learning_rate": 6.2572433021396424e-06, + "logits/chosen": -1.0449846982955933, + "logits/rejected": -1.0293456315994263, + "logps/chosen": -37.63045120239258, + "logps/rejected": -58.59543991088867, + "loss": 0.4744, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9879894256591797, + "rewards/margins": -0.1327049732208252, + "rewards/rejected": 2.120694398880005, + "step": 5379 + }, + { + "epoch": 0.87, + "learning_rate": 6.255971230909239e-06, + "logits/chosen": -0.713793158531189, + "logits/rejected": -0.7130216360092163, + "logps/chosen": -9.225738525390625, + "logps/rejected": -3.062317371368408, + "loss": 0.5897, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1790420562028885, + "rewards/margins": -0.056050628423690796, + "rewards/rejected": 0.23509268462657928, + "step": 5380 + }, + { + "epoch": 0.87, + "learning_rate": 6.254699072903108e-06, + "logits/chosen": -1.1742944717407227, + "logits/rejected": -1.2120312452316284, + "logps/chosen": -90.6382064819336, + "logps/rejected": -79.83551025390625, + "loss": 0.6139, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.408480167388916, + "rewards/margins": -0.7509644031524658, + "rewards/rejected": 3.159444570541382, + "step": 5381 + }, + { + "epoch": 0.87, + "learning_rate": 6.253426828209144e-06, + "logits/chosen": -1.0983858108520508, + "logits/rejected": -0.8103745579719543, + "logps/chosen": -104.91390228271484, + "logps/rejected": -30.22539520263672, + "loss": 0.2111, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.982247352600098, + "rewards/margins": 4.476541996002197, + "rewards/rejected": 1.5057052373886108, + "step": 5382 + }, + { + "epoch": 0.87, + "learning_rate": 6.252154496915244e-06, + "logits/chosen": -1.6622552871704102, + "logits/rejected": -1.4768617153167725, + "logps/chosen": -142.65383911132812, + "logps/rejected": -97.24242401123047, + "loss": 0.0608, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.8124680519104, + "rewards/margins": 2.1732873916625977, + "rewards/rejected": 4.639180660247803, + "step": 5383 + }, + { + "epoch": 0.87, + "learning_rate": 6.250882079109317e-06, + "logits/chosen": -1.1819745302200317, + "logits/rejected": -1.221521019935608, + "logps/chosen": -119.04881286621094, + "logps/rejected": -123.25949096679688, + "loss": 1.206, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.76631498336792, + "rewards/margins": -0.42876577377319336, + "rewards/rejected": 7.195080757141113, + "step": 5384 + }, + { + "epoch": 0.87, + "learning_rate": 6.249609574879275e-06, + "logits/chosen": -1.1619021892547607, + "logits/rejected": -1.107920527458191, + "logps/chosen": -89.5145263671875, + "logps/rejected": -83.07049560546875, + "loss": 0.4501, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6188035011291504, + "rewards/margins": -0.37088918685913086, + "rewards/rejected": 2.9896926879882812, + "step": 5385 + }, + { + "epoch": 0.87, + "learning_rate": 6.248336984313035e-06, + "logits/chosen": -1.279630184173584, + "logits/rejected": -1.2421904802322388, + "logps/chosen": -130.73268127441406, + "logps/rejected": -67.86320495605469, + "loss": 1.3256, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6498794555664062, + "rewards/margins": 0.05592036247253418, + "rewards/rejected": 2.593959093093872, + "step": 5386 + }, + { + "epoch": 0.87, + "learning_rate": 6.247064307498521e-06, + "logits/chosen": -1.4194895029067993, + "logits/rejected": -1.343213438987732, + "logps/chosen": -157.6741180419922, + "logps/rejected": -93.00244903564453, + "loss": 0.1146, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.831331253051758, + "rewards/margins": 1.3960809707641602, + "rewards/rejected": 7.435250282287598, + "step": 5387 + }, + { + "epoch": 0.87, + "learning_rate": 6.245791544523664e-06, + "logits/chosen": -1.1237564086914062, + "logits/rejected": -1.1919996738433838, + "logps/chosen": -54.686790466308594, + "logps/rejected": -78.1131591796875, + "loss": 2.0021, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.303401231765747, + "rewards/margins": -2.908451795578003, + "rewards/rejected": 4.21185302734375, + "step": 5388 + }, + { + "epoch": 0.87, + "learning_rate": 6.244518695476398e-06, + "logits/chosen": -1.0455635786056519, + "logits/rejected": -1.036001443862915, + "logps/chosen": -68.4468765258789, + "logps/rejected": -61.842933654785156, + "loss": 0.7771, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5384072065353394, + "rewards/margins": -0.7397719621658325, + "rewards/rejected": 2.278179168701172, + "step": 5389 + }, + { + "epoch": 0.87, + "learning_rate": 6.243245760444666e-06, + "logits/chosen": -1.372559905052185, + "logits/rejected": -1.391039252281189, + "logps/chosen": -58.68482971191406, + "logps/rejected": -63.53594970703125, + "loss": 0.4839, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1701323986053467, + "rewards/margins": 0.17811119556427002, + "rewards/rejected": 1.9920212030410767, + "step": 5390 + }, + { + "epoch": 0.88, + "learning_rate": 6.2419727395164165e-06, + "logits/chosen": -1.4236055612564087, + "logits/rejected": -1.5440558195114136, + "logps/chosen": -93.27264404296875, + "logps/rejected": -36.32383728027344, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9760193824768066, + "rewards/margins": 3.743985176086426, + "rewards/rejected": 0.23203431069850922, + "step": 5391 + }, + { + "epoch": 0.88, + "learning_rate": 6.240699632779602e-06, + "logits/chosen": -1.2282280921936035, + "logits/rejected": -1.1693896055221558, + "logps/chosen": -120.06084442138672, + "logps/rejected": -103.67535400390625, + "loss": 0.6759, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.464515209197998, + "rewards/margins": -0.961514949798584, + "rewards/rejected": 5.426030158996582, + "step": 5392 + }, + { + "epoch": 0.88, + "learning_rate": 6.239426440322183e-06, + "logits/chosen": -0.9300085306167603, + "logits/rejected": -0.9167983531951904, + "logps/chosen": -78.65769958496094, + "logps/rejected": -44.797943115234375, + "loss": 0.4553, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8289612531661987, + "rewards/margins": -0.37690889835357666, + "rewards/rejected": 2.2058701515197754, + "step": 5393 + }, + { + "epoch": 0.88, + "learning_rate": 6.2381531622321234e-06, + "logits/chosen": -1.1038436889648438, + "logits/rejected": -0.9479095935821533, + "logps/chosen": -52.83108901977539, + "logps/rejected": -37.108394622802734, + "loss": 0.736, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.311814546585083, + "rewards/margins": -1.1442039012908936, + "rewards/rejected": 3.4560184478759766, + "step": 5394 + }, + { + "epoch": 0.88, + "learning_rate": 6.236879798597396e-06, + "logits/chosen": -1.3871005773544312, + "logits/rejected": -1.3997437953948975, + "logps/chosen": -151.2539520263672, + "logps/rejected": -123.14642333984375, + "loss": 0.7687, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.785606384277344, + "rewards/margins": -1.114363193511963, + "rewards/rejected": 5.899969577789307, + "step": 5395 + }, + { + "epoch": 0.88, + "learning_rate": 6.235606349505978e-06, + "logits/chosen": -1.1065751314163208, + "logits/rejected": -1.1263707876205444, + "logps/chosen": -83.72288513183594, + "logps/rejected": -79.51473999023438, + "loss": 1.4555, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1291763335466385, + "rewards/margins": -2.622441053390503, + "rewards/rejected": 2.751617431640625, + "step": 5396 + }, + { + "epoch": 0.88, + "learning_rate": 6.234332815045852e-06, + "logits/chosen": -1.2945868968963623, + "logits/rejected": -1.0505822896957397, + "logps/chosen": -62.97256851196289, + "logps/rejected": -216.52197265625, + "loss": 2.5536, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6606403589248657, + "rewards/margins": -5.035038471221924, + "rewards/rejected": 6.6956787109375, + "step": 5397 + }, + { + "epoch": 0.88, + "learning_rate": 6.23305919530501e-06, + "logits/chosen": -1.1259909868240356, + "logits/rejected": -0.9613581299781799, + "logps/chosen": -66.28059387207031, + "logps/rejected": -13.30290412902832, + "loss": 0.0354, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.823094129562378, + "rewards/margins": 3.386129379272461, + "rewards/rejected": 0.43696480989456177, + "step": 5398 + }, + { + "epoch": 0.88, + "learning_rate": 6.2317854903714425e-06, + "logits/chosen": -1.246683120727539, + "logits/rejected": -1.1550172567367554, + "logps/chosen": -88.92962646484375, + "logps/rejected": -83.13728332519531, + "loss": 1.0533, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.3056182861328125, + "rewards/margins": -0.17084646224975586, + "rewards/rejected": 5.476464748382568, + "step": 5399 + }, + { + "epoch": 0.88, + "learning_rate": 6.230511700333154e-06, + "logits/chosen": -1.11933434009552, + "logits/rejected": -1.1298686265945435, + "logps/chosen": -142.702392578125, + "logps/rejected": -54.236087799072266, + "loss": 0.4901, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8392274379730225, + "rewards/margins": 1.1014608144760132, + "rewards/rejected": 1.7377666234970093, + "step": 5400 + }, + { + "epoch": 0.88, + "learning_rate": 6.22923782527815e-06, + "logits/chosen": -0.6808648109436035, + "logits/rejected": -0.5779093503952026, + "logps/chosen": -58.416839599609375, + "logps/rejected": -57.067726135253906, + "loss": 0.3407, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.567059278488159, + "rewards/margins": 0.2503218650817871, + "rewards/rejected": 2.316737413406372, + "step": 5401 + }, + { + "epoch": 0.88, + "learning_rate": 6.227963865294444e-06, + "logits/chosen": -0.976733386516571, + "logits/rejected": -0.9145879149436951, + "logps/chosen": -35.4676399230957, + "logps/rejected": -42.13747024536133, + "loss": 0.2857, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.269815444946289, + "rewards/margins": 0.28016549348831177, + "rewards/rejected": 0.9896499514579773, + "step": 5402 + }, + { + "epoch": 0.88, + "learning_rate": 6.226689820470053e-06, + "logits/chosen": -0.9736401438713074, + "logits/rejected": -0.9407894611358643, + "logps/chosen": -85.05682373046875, + "logps/rejected": -127.9119873046875, + "loss": 0.1551, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.558164358139038, + "rewards/margins": 1.2314095497131348, + "rewards/rejected": 1.3267548084259033, + "step": 5403 + }, + { + "epoch": 0.88, + "learning_rate": 6.2254156908930045e-06, + "logits/chosen": -1.0182104110717773, + "logits/rejected": -0.9676680564880371, + "logps/chosen": -79.36839294433594, + "logps/rejected": -55.809059143066406, + "loss": 0.2877, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9165672063827515, + "rewards/margins": 0.9302497506141663, + "rewards/rejected": 0.9863174557685852, + "step": 5404 + }, + { + "epoch": 0.88, + "learning_rate": 6.2241414766513255e-06, + "logits/chosen": -0.9652674198150635, + "logits/rejected": -0.931384265422821, + "logps/chosen": -69.92916870117188, + "logps/rejected": -57.95195770263672, + "loss": 0.3211, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.398766279220581, + "rewards/margins": 0.7260222434997559, + "rewards/rejected": 1.6727440357208252, + "step": 5405 + }, + { + "epoch": 0.88, + "learning_rate": 6.222867177833053e-06, + "logits/chosen": -1.243051290512085, + "logits/rejected": -1.044695496559143, + "logps/chosen": -127.07743835449219, + "logps/rejected": -35.749691009521484, + "loss": 0.0229, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.952561855316162, + "rewards/margins": 3.1045687198638916, + "rewards/rejected": 2.8479931354522705, + "step": 5406 + }, + { + "epoch": 0.88, + "learning_rate": 6.221592794526229e-06, + "logits/chosen": -1.015082836151123, + "logits/rejected": -1.1101382970809937, + "logps/chosen": -42.67503356933594, + "logps/rejected": -82.22004699707031, + "loss": 0.6062, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.39621901512146, + "rewards/margins": -0.8553526401519775, + "rewards/rejected": 3.2515716552734375, + "step": 5407 + }, + { + "epoch": 0.88, + "learning_rate": 6.220318326818902e-06, + "logits/chosen": -1.0294419527053833, + "logits/rejected": -1.050246000289917, + "logps/chosen": -78.43206787109375, + "logps/rejected": -47.86277770996094, + "loss": 0.3046, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.33500599861145, + "rewards/margins": 1.0939826965332031, + "rewards/rejected": 2.241023302078247, + "step": 5408 + }, + { + "epoch": 0.88, + "learning_rate": 6.219043774799126e-06, + "logits/chosen": -1.297564148902893, + "logits/rejected": -1.1548748016357422, + "logps/chosen": -102.1508560180664, + "logps/rejected": -59.20612716674805, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.931638240814209, + "rewards/margins": 6.0686211585998535, + "rewards/rejected": 0.8630169034004211, + "step": 5409 + }, + { + "epoch": 0.88, + "learning_rate": 6.2177691385549595e-06, + "logits/chosen": -0.8520135879516602, + "logits/rejected": -0.8520135879516602, + "logps/chosen": -30.922029495239258, + "logps/rejected": -30.922029495239258, + "loss": 0.3538, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.995398998260498, + "rewards/margins": 0.0, + "rewards/rejected": 2.995398998260498, + "step": 5410 + }, + { + "epoch": 0.88, + "learning_rate": 6.216494418174469e-06, + "logits/chosen": -1.2142763137817383, + "logits/rejected": -1.2527493238449097, + "logps/chosen": -59.73139190673828, + "logps/rejected": -106.37859344482422, + "loss": 0.1785, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8014488220214844, + "rewards/margins": 0.8668594360351562, + "rewards/rejected": 0.9345893859863281, + "step": 5411 + }, + { + "epoch": 0.88, + "learning_rate": 6.2152196137457245e-06, + "logits/chosen": -1.042810320854187, + "logits/rejected": -0.9256577491760254, + "logps/chosen": -80.63088989257812, + "logps/rejected": -33.09800720214844, + "loss": 0.1523, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.350212812423706, + "rewards/margins": 1.7022819519042969, + "rewards/rejected": 0.647930920124054, + "step": 5412 + }, + { + "epoch": 0.88, + "learning_rate": 6.2139447253568044e-06, + "logits/chosen": -1.2567191123962402, + "logits/rejected": -1.1188695430755615, + "logps/chosen": -102.24053192138672, + "logps/rejected": -74.08128356933594, + "loss": 0.3207, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.1532158851623535, + "rewards/margins": 0.15818476676940918, + "rewards/rejected": 3.9950311183929443, + "step": 5413 + }, + { + "epoch": 0.88, + "learning_rate": 6.212669753095788e-06, + "logits/chosen": -1.024420142173767, + "logits/rejected": -0.9972618222236633, + "logps/chosen": -104.82791137695312, + "logps/rejected": -74.9312973022461, + "loss": 0.6786, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3231048583984375, + "rewards/margins": -1.0328469276428223, + "rewards/rejected": 4.35595178604126, + "step": 5414 + }, + { + "epoch": 0.88, + "learning_rate": 6.211394697050767e-06, + "logits/chosen": -1.5398151874542236, + "logits/rejected": -1.2470595836639404, + "logps/chosen": -138.64419555664062, + "logps/rejected": -55.97248077392578, + "loss": 0.3855, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.601443767547607, + "rewards/margins": 1.9080066680908203, + "rewards/rejected": 4.693437099456787, + "step": 5415 + }, + { + "epoch": 0.88, + "learning_rate": 6.210119557309834e-06, + "logits/chosen": -1.3879868984222412, + "logits/rejected": -1.2010443210601807, + "logps/chosen": -104.18983459472656, + "logps/rejected": -125.25901794433594, + "loss": 0.078, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.862726211547852, + "rewards/margins": 2.8497776985168457, + "rewards/rejected": 7.012948513031006, + "step": 5416 + }, + { + "epoch": 0.88, + "learning_rate": 6.2088443339610905e-06, + "logits/chosen": -1.554727554321289, + "logits/rejected": -1.5941659212112427, + "logps/chosen": -244.31005859375, + "logps/rejected": -95.83966064453125, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.9066925048828125, + "rewards/margins": 4.269536018371582, + "rewards/rejected": 2.6371567249298096, + "step": 5417 + }, + { + "epoch": 0.88, + "learning_rate": 6.207569027092642e-06, + "logits/chosen": -1.1443023681640625, + "logits/rejected": -1.1465997695922852, + "logps/chosen": -41.65656661987305, + "logps/rejected": -51.942352294921875, + "loss": 0.5183, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8148491382598877, + "rewards/margins": -0.046950459480285645, + "rewards/rejected": 1.8617995977401733, + "step": 5418 + }, + { + "epoch": 0.88, + "learning_rate": 6.206293636792599e-06, + "logits/chosen": -1.365021824836731, + "logits/rejected": -1.3063066005706787, + "logps/chosen": -97.6902847290039, + "logps/rejected": -185.55413818359375, + "loss": 0.7726, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.651882171630859, + "rewards/margins": -1.0890727043151855, + "rewards/rejected": 5.740954875946045, + "step": 5419 + }, + { + "epoch": 0.88, + "learning_rate": 6.205018163149079e-06, + "logits/chosen": -1.3317761421203613, + "logits/rejected": -1.337469458580017, + "logps/chosen": -223.60000610351562, + "logps/rejected": -111.96868896484375, + "loss": 0.1397, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.819797039031982, + "rewards/margins": 1.2090363502502441, + "rewards/rejected": 5.610760688781738, + "step": 5420 + }, + { + "epoch": 0.88, + "learning_rate": 6.203742606250208e-06, + "logits/chosen": -0.9301565289497375, + "logits/rejected": -0.9210378527641296, + "logps/chosen": -28.56559944152832, + "logps/rejected": -2.471705913543701, + "loss": 0.8656, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.15676192939281464, + "rewards/margins": -0.290951132774353, + "rewards/rejected": 0.44771304726600647, + "step": 5421 + }, + { + "epoch": 0.88, + "learning_rate": 6.202466966184112e-06, + "logits/chosen": -1.2047803401947021, + "logits/rejected": -1.052773356437683, + "logps/chosen": -140.79490661621094, + "logps/rejected": -95.65350341796875, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.288964748382568, + "rewards/margins": 3.9999539852142334, + "rewards/rejected": 3.289010763168335, + "step": 5422 + }, + { + "epoch": 0.88, + "learning_rate": 6.201191243038927e-06, + "logits/chosen": -1.2573456764221191, + "logits/rejected": -0.8746869564056396, + "logps/chosen": -122.32008361816406, + "logps/rejected": -19.47274398803711, + "loss": 0.2071, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.7132415771484375, + "rewards/margins": 3.6993141174316406, + "rewards/rejected": 2.013927459716797, + "step": 5423 + }, + { + "epoch": 0.88, + "learning_rate": 6.199915436902792e-06, + "logits/chosen": -1.0602301359176636, + "logits/rejected": -1.0231196880340576, + "logps/chosen": -98.23683166503906, + "logps/rejected": -69.46833801269531, + "loss": 0.2366, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1094024181365967, + "rewards/margins": 0.5094054937362671, + "rewards/rejected": 1.5999969244003296, + "step": 5424 + }, + { + "epoch": 0.88, + "learning_rate": 6.198639547863854e-06, + "logits/chosen": -1.1925756931304932, + "logits/rejected": -1.1354949474334717, + "logps/chosen": -60.18218231201172, + "logps/rejected": -61.253299713134766, + "loss": 1.4437, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1704232692718506, + "rewards/margins": -2.2278354167938232, + "rewards/rejected": 5.398258686065674, + "step": 5425 + }, + { + "epoch": 0.88, + "learning_rate": 6.1973635760102645e-06, + "logits/chosen": -1.0167232751846313, + "logits/rejected": -0.9765446782112122, + "logps/chosen": -90.52445983886719, + "logps/rejected": -40.414161682128906, + "loss": 0.3253, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1575286388397217, + "rewards/margins": 0.4846717119216919, + "rewards/rejected": 1.6728569269180298, + "step": 5426 + }, + { + "epoch": 0.88, + "learning_rate": 6.196087521430181e-06, + "logits/chosen": -0.9789246320724487, + "logits/rejected": -0.9647080302238464, + "logps/chosen": -37.325252532958984, + "logps/rejected": -34.67747116088867, + "loss": 1.7721, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0829529762268066, + "rewards/margins": -0.8720324039459229, + "rewards/rejected": 2.9549853801727295, + "step": 5427 + }, + { + "epoch": 0.88, + "learning_rate": 6.194811384211769e-06, + "logits/chosen": -1.139971375465393, + "logits/rejected": -1.0523241758346558, + "logps/chosen": -55.33545684814453, + "logps/rejected": -52.462371826171875, + "loss": 0.2896, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.243428945541382, + "rewards/margins": 0.6938904523849487, + "rewards/rejected": 1.549538493156433, + "step": 5428 + }, + { + "epoch": 0.88, + "learning_rate": 6.193535164443194e-06, + "logits/chosen": -0.9367344379425049, + "logits/rejected": -0.9197930693626404, + "logps/chosen": -17.048826217651367, + "logps/rejected": -3.272529363632202, + "loss": 0.2575, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9224552512168884, + "rewards/margins": 0.5040367841720581, + "rewards/rejected": 0.4184184968471527, + "step": 5429 + }, + { + "epoch": 0.88, + "learning_rate": 6.192258862212634e-06, + "logits/chosen": -1.2516207695007324, + "logits/rejected": -1.1728285551071167, + "logps/chosen": -35.48350143432617, + "logps/rejected": -22.289241790771484, + "loss": 0.0489, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.088813543319702, + "rewards/margins": 2.3790791034698486, + "rewards/rejected": 0.7097343802452087, + "step": 5430 + }, + { + "epoch": 0.88, + "learning_rate": 6.1909824776082674e-06, + "logits/chosen": -1.073433756828308, + "logits/rejected": -0.9115867614746094, + "logps/chosen": -104.50669860839844, + "logps/rejected": -84.01203918457031, + "loss": 0.0213, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.9568376541137695, + "rewards/margins": 3.3049814701080322, + "rewards/rejected": 2.6518561840057373, + "step": 5431 + }, + { + "epoch": 0.88, + "learning_rate": 6.18970601071828e-06, + "logits/chosen": -1.0678398609161377, + "logits/rejected": -0.8085926175117493, + "logps/chosen": -84.78260040283203, + "logps/rejected": -47.627716064453125, + "loss": 1.6504, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3732284307479858, + "rewards/margins": -0.8560088872909546, + "rewards/rejected": 2.2292373180389404, + "step": 5432 + }, + { + "epoch": 0.88, + "learning_rate": 6.188429461630866e-06, + "logits/chosen": -1.259064793586731, + "logits/rejected": -1.3256573677062988, + "logps/chosen": -52.784950256347656, + "logps/rejected": -45.80089569091797, + "loss": 1.2235, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0431435108184814, + "rewards/margins": -2.0649635791778564, + "rewards/rejected": 4.108107089996338, + "step": 5433 + }, + { + "epoch": 0.88, + "learning_rate": 6.18715283043422e-06, + "logits/chosen": -1.2163392305374146, + "logits/rejected": -1.2339112758636475, + "logps/chosen": -60.061527252197266, + "logps/rejected": -61.17647933959961, + "loss": 1.6126, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5898258090019226, + "rewards/margins": -2.7873802185058594, + "rewards/rejected": 3.3772060871124268, + "step": 5434 + }, + { + "epoch": 0.88, + "learning_rate": 6.185876117216547e-06, + "logits/chosen": -1.375098705291748, + "logits/rejected": -1.3824244737625122, + "logps/chosen": -48.427093505859375, + "logps/rejected": -58.976173400878906, + "loss": 0.5716, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.264373779296875, + "rewards/margins": 0.4032813310623169, + "rewards/rejected": 1.861092448234558, + "step": 5435 + }, + { + "epoch": 0.88, + "learning_rate": 6.184599322066055e-06, + "logits/chosen": -0.9065272212028503, + "logits/rejected": -0.6279851198196411, + "logps/chosen": -122.00291442871094, + "logps/rejected": -26.924360275268555, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.818925380706787, + "rewards/margins": 6.567042350769043, + "rewards/rejected": 0.2518831193447113, + "step": 5436 + }, + { + "epoch": 0.88, + "learning_rate": 6.183322445070958e-06, + "logits/chosen": -1.328962802886963, + "logits/rejected": -0.8735992312431335, + "logps/chosen": -103.77200317382812, + "logps/rejected": -98.26274108886719, + "loss": 0.2119, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.475619792938232, + "rewards/margins": 0.6395096778869629, + "rewards/rejected": 4.8361101150512695, + "step": 5437 + }, + { + "epoch": 0.88, + "learning_rate": 6.182045486319478e-06, + "logits/chosen": -0.8016273975372314, + "logits/rejected": -0.8016273975372314, + "logps/chosen": -37.33206558227539, + "logps/rejected": -37.33206558227539, + "loss": 0.4398, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.15843887627124786, + "rewards/margins": 0.0, + "rewards/rejected": 0.15843887627124786, + "step": 5438 + }, + { + "epoch": 0.88, + "learning_rate": 6.180768445899839e-06, + "logits/chosen": -1.3583078384399414, + "logits/rejected": -1.2513365745544434, + "logps/chosen": -71.79381561279297, + "logps/rejected": -50.940826416015625, + "loss": 0.2296, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.264787197113037, + "rewards/margins": 0.640805721282959, + "rewards/rejected": 3.623981475830078, + "step": 5439 + }, + { + "epoch": 0.88, + "learning_rate": 6.179491323900273e-06, + "logits/chosen": -1.191244125366211, + "logits/rejected": -1.2904481887817383, + "logps/chosen": -88.38078308105469, + "logps/rejected": -104.93696594238281, + "loss": 1.4745, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1525352001190186, + "rewards/margins": -2.8791301250457764, + "rewards/rejected": 6.031665325164795, + "step": 5440 + }, + { + "epoch": 0.88, + "learning_rate": 6.178214120409016e-06, + "logits/chosen": -0.7893907427787781, + "logits/rejected": -0.7923054099082947, + "logps/chosen": -17.44768524169922, + "logps/rejected": -16.968786239624023, + "loss": 2.1182, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45054513216018677, + "rewards/margins": 0.0015741288661956787, + "rewards/rejected": 0.4489710032939911, + "step": 5441 + }, + { + "epoch": 0.88, + "learning_rate": 6.1769368355143125e-06, + "logits/chosen": -1.0242620706558228, + "logits/rejected": -1.120165228843689, + "logps/chosen": -33.872440338134766, + "logps/rejected": -69.4892578125, + "loss": 3.6173, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0932323932647705, + "rewards/margins": -3.3900744915008545, + "rewards/rejected": 6.483306884765625, + "step": 5442 + }, + { + "epoch": 0.88, + "learning_rate": 6.17565946930441e-06, + "logits/chosen": -0.8746353387832642, + "logits/rejected": -0.8374010920524597, + "logps/chosen": -127.91877746582031, + "logps/rejected": -55.471168518066406, + "loss": 0.1488, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7004318237304688, + "rewards/margins": 1.1574256420135498, + "rewards/rejected": 1.543006181716919, + "step": 5443 + }, + { + "epoch": 0.88, + "learning_rate": 6.174382021867562e-06, + "logits/chosen": -1.4451994895935059, + "logits/rejected": -1.3740190267562866, + "logps/chosen": -72.97095489501953, + "logps/rejected": -28.23168182373047, + "loss": 0.3127, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.250119924545288, + "rewards/margins": 2.2259788513183594, + "rewards/rejected": 1.0241409540176392, + "step": 5444 + }, + { + "epoch": 0.88, + "learning_rate": 6.1731044932920285e-06, + "logits/chosen": -1.1041620969772339, + "logits/rejected": -1.10662043094635, + "logps/chosen": -29.039674758911133, + "logps/rejected": -29.769712448120117, + "loss": 0.8386, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0030683279037476, + "rewards/margins": -0.6501209735870361, + "rewards/rejected": 1.6531893014907837, + "step": 5445 + }, + { + "epoch": 0.88, + "learning_rate": 6.171826883666075e-06, + "logits/chosen": -1.4638220071792603, + "logits/rejected": -1.3064308166503906, + "logps/chosen": -133.63070678710938, + "logps/rejected": -60.63581848144531, + "loss": 0.0913, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.415902614593506, + "rewards/margins": 1.617832899093628, + "rewards/rejected": 2.798069715499878, + "step": 5446 + }, + { + "epoch": 0.88, + "learning_rate": 6.170549193077972e-06, + "logits/chosen": -0.73690265417099, + "logits/rejected": -0.7371451258659363, + "logps/chosen": -1.3947254419326782, + "logps/rejected": -2.948516607284546, + "loss": 2.8891, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.47392430901527405, + "rewards/margins": -0.1361415684223175, + "rewards/rejected": 0.6100658774375916, + "step": 5447 + }, + { + "epoch": 0.88, + "learning_rate": 6.1692714216159945e-06, + "logits/chosen": -1.0803114175796509, + "logits/rejected": -1.0803114175796509, + "logps/chosen": -38.98401641845703, + "logps/rejected": -38.98401641845703, + "loss": 0.5535, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9158226251602173, + "rewards/margins": 0.0, + "rewards/rejected": 1.9158226251602173, + "step": 5448 + }, + { + "epoch": 0.88, + "learning_rate": 6.167993569368425e-06, + "logits/chosen": -1.259048342704773, + "logits/rejected": -1.0495421886444092, + "logps/chosen": -129.8162841796875, + "logps/rejected": -65.62187194824219, + "loss": 0.0744, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.105021953582764, + "rewards/margins": 1.8414263725280762, + "rewards/rejected": 3.2635955810546875, + "step": 5449 + }, + { + "epoch": 0.88, + "learning_rate": 6.166715636423552e-06, + "logits/chosen": -1.1785537004470825, + "logits/rejected": -1.0725904703140259, + "logps/chosen": -53.5234375, + "logps/rejected": -16.18480682373047, + "loss": 0.3016, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.333054304122925, + "rewards/margins": 3.0820324420928955, + "rewards/rejected": 0.25102177262306213, + "step": 5450 + }, + { + "epoch": 0.88, + "learning_rate": 6.16543762286967e-06, + "logits/chosen": -1.3933074474334717, + "logits/rejected": -0.9759024977684021, + "logps/chosen": -156.71807861328125, + "logps/rejected": -28.53140640258789, + "loss": 0.1144, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.201499938964844, + "rewards/margins": 7.068637847900391, + "rewards/rejected": 0.1328618973493576, + "step": 5451 + }, + { + "epoch": 0.88, + "learning_rate": 6.164159528795074e-06, + "logits/chosen": -0.9899671673774719, + "logits/rejected": -0.9899671673774719, + "logps/chosen": -0.9993072748184204, + "logps/rejected": -0.9993072748184204, + "loss": 0.5167, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.23029637336730957, + "rewards/margins": 0.0, + "rewards/rejected": 0.23029637336730957, + "step": 5452 + }, + { + "epoch": 0.89, + "learning_rate": 6.162881354288071e-06, + "logits/chosen": -0.8283044099807739, + "logits/rejected": -0.8283044099807739, + "logps/chosen": -28.355640411376953, + "logps/rejected": -28.355640411376953, + "loss": 1.0484, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9137150049209595, + "rewards/margins": 0.0, + "rewards/rejected": 0.9137150049209595, + "step": 5453 + }, + { + "epoch": 0.89, + "learning_rate": 6.161603099436968e-06, + "logits/chosen": -0.8610748648643494, + "logits/rejected": -0.6070848107337952, + "logps/chosen": -75.3868408203125, + "logps/rejected": -21.917654037475586, + "loss": 0.1738, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8775978088378906, + "rewards/margins": 1.1494773626327515, + "rewards/rejected": 0.7281204462051392, + "step": 5454 + }, + { + "epoch": 0.89, + "learning_rate": 6.160324764330083e-06, + "logits/chosen": -1.3451249599456787, + "logits/rejected": -1.3684343099594116, + "logps/chosen": -121.29912567138672, + "logps/rejected": -134.47802734375, + "loss": 1.8233, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.156606197357178, + "rewards/margins": -3.617630958557129, + "rewards/rejected": 7.774237155914307, + "step": 5455 + }, + { + "epoch": 0.89, + "learning_rate": 6.159046349055737e-06, + "logits/chosen": -1.1211942434310913, + "logits/rejected": -1.1164751052856445, + "logps/chosen": -27.202510833740234, + "logps/rejected": -29.384265899658203, + "loss": 2.2821, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1387348175048828, + "rewards/margins": 0.42520368099212646, + "rewards/rejected": 0.7135311365127563, + "step": 5456 + }, + { + "epoch": 0.89, + "learning_rate": 6.157767853702254e-06, + "logits/chosen": -0.7101092338562012, + "logits/rejected": -0.7101092338562012, + "logps/chosen": -95.6392822265625, + "logps/rejected": -95.6392822265625, + "loss": 0.4133, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0140702724456787, + "rewards/margins": 0.0, + "rewards/rejected": 2.0140702724456787, + "step": 5457 + }, + { + "epoch": 0.89, + "learning_rate": 6.156489278357967e-06, + "logits/chosen": -0.986821711063385, + "logits/rejected": -0.9342675805091858, + "logps/chosen": -31.988548278808594, + "logps/rejected": -29.574676513671875, + "loss": 1.2363, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8503471612930298, + "rewards/margins": 0.8016707897186279, + "rewards/rejected": 1.0486763715744019, + "step": 5458 + }, + { + "epoch": 0.89, + "learning_rate": 6.155210623111213e-06, + "logits/chosen": -1.4284836053848267, + "logits/rejected": -1.1487021446228027, + "logps/chosen": -165.1259002685547, + "logps/rejected": -55.07334518432617, + "loss": 0.0562, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.361763000488281, + "rewards/margins": 2.305586576461792, + "rewards/rejected": 3.0561764240264893, + "step": 5459 + }, + { + "epoch": 0.89, + "learning_rate": 6.153931888050338e-06, + "logits/chosen": -1.068846344947815, + "logits/rejected": -0.7862977385520935, + "logps/chosen": -105.1180419921875, + "logps/rejected": -28.291397094726562, + "loss": 0.2386, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.659341335296631, + "rewards/margins": 7.1156182289123535, + "rewards/rejected": 0.5437232851982117, + "step": 5460 + }, + { + "epoch": 0.89, + "learning_rate": 6.1526530732636855e-06, + "logits/chosen": -0.6681180000305176, + "logits/rejected": -0.6681180000305176, + "logps/chosen": -43.171669006347656, + "logps/rejected": -43.171669006347656, + "loss": 0.7416, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.155585527420044, + "rewards/margins": 0.0, + "rewards/rejected": 2.155585527420044, + "step": 5461 + }, + { + "epoch": 0.89, + "learning_rate": 6.151374178839613e-06, + "logits/chosen": -1.1046255826950073, + "logits/rejected": -1.1372286081314087, + "logps/chosen": -51.30487823486328, + "logps/rejected": -83.26033782958984, + "loss": 1.7136, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8867263793945312, + "rewards/margins": -2.9356651306152344, + "rewards/rejected": 5.822391510009766, + "step": 5462 + }, + { + "epoch": 0.89, + "learning_rate": 6.15009520486648e-06, + "logits/chosen": -1.159295916557312, + "logits/rejected": -1.1382455825805664, + "logps/chosen": -43.720191955566406, + "logps/rejected": -59.37806701660156, + "loss": 1.6694, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7803046703338623, + "rewards/margins": 0.029012203216552734, + "rewards/rejected": 2.7512924671173096, + "step": 5463 + }, + { + "epoch": 0.89, + "learning_rate": 6.148816151432649e-06, + "logits/chosen": -1.660823106765747, + "logits/rejected": -1.6343419551849365, + "logps/chosen": -235.28445434570312, + "logps/rejected": -28.72018814086914, + "loss": 0.0426, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4552643299102783, + "rewards/margins": 2.437528610229492, + "rewards/rejected": 1.0177357196807861, + "step": 5464 + }, + { + "epoch": 0.89, + "learning_rate": 6.147537018626494e-06, + "logits/chosen": -0.9464170336723328, + "logits/rejected": -0.9987388253211975, + "logps/chosen": -44.33159637451172, + "logps/rejected": -94.51799774169922, + "loss": 0.6483, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.385923147201538, + "rewards/margins": 0.2072618007659912, + "rewards/rejected": 2.178661346435547, + "step": 5465 + }, + { + "epoch": 0.89, + "learning_rate": 6.14625780653639e-06, + "logits/chosen": -1.211423397064209, + "logits/rejected": -1.226922631263733, + "logps/chosen": -89.14532470703125, + "logps/rejected": -49.60908126831055, + "loss": 0.6364, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7627899646759033, + "rewards/margins": -0.555556058883667, + "rewards/rejected": 3.3183460235595703, + "step": 5466 + }, + { + "epoch": 0.89, + "learning_rate": 6.1449785152507155e-06, + "logits/chosen": -0.7453703880310059, + "logits/rejected": -0.7465543150901794, + "logps/chosen": -2.3067262172698975, + "logps/rejected": -11.189603805541992, + "loss": 0.4901, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2847120463848114, + "rewards/margins": -0.3450506031513214, + "rewards/rejected": 0.6297626495361328, + "step": 5467 + }, + { + "epoch": 0.89, + "learning_rate": 6.14369914485786e-06, + "logits/chosen": -0.8365653157234192, + "logits/rejected": -0.8365653157234192, + "logps/chosen": -21.907155990600586, + "logps/rejected": -21.907155990600586, + "loss": 0.7192, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4501371383666992, + "rewards/margins": 0.0, + "rewards/rejected": 1.4501371383666992, + "step": 5468 + }, + { + "epoch": 0.89, + "learning_rate": 6.142419695446216e-06, + "logits/chosen": -1.467078685760498, + "logits/rejected": -1.4361214637756348, + "logps/chosen": -33.011104583740234, + "logps/rejected": -76.67560577392578, + "loss": 1.3516, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9046878814697266, + "rewards/margins": 0.030712485313415527, + "rewards/rejected": 0.873975396156311, + "step": 5469 + }, + { + "epoch": 0.89, + "learning_rate": 6.141140167104179e-06, + "logits/chosen": -1.2292380332946777, + "logits/rejected": -1.4805134534835815, + "logps/chosen": -52.50147247314453, + "logps/rejected": -49.925838470458984, + "loss": 0.2931, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1533119678497314, + "rewards/margins": 0.31182175874710083, + "rewards/rejected": 0.8414902091026306, + "step": 5470 + }, + { + "epoch": 0.89, + "learning_rate": 6.139860559920156e-06, + "logits/chosen": -1.3320430517196655, + "logits/rejected": -1.2937730550765991, + "logps/chosen": -182.3643341064453, + "logps/rejected": -87.29544830322266, + "loss": 0.0207, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.196235656738281, + "rewards/margins": 3.2460379600524902, + "rewards/rejected": 2.950197696685791, + "step": 5471 + }, + { + "epoch": 0.89, + "learning_rate": 6.138580873982553e-06, + "logits/chosen": -1.1242481470108032, + "logits/rejected": -1.0772514343261719, + "logps/chosen": -62.74585723876953, + "logps/rejected": -15.250136375427246, + "loss": 0.2508, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.884192705154419, + "rewards/margins": 1.4900939464569092, + "rewards/rejected": 0.39409875869750977, + "step": 5472 + }, + { + "epoch": 0.89, + "learning_rate": 6.137301109379784e-06, + "logits/chosen": -1.1834152936935425, + "logits/rejected": -1.1005786657333374, + "logps/chosen": -43.984100341796875, + "logps/rejected": -66.95863342285156, + "loss": 0.7077, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8742949962615967, + "rewards/margins": -0.05989241600036621, + "rewards/rejected": 2.934187412261963, + "step": 5473 + }, + { + "epoch": 0.89, + "learning_rate": 6.136021266200271e-06, + "logits/chosen": -1.2390153408050537, + "logits/rejected": -1.146025538444519, + "logps/chosen": -66.16194915771484, + "logps/rejected": -31.634552001953125, + "loss": 1.2182, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.743006944656372, + "rewards/margins": -1.4755923748016357, + "rewards/rejected": 3.218599319458008, + "step": 5474 + }, + { + "epoch": 0.89, + "learning_rate": 6.134741344532436e-06, + "logits/chosen": -1.0945063829421997, + "logits/rejected": -1.0754438638687134, + "logps/chosen": -50.23149108886719, + "logps/rejected": -3.190176010131836, + "loss": 0.5304, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6848034262657166, + "rewards/margins": 0.24038469791412354, + "rewards/rejected": 0.444418728351593, + "step": 5475 + }, + { + "epoch": 0.89, + "learning_rate": 6.133461344464713e-06, + "logits/chosen": -1.2840020656585693, + "logits/rejected": -1.2757434844970703, + "logps/chosen": -75.43875122070312, + "logps/rejected": -102.29326629638672, + "loss": 0.646, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2077964544296265, + "rewards/margins": 0.17748868465423584, + "rewards/rejected": 1.0303077697753906, + "step": 5476 + }, + { + "epoch": 0.89, + "learning_rate": 6.132181266085535e-06, + "logits/chosen": -1.0081539154052734, + "logits/rejected": -1.0326374769210815, + "logps/chosen": -65.4483413696289, + "logps/rejected": -66.10831451416016, + "loss": 1.4426, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4835288524627686, + "rewards/margins": -2.759819269180298, + "rewards/rejected": 5.243348121643066, + "step": 5477 + }, + { + "epoch": 0.89, + "learning_rate": 6.130901109483345e-06, + "logits/chosen": -1.0626170635223389, + "logits/rejected": -1.0379105806350708, + "logps/chosen": -13.111127853393555, + "logps/rejected": -4.85158634185791, + "loss": 0.5843, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3879883289337158, + "rewards/margins": 0.49649643898010254, + "rewards/rejected": 0.8914918899536133, + "step": 5478 + }, + { + "epoch": 0.89, + "learning_rate": 6.129620874746588e-06, + "logits/chosen": -0.5363231301307678, + "logits/rejected": -0.5363231301307678, + "logps/chosen": -14.698448181152344, + "logps/rejected": -14.698448181152344, + "loss": 0.363, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6099309921264648, + "rewards/margins": 0.0, + "rewards/rejected": 0.6099309921264648, + "step": 5479 + }, + { + "epoch": 0.89, + "learning_rate": 6.128340561963718e-06, + "logits/chosen": -1.149667739868164, + "logits/rejected": -1.1738895177841187, + "logps/chosen": -147.7762908935547, + "logps/rejected": -99.8486557006836, + "loss": 1.3164, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4081084728240967, + "rewards/margins": 0.007818460464477539, + "rewards/rejected": 2.400290012359619, + "step": 5480 + }, + { + "epoch": 0.89, + "learning_rate": 6.127060171223191e-06, + "logits/chosen": -1.4207817316055298, + "logits/rejected": -1.0837066173553467, + "logps/chosen": -99.02462768554688, + "logps/rejected": -59.53202819824219, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.1330413818359375, + "rewards/margins": 4.390650749206543, + "rewards/rejected": 2.7423903942108154, + "step": 5481 + }, + { + "epoch": 0.89, + "learning_rate": 6.125779702613471e-06, + "logits/chosen": -1.0156865119934082, + "logits/rejected": -1.0027743577957153, + "logps/chosen": -82.77734375, + "logps/rejected": -68.56245422363281, + "loss": 0.1103, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2652740478515625, + "rewards/margins": 1.5247032642364502, + "rewards/rejected": 0.7405708432197571, + "step": 5482 + }, + { + "epoch": 0.89, + "learning_rate": 6.124499156223026e-06, + "logits/chosen": -1.18996000289917, + "logits/rejected": -1.1499894857406616, + "logps/chosen": -49.07299041748047, + "logps/rejected": -49.78208541870117, + "loss": 0.8708, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.269150495529175, + "rewards/margins": -1.2779362201690674, + "rewards/rejected": 3.547086715698242, + "step": 5483 + }, + { + "epoch": 0.89, + "learning_rate": 6.12321853214033e-06, + "logits/chosen": -1.0049736499786377, + "logits/rejected": -0.940227210521698, + "logps/chosen": -49.3895263671875, + "logps/rejected": -34.83528518676758, + "loss": 0.2834, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.110830783843994, + "rewards/margins": 0.6883941888809204, + "rewards/rejected": 1.4224365949630737, + "step": 5484 + }, + { + "epoch": 0.89, + "learning_rate": 6.121937830453859e-06, + "logits/chosen": -1.0209993124008179, + "logits/rejected": -0.8482760787010193, + "logps/chosen": -51.671142578125, + "logps/rejected": -7.759003162384033, + "loss": 0.7027, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5669784545898438, + "rewards/margins": 1.6567147970199585, + "rewards/rejected": 0.9102636575698853, + "step": 5485 + }, + { + "epoch": 0.89, + "learning_rate": 6.120657051252101e-06, + "logits/chosen": -1.2446379661560059, + "logits/rejected": -1.2446379661560059, + "logps/chosen": -74.37684631347656, + "logps/rejected": -74.37684631347656, + "loss": 1.2571, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.449119567871094, + "rewards/margins": 0.0, + "rewards/rejected": 5.449119567871094, + "step": 5486 + }, + { + "epoch": 0.89, + "learning_rate": 6.119376194623545e-06, + "logits/chosen": -1.1536221504211426, + "logits/rejected": -1.2413908243179321, + "logps/chosen": -84.22980499267578, + "logps/rejected": -94.01519012451172, + "loss": 2.224, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4790916442871094, + "rewards/margins": -2.870497226715088, + "rewards/rejected": 4.349588871002197, + "step": 5487 + }, + { + "epoch": 0.89, + "learning_rate": 6.118095260656686e-06, + "logits/chosen": -1.3999900817871094, + "logits/rejected": -1.3177244663238525, + "logps/chosen": -158.08218383789062, + "logps/rejected": -122.39691162109375, + "loss": 0.6657, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.254800319671631, + "rewards/margins": -0.8206849098205566, + "rewards/rejected": 7.0754852294921875, + "step": 5488 + }, + { + "epoch": 0.89, + "learning_rate": 6.1168142494400226e-06, + "logits/chosen": -1.3393510580062866, + "logits/rejected": -1.3768929243087769, + "logps/chosen": -192.1200408935547, + "logps/rejected": -146.53309631347656, + "loss": 0.2195, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.931193351745605, + "rewards/margins": 1.1888790130615234, + "rewards/rejected": 7.742314338684082, + "step": 5489 + }, + { + "epoch": 0.89, + "learning_rate": 6.115533161062062e-06, + "logits/chosen": -0.6492949724197388, + "logits/rejected": -0.6968647837638855, + "logps/chosen": -4.187460422515869, + "logps/rejected": -48.42990493774414, + "loss": 0.4434, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.564697802066803, + "rewards/margins": 0.013890624046325684, + "rewards/rejected": 0.5508071780204773, + "step": 5490 + }, + { + "epoch": 0.89, + "learning_rate": 6.114251995611315e-06, + "logits/chosen": -0.8554990887641907, + "logits/rejected": -0.8554990887641907, + "logps/chosen": -17.93393325805664, + "logps/rejected": -17.93393325805664, + "loss": 0.9155, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9571030139923096, + "rewards/margins": 0.0, + "rewards/rejected": 2.9571030139923096, + "step": 5491 + }, + { + "epoch": 0.89, + "learning_rate": 6.1129707531763e-06, + "logits/chosen": -1.3841995000839233, + "logits/rejected": -1.4114278554916382, + "logps/chosen": -69.5223617553711, + "logps/rejected": -152.06317138671875, + "loss": 1.2581, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.381742238998413, + "rewards/margins": 0.43051087856292725, + "rewards/rejected": 1.9512313604354858, + "step": 5492 + }, + { + "epoch": 0.89, + "learning_rate": 6.111689433845536e-06, + "logits/chosen": -1.2771203517913818, + "logits/rejected": -1.2134144306182861, + "logps/chosen": -49.742713928222656, + "logps/rejected": -55.573486328125, + "loss": 0.4127, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1903328895568848, + "rewards/margins": -0.19479966163635254, + "rewards/rejected": 2.3851325511932373, + "step": 5493 + }, + { + "epoch": 0.89, + "learning_rate": 6.110408037707551e-06, + "logits/chosen": -1.216389536857605, + "logits/rejected": -1.192455768585205, + "logps/chosen": -80.25233459472656, + "logps/rejected": -55.149314880371094, + "loss": 1.5118, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5931382179260254, + "rewards/margins": 0.26822590827941895, + "rewards/rejected": 2.3249123096466064, + "step": 5494 + }, + { + "epoch": 0.89, + "learning_rate": 6.109126564850878e-06, + "logits/chosen": -1.0412607192993164, + "logits/rejected": -0.9411522150039673, + "logps/chosen": -58.47907257080078, + "logps/rejected": -50.180999755859375, + "loss": 0.2078, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0472984313964844, + "rewards/margins": 0.7063696384429932, + "rewards/rejected": 2.340928792953491, + "step": 5495 + }, + { + "epoch": 0.89, + "learning_rate": 6.107845015364054e-06, + "logits/chosen": -1.0358673334121704, + "logits/rejected": -1.025670051574707, + "logps/chosen": -20.532787322998047, + "logps/rejected": -4.7921552658081055, + "loss": 0.3616, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4447288513183594, + "rewards/margins": 0.007848531007766724, + "rewards/rejected": 0.43688032031059265, + "step": 5496 + }, + { + "epoch": 0.89, + "learning_rate": 6.106563389335621e-06, + "logits/chosen": -1.08113431930542, + "logits/rejected": -0.8303214907646179, + "logps/chosen": -139.6324462890625, + "logps/rejected": -31.283308029174805, + "loss": 0.3453, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.18025541305542, + "rewards/margins": 3.847172975540161, + "rewards/rejected": 0.3330824077129364, + "step": 5497 + }, + { + "epoch": 0.89, + "learning_rate": 6.105281686854129e-06, + "logits/chosen": -0.8103343844413757, + "logits/rejected": -0.8723244071006775, + "logps/chosen": -97.70043182373047, + "logps/rejected": -54.61993408203125, + "loss": 0.8276, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.379651665687561, + "rewards/margins": -1.3810569047927856, + "rewards/rejected": 2.7607085704803467, + "step": 5498 + }, + { + "epoch": 0.89, + "learning_rate": 6.10399990800813e-06, + "logits/chosen": -1.0785810947418213, + "logits/rejected": -0.982305645942688, + "logps/chosen": -122.23646545410156, + "logps/rejected": -73.09170532226562, + "loss": 0.8153, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7099655866622925, + "rewards/margins": -0.958297610282898, + "rewards/rejected": 2.6682631969451904, + "step": 5499 + }, + { + "epoch": 0.89, + "learning_rate": 6.102718052886184e-06, + "logits/chosen": -1.1979790925979614, + "logits/rejected": -1.129160761833191, + "logps/chosen": -77.23546600341797, + "logps/rejected": -77.04827880859375, + "loss": 0.8282, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5846314430236816, + "rewards/margins": 0.5526597499847412, + "rewards/rejected": 2.0319716930389404, + "step": 5500 + }, + { + "epoch": 0.89, + "learning_rate": 6.101436121576855e-06, + "logits/chosen": -1.2704675197601318, + "logits/rejected": -1.2467358112335205, + "logps/chosen": -207.5492706298828, + "logps/rejected": -70.26663208007812, + "loss": 0.311, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.718550205230713, + "rewards/margins": 0.5647292137145996, + "rewards/rejected": 4.153820991516113, + "step": 5501 + }, + { + "epoch": 0.89, + "learning_rate": 6.10015411416871e-06, + "logits/chosen": -1.1175354719161987, + "logits/rejected": -1.1041256189346313, + "logps/chosen": -36.98027801513672, + "logps/rejected": -56.179901123046875, + "loss": 0.7738, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.344545841217041, + "rewards/margins": -0.18359756469726562, + "rewards/rejected": 2.5281434059143066, + "step": 5502 + }, + { + "epoch": 0.89, + "learning_rate": 6.098872030750328e-06, + "logits/chosen": -1.135183334350586, + "logits/rejected": -1.1915791034698486, + "logps/chosen": -66.89196014404297, + "logps/rejected": -63.11315155029297, + "loss": 1.2748, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.304253339767456, + "rewards/margins": -0.5391616821289062, + "rewards/rejected": 2.8434150218963623, + "step": 5503 + }, + { + "epoch": 0.89, + "learning_rate": 6.097589871410286e-06, + "logits/chosen": -1.0553451776504517, + "logits/rejected": -0.9914983510971069, + "logps/chosen": -65.3626937866211, + "logps/rejected": -74.02731323242188, + "loss": 2.0528, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8536529541015625, + "rewards/margins": 0.6712051630020142, + "rewards/rejected": 1.1824477910995483, + "step": 5504 + }, + { + "epoch": 0.89, + "learning_rate": 6.096307636237167e-06, + "logits/chosen": -0.6945281624794006, + "logits/rejected": -0.6894075870513916, + "logps/chosen": -51.78810119628906, + "logps/rejected": -52.89158630371094, + "loss": 0.409, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6305702924728394, + "rewards/margins": 0.47174835205078125, + "rewards/rejected": 1.158821940422058, + "step": 5505 + }, + { + "epoch": 0.89, + "learning_rate": 6.0950253253195656e-06, + "logits/chosen": -1.1629793643951416, + "logits/rejected": -1.1819851398468018, + "logps/chosen": -63.73644256591797, + "logps/rejected": -94.10701751708984, + "loss": 0.6293, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4960838556289673, + "rewards/margins": -0.061983466148376465, + "rewards/rejected": 1.5580673217773438, + "step": 5506 + }, + { + "epoch": 0.89, + "learning_rate": 6.093742938746075e-06, + "logits/chosen": -1.547136902809143, + "logits/rejected": -1.4611146450042725, + "logps/chosen": -119.14762115478516, + "logps/rejected": -66.4024887084961, + "loss": 0.3571, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.868436336517334, + "rewards/margins": 3.8463666439056396, + "rewards/rejected": 2.0220696926116943, + "step": 5507 + }, + { + "epoch": 0.89, + "learning_rate": 6.092460476605296e-06, + "logits/chosen": -1.2060869932174683, + "logits/rejected": -1.241348385810852, + "logps/chosen": -119.15777587890625, + "logps/rejected": -166.46054077148438, + "loss": 0.9177, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.0049285888671875, + "rewards/margins": -1.6511812210083008, + "rewards/rejected": 6.656109809875488, + "step": 5508 + }, + { + "epoch": 0.89, + "learning_rate": 6.091177938985836e-06, + "logits/chosen": -0.8783603310585022, + "logits/rejected": -1.0437077283859253, + "logps/chosen": -72.1252212524414, + "logps/rejected": -117.20294189453125, + "loss": 2.9555, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.449018120765686, + "rewards/margins": -3.6625819206237793, + "rewards/rejected": 5.111599922180176, + "step": 5509 + }, + { + "epoch": 0.89, + "learning_rate": 6.089895325976305e-06, + "logits/chosen": -1.6384004354476929, + "logits/rejected": -1.6282413005828857, + "logps/chosen": -171.002197265625, + "logps/rejected": -54.554771423339844, + "loss": 0.0316, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.362301826477051, + "rewards/margins": 3.232175350189209, + "rewards/rejected": 4.130126476287842, + "step": 5510 + }, + { + "epoch": 0.89, + "learning_rate": 6.08861263766532e-06, + "logits/chosen": -1.4600645303726196, + "logits/rejected": -1.364884614944458, + "logps/chosen": -126.76826477050781, + "logps/rejected": -93.98123168945312, + "loss": 0.1392, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.4309587478637695, + "rewards/margins": 1.1568942070007324, + "rewards/rejected": 5.274064540863037, + "step": 5511 + }, + { + "epoch": 0.89, + "learning_rate": 6.087329874141501e-06, + "logits/chosen": -0.964301347732544, + "logits/rejected": -0.9623322486877441, + "logps/chosen": -69.79470825195312, + "logps/rejected": -63.10498046875, + "loss": 0.9365, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0926055908203125, + "rewards/margins": -0.12983548641204834, + "rewards/rejected": 1.2224410772323608, + "step": 5512 + }, + { + "epoch": 0.89, + "learning_rate": 6.086047035493477e-06, + "logits/chosen": -1.3440232276916504, + "logits/rejected": -1.2939538955688477, + "logps/chosen": -74.38839721679688, + "logps/rejected": -71.60365295410156, + "loss": 0.0738, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.52386474609375, + "rewards/margins": 1.9869766235351562, + "rewards/rejected": 0.5368881225585938, + "step": 5513 + }, + { + "epoch": 0.89, + "learning_rate": 6.084764121809878e-06, + "logits/chosen": -1.0439614057540894, + "logits/rejected": -1.0013302564620972, + "logps/chosen": -82.65226745605469, + "logps/rejected": -40.164573669433594, + "loss": 0.654, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.930041551589966, + "rewards/margins": 1.1173092126846313, + "rewards/rejected": 1.8127323389053345, + "step": 5514 + }, + { + "epoch": 0.9, + "learning_rate": 6.083481133179344e-06, + "logits/chosen": -1.3822250366210938, + "logits/rejected": -1.1474195718765259, + "logps/chosen": -120.02607727050781, + "logps/rejected": -74.86444091796875, + "loss": 0.1654, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.058993816375732, + "rewards/margins": 1.794386625289917, + "rewards/rejected": 3.2646071910858154, + "step": 5515 + }, + { + "epoch": 0.9, + "learning_rate": 6.0821980696905145e-06, + "logits/chosen": -1.509725570678711, + "logits/rejected": -1.5019054412841797, + "logps/chosen": -111.6602783203125, + "logps/rejected": -108.0430679321289, + "loss": 1.732, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.183706760406494, + "rewards/margins": -1.2305536270141602, + "rewards/rejected": 7.414260387420654, + "step": 5516 + }, + { + "epoch": 0.9, + "learning_rate": 6.080914931432039e-06, + "logits/chosen": -1.048211932182312, + "logits/rejected": -1.0314288139343262, + "logps/chosen": -90.17202758789062, + "logps/rejected": -104.91184997558594, + "loss": 0.5888, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9719536304473877, + "rewards/margins": 0.4726959466934204, + "rewards/rejected": 1.4992576837539673, + "step": 5517 + }, + { + "epoch": 0.9, + "learning_rate": 6.079631718492569e-06, + "logits/chosen": -0.9826139211654663, + "logits/rejected": -1.065537929534912, + "logps/chosen": -103.68331909179688, + "logps/rejected": -108.68399047851562, + "loss": 0.7383, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.69026517868042, + "rewards/margins": 1.557403564453125, + "rewards/rejected": 4.132861614227295, + "step": 5518 + }, + { + "epoch": 0.9, + "learning_rate": 6.078348430960763e-06, + "logits/chosen": -1.0456124544143677, + "logits/rejected": -1.0564734935760498, + "logps/chosen": -56.08784103393555, + "logps/rejected": -44.3350715637207, + "loss": 0.6411, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2244945764541626, + "rewards/margins": -0.5886528491973877, + "rewards/rejected": 1.8131474256515503, + "step": 5519 + }, + { + "epoch": 0.9, + "learning_rate": 6.077065068925284e-06, + "logits/chosen": -0.8473370671272278, + "logits/rejected": -0.8473370671272278, + "logps/chosen": -33.45446014404297, + "logps/rejected": -33.45446014404297, + "loss": 0.7549, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1305387020111084, + "rewards/margins": 0.0, + "rewards/rejected": 2.1305387020111084, + "step": 5520 + }, + { + "epoch": 0.9, + "learning_rate": 6.0757816324748e-06, + "logits/chosen": -1.04331374168396, + "logits/rejected": -1.0462050437927246, + "logps/chosen": -41.56464385986328, + "logps/rejected": -33.88018035888672, + "loss": 0.2791, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7024929523468018, + "rewards/margins": 0.6342194080352783, + "rewards/rejected": 1.0682735443115234, + "step": 5521 + }, + { + "epoch": 0.9, + "learning_rate": 6.074498121697983e-06, + "logits/chosen": -1.2873632907867432, + "logits/rejected": -1.499934434890747, + "logps/chosen": -82.09234619140625, + "logps/rejected": -32.60871505737305, + "loss": 0.8892, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.847084879875183, + "rewards/margins": 1.3878735303878784, + "rewards/rejected": 0.4592113494873047, + "step": 5522 + }, + { + "epoch": 0.9, + "learning_rate": 6.073214536683515e-06, + "logits/chosen": -0.9580207467079163, + "logits/rejected": -0.9580207467079163, + "logps/chosen": -0.9082881212234497, + "logps/rejected": -0.9082881212234497, + "loss": 0.5102, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3722309172153473, + "rewards/margins": 0.0, + "rewards/rejected": 0.3722309172153473, + "step": 5523 + }, + { + "epoch": 0.9, + "learning_rate": 6.071930877520076e-06, + "logits/chosen": -1.386951208114624, + "logits/rejected": -1.4113770723342896, + "logps/chosen": -37.06684875488281, + "logps/rejected": -38.694122314453125, + "loss": 0.568, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8137741088867188, + "rewards/margins": -0.5035278797149658, + "rewards/rejected": 3.3173019886016846, + "step": 5524 + }, + { + "epoch": 0.9, + "learning_rate": 6.070647144296356e-06, + "logits/chosen": -1.228084921836853, + "logits/rejected": -1.1096001863479614, + "logps/chosen": -53.656002044677734, + "logps/rejected": -79.78386688232422, + "loss": 2.1823, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.390761137008667, + "rewards/margins": -1.699333906173706, + "rewards/rejected": 4.090095043182373, + "step": 5525 + }, + { + "epoch": 0.9, + "learning_rate": 6.0693633371010495e-06, + "logits/chosen": -1.2707244157791138, + "logits/rejected": -1.2837337255477905, + "logps/chosen": -51.822357177734375, + "logps/rejected": -57.742393493652344, + "loss": 1.0063, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6400039196014404, + "rewards/margins": -0.5632095336914062, + "rewards/rejected": 3.2032134532928467, + "step": 5526 + }, + { + "epoch": 0.9, + "learning_rate": 6.068079456022855e-06, + "logits/chosen": -0.7827269434928894, + "logits/rejected": -0.8156787753105164, + "logps/chosen": -66.22843933105469, + "logps/rejected": -38.45550537109375, + "loss": 1.2507, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5514694452285767, + "rewards/margins": -1.701391339302063, + "rewards/rejected": 2.2528607845306396, + "step": 5527 + }, + { + "epoch": 0.9, + "learning_rate": 6.066795501150477e-06, + "logits/chosen": -0.9423593878746033, + "logits/rejected": -0.9216620922088623, + "logps/chosen": -45.20199203491211, + "logps/rejected": -39.84773254394531, + "loss": 0.5308, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2836182117462158, + "rewards/margins": -0.5469176769256592, + "rewards/rejected": 1.830535888671875, + "step": 5528 + }, + { + "epoch": 0.9, + "learning_rate": 6.065511472572621e-06, + "logits/chosen": -1.0107980966567993, + "logits/rejected": -0.9733796715736389, + "logps/chosen": -59.34926223754883, + "logps/rejected": -55.16511535644531, + "loss": 0.5268, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6379817724227905, + "rewards/margins": -0.5767887830734253, + "rewards/rejected": 2.214770555496216, + "step": 5529 + }, + { + "epoch": 0.9, + "learning_rate": 6.064227370378007e-06, + "logits/chosen": -1.1512150764465332, + "logits/rejected": -1.1256238222122192, + "logps/chosen": -71.30165100097656, + "logps/rejected": -54.67387771606445, + "loss": 0.2837, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.366209506988525, + "rewards/margins": 1.0199527740478516, + "rewards/rejected": 3.346256732940674, + "step": 5530 + }, + { + "epoch": 0.9, + "learning_rate": 6.062943194655351e-06, + "logits/chosen": -0.9694201350212097, + "logits/rejected": -0.9472607970237732, + "logps/chosen": -74.38751220703125, + "logps/rejected": -53.589256286621094, + "loss": 0.5488, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.871826171875, + "rewards/margins": -0.44506311416625977, + "rewards/rejected": 2.3168892860412598, + "step": 5531 + }, + { + "epoch": 0.9, + "learning_rate": 6.061658945493378e-06, + "logits/chosen": -1.1359091997146606, + "logits/rejected": -1.1132880449295044, + "logps/chosen": -153.4456329345703, + "logps/rejected": -68.93701171875, + "loss": 0.0315, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.8417863845825195, + "rewards/margins": 2.7619385719299316, + "rewards/rejected": 2.079847812652588, + "step": 5532 + }, + { + "epoch": 0.9, + "learning_rate": 6.060374622980816e-06, + "logits/chosen": -0.9623807072639465, + "logits/rejected": -0.9131305813789368, + "logps/chosen": -64.85069274902344, + "logps/rejected": -56.18727111816406, + "loss": 0.4797, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7576690912246704, + "rewards/margins": -0.01921534538269043, + "rewards/rejected": 1.7768844366073608, + "step": 5533 + }, + { + "epoch": 0.9, + "learning_rate": 6.059090227206402e-06, + "logits/chosen": -1.3508379459381104, + "logits/rejected": -1.3764067888259888, + "logps/chosen": -93.02037048339844, + "logps/rejected": -129.74249267578125, + "loss": 0.5952, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.479609727859497, + "rewards/margins": -0.7803740501403809, + "rewards/rejected": 2.259983777999878, + "step": 5534 + }, + { + "epoch": 0.9, + "learning_rate": 6.057805758258874e-06, + "logits/chosen": -1.4316868782043457, + "logits/rejected": -1.4218828678131104, + "logps/chosen": -78.41983032226562, + "logps/rejected": -126.00654602050781, + "loss": 0.3279, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.845685005187988, + "rewards/margins": 0.09607696533203125, + "rewards/rejected": 5.749608039855957, + "step": 5535 + }, + { + "epoch": 0.9, + "learning_rate": 6.056521216226978e-06, + "logits/chosen": -1.33403480052948, + "logits/rejected": -1.2279144525527954, + "logps/chosen": -72.193603515625, + "logps/rejected": -43.861083984375, + "loss": 0.1847, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5878777503967285, + "rewards/margins": 1.0500179529190063, + "rewards/rejected": 1.5378597974777222, + "step": 5536 + }, + { + "epoch": 0.9, + "learning_rate": 6.055236601199462e-06, + "logits/chosen": -1.2939566373825073, + "logits/rejected": -1.4168200492858887, + "logps/chosen": -42.706626892089844, + "logps/rejected": -65.74336242675781, + "loss": 1.7732, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9572170972824097, + "rewards/margins": -3.515202522277832, + "rewards/rejected": 5.472419738769531, + "step": 5537 + }, + { + "epoch": 0.9, + "learning_rate": 6.053951913265083e-06, + "logits/chosen": -0.8452584743499756, + "logits/rejected": -0.8694325089454651, + "logps/chosen": -51.10186004638672, + "logps/rejected": -66.19778442382812, + "loss": 0.3882, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7131996154785156, + "rewards/margins": -0.14623332023620605, + "rewards/rejected": 2.8594329357147217, + "step": 5538 + }, + { + "epoch": 0.9, + "learning_rate": 6.052667152512598e-06, + "logits/chosen": -0.9829250574111938, + "logits/rejected": -0.9994696378707886, + "logps/chosen": -28.101776123046875, + "logps/rejected": -48.858123779296875, + "loss": 0.4994, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5038818717002869, + "rewards/margins": -0.44133830070495605, + "rewards/rejected": 0.9452201724052429, + "step": 5539 + }, + { + "epoch": 0.9, + "learning_rate": 6.051382319030773e-06, + "logits/chosen": -0.9527563452720642, + "logits/rejected": -0.9605107307434082, + "logps/chosen": -15.321313858032227, + "logps/rejected": -9.928083419799805, + "loss": 0.4896, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34064236283302307, + "rewards/margins": 0.44723808765411377, + "rewards/rejected": -0.1065957099199295, + "step": 5540 + }, + { + "epoch": 0.9, + "learning_rate": 6.050097412908379e-06, + "logits/chosen": -1.3063223361968994, + "logits/rejected": -1.317800760269165, + "logps/chosen": -94.56582641601562, + "logps/rejected": -73.58190155029297, + "loss": 0.4318, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.322613716125488, + "rewards/margins": 1.6832153797149658, + "rewards/rejected": 2.6393983364105225, + "step": 5541 + }, + { + "epoch": 0.9, + "learning_rate": 6.048812434234189e-06, + "logits/chosen": -1.4611858129501343, + "logits/rejected": -1.4411346912384033, + "logps/chosen": -149.44833374023438, + "logps/rejected": -57.28404235839844, + "loss": 0.9139, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.068751573562622, + "rewards/margins": -1.304670810699463, + "rewards/rejected": 3.373422384262085, + "step": 5542 + }, + { + "epoch": 0.9, + "learning_rate": 6.047527383096984e-06, + "logits/chosen": -1.1699786186218262, + "logits/rejected": -1.1691770553588867, + "logps/chosen": -49.77369689941406, + "logps/rejected": -41.21855163574219, + "loss": 0.487, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.962489366531372, + "rewards/margins": -0.4957549571990967, + "rewards/rejected": 2.4582443237304688, + "step": 5543 + }, + { + "epoch": 0.9, + "learning_rate": 6.046242259585549e-06, + "logits/chosen": -1.3735252618789673, + "logits/rejected": -1.2707931995391846, + "logps/chosen": -86.79936218261719, + "logps/rejected": -61.923805236816406, + "loss": 0.5278, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7634522914886475, + "rewards/margins": -0.381284236907959, + "rewards/rejected": 3.1447365283966064, + "step": 5544 + }, + { + "epoch": 0.9, + "learning_rate": 6.0449570637886736e-06, + "logits/chosen": -0.9292153716087341, + "logits/rejected": -0.8752558827400208, + "logps/chosen": -52.78879928588867, + "logps/rejected": -53.621585845947266, + "loss": 0.3186, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7142170667648315, + "rewards/margins": 0.11893463134765625, + "rewards/rejected": 1.5952824354171753, + "step": 5545 + }, + { + "epoch": 0.9, + "learning_rate": 6.043671795795152e-06, + "logits/chosen": -1.4565585851669312, + "logits/rejected": -1.5042811632156372, + "logps/chosen": -279.22174072265625, + "logps/rejected": -57.38990783691406, + "loss": 0.568, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.722625732421875, + "rewards/margins": 4.9685797691345215, + "rewards/rejected": 3.7540459632873535, + "step": 5546 + }, + { + "epoch": 0.9, + "learning_rate": 6.042386455693785e-06, + "logits/chosen": -1.2226762771606445, + "logits/rejected": -1.2226762771606445, + "logps/chosen": -32.24927520751953, + "logps/rejected": -32.24927520751953, + "loss": 1.4122, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.686837196350098, + "rewards/margins": 0.0, + "rewards/rejected": 4.686837196350098, + "step": 5547 + }, + { + "epoch": 0.9, + "learning_rate": 6.041101043573376e-06, + "logits/chosen": -1.0320709943771362, + "logits/rejected": -1.0193568468093872, + "logps/chosen": -23.137004852294922, + "logps/rejected": -27.656156539916992, + "loss": 0.8626, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9351219534873962, + "rewards/margins": 0.6817907691001892, + "rewards/rejected": 0.25333118438720703, + "step": 5548 + }, + { + "epoch": 0.9, + "learning_rate": 6.039815559522738e-06, + "logits/chosen": -1.1164993047714233, + "logits/rejected": -0.9512662291526794, + "logps/chosen": -97.56259155273438, + "logps/rejected": -71.41271209716797, + "loss": 0.0952, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.9041948318481445, + "rewards/margins": 2.6704583168029785, + "rewards/rejected": 4.233736515045166, + "step": 5549 + }, + { + "epoch": 0.9, + "learning_rate": 6.0385300036306825e-06, + "logits/chosen": -1.663488745689392, + "logits/rejected": -1.705918550491333, + "logps/chosen": -87.2289047241211, + "logps/rejected": -120.00857543945312, + "loss": 1.1437, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.413778066635132, + "rewards/margins": -2.1797034740448, + "rewards/rejected": 4.593481540679932, + "step": 5550 + }, + { + "epoch": 0.9, + "learning_rate": 6.03724437598603e-06, + "logits/chosen": -1.2358893156051636, + "logits/rejected": -1.1553083658218384, + "logps/chosen": -90.8869857788086, + "logps/rejected": -73.97078704833984, + "loss": 0.3052, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.38765025138855, + "rewards/margins": 0.17378520965576172, + "rewards/rejected": 2.213865041732788, + "step": 5551 + }, + { + "epoch": 0.9, + "learning_rate": 6.035958676677607e-06, + "logits/chosen": -1.8333849906921387, + "logits/rejected": -1.0596898794174194, + "logps/chosen": -103.11579132080078, + "logps/rejected": -119.25846099853516, + "loss": 1.0676, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.811041235923767, + "rewards/margins": -0.8259590864181519, + "rewards/rejected": 2.637000322341919, + "step": 5552 + }, + { + "epoch": 0.9, + "learning_rate": 6.03467290579424e-06, + "logits/chosen": -1.2579149007797241, + "logits/rejected": -1.1882761716842651, + "logps/chosen": -77.83396911621094, + "logps/rejected": -26.755043029785156, + "loss": 0.1747, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.525630235671997, + "rewards/margins": 2.146554946899414, + "rewards/rejected": 0.3790752589702606, + "step": 5553 + }, + { + "epoch": 0.9, + "learning_rate": 6.033387063424765e-06, + "logits/chosen": -1.0310895442962646, + "logits/rejected": -1.002132534980774, + "logps/chosen": -56.29449462890625, + "logps/rejected": -70.20233154296875, + "loss": 0.6172, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9035195112228394, + "rewards/margins": -0.7304786443710327, + "rewards/rejected": 2.633998155593872, + "step": 5554 + }, + { + "epoch": 0.9, + "learning_rate": 6.032101149658023e-06, + "logits/chosen": -0.9367484450340271, + "logits/rejected": -0.9462611079216003, + "logps/chosen": -83.35838317871094, + "logps/rejected": -80.74569702148438, + "loss": 1.3544, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5842087268829346, + "rewards/margins": -1.5064117908477783, + "rewards/rejected": 3.090620517730713, + "step": 5555 + }, + { + "epoch": 0.9, + "learning_rate": 6.030815164582857e-06, + "logits/chosen": -1.1554573774337769, + "logits/rejected": -1.1731895208358765, + "logps/chosen": -78.82251739501953, + "logps/rejected": -68.6033935546875, + "loss": 0.4318, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6371161937713623, + "rewards/margins": -0.13042831420898438, + "rewards/rejected": 2.7675445079803467, + "step": 5556 + }, + { + "epoch": 0.9, + "learning_rate": 6.029529108288118e-06, + "logits/chosen": -0.7174528241157532, + "logits/rejected": -0.7174528241157532, + "logps/chosen": -37.28116226196289, + "logps/rejected": -37.28116226196289, + "loss": 0.7549, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4381084442138672, + "rewards/margins": 0.0, + "rewards/rejected": 1.4381084442138672, + "step": 5557 + }, + { + "epoch": 0.9, + "learning_rate": 6.028242980862659e-06, + "logits/chosen": -1.4663655757904053, + "logits/rejected": -1.5256162881851196, + "logps/chosen": -101.58885955810547, + "logps/rejected": -91.88259887695312, + "loss": 1.2103, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6139869689941406, + "rewards/margins": -2.3198630809783936, + "rewards/rejected": 3.933850049972534, + "step": 5558 + }, + { + "epoch": 0.9, + "learning_rate": 6.026956782395338e-06, + "logits/chosen": -1.1416311264038086, + "logits/rejected": -1.1444605588912964, + "logps/chosen": -25.76994514465332, + "logps/rejected": -38.9317512512207, + "loss": 0.6878, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6215498447418213, + "rewards/margins": 0.6126132011413574, + "rewards/rejected": 2.008936643600464, + "step": 5559 + }, + { + "epoch": 0.9, + "learning_rate": 6.025670512975022e-06, + "logits/chosen": -0.7938490509986877, + "logits/rejected": -0.7938490509986877, + "logps/chosen": -29.01703453063965, + "logps/rejected": -29.01703453063965, + "loss": 1.2962, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9368349313735962, + "rewards/margins": 0.0, + "rewards/rejected": 1.9368349313735962, + "step": 5560 + }, + { + "epoch": 0.9, + "learning_rate": 6.0243841726905775e-06, + "logits/chosen": -1.3285224437713623, + "logits/rejected": -1.312897801399231, + "logps/chosen": -89.04747009277344, + "logps/rejected": -84.99687957763672, + "loss": 0.8673, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9490547180175781, + "rewards/margins": -1.0908095836639404, + "rewards/rejected": 2.0398643016815186, + "step": 5561 + }, + { + "epoch": 0.9, + "learning_rate": 6.023097761630879e-06, + "logits/chosen": -1.049970030784607, + "logits/rejected": -1.062340259552002, + "logps/chosen": -60.244834899902344, + "logps/rejected": -17.26910400390625, + "loss": 0.3532, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1410248279571533, + "rewards/margins": 0.6004484295845032, + "rewards/rejected": 0.5405763983726501, + "step": 5562 + }, + { + "epoch": 0.9, + "learning_rate": 6.021811279884807e-06, + "logits/chosen": -0.918125331401825, + "logits/rejected": -0.9269897937774658, + "logps/chosen": -78.31664276123047, + "logps/rejected": -45.95383834838867, + "loss": 0.4201, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5206489562988281, + "rewards/margins": -0.27066612243652344, + "rewards/rejected": 1.7913150787353516, + "step": 5563 + }, + { + "epoch": 0.9, + "learning_rate": 6.020524727541244e-06, + "logits/chosen": -1.111185073852539, + "logits/rejected": -1.1414676904678345, + "logps/chosen": -61.51377487182617, + "logps/rejected": -55.976619720458984, + "loss": 0.8779, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6442677974700928, + "rewards/margins": 0.81333327293396, + "rewards/rejected": 1.8309345245361328, + "step": 5564 + }, + { + "epoch": 0.9, + "learning_rate": 6.019238104689078e-06, + "logits/chosen": -1.3137669563293457, + "logits/rejected": -1.2442576885223389, + "logps/chosen": -43.379913330078125, + "logps/rejected": -53.42340850830078, + "loss": 0.7548, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2342941761016846, + "rewards/margins": -0.757361650466919, + "rewards/rejected": 2.9916558265686035, + "step": 5565 + }, + { + "epoch": 0.9, + "learning_rate": 6.017951411417203e-06, + "logits/chosen": -1.5754601955413818, + "logits/rejected": -1.5086054801940918, + "logps/chosen": -106.37556457519531, + "logps/rejected": -29.1253662109375, + "loss": 0.4415, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.863494873046875, + "rewards/margins": 1.5242828130722046, + "rewards/rejected": 1.3392120599746704, + "step": 5566 + }, + { + "epoch": 0.9, + "learning_rate": 6.016664647814518e-06, + "logits/chosen": -0.9286119937896729, + "logits/rejected": -0.9501217007637024, + "logps/chosen": -114.04975128173828, + "logps/rejected": -50.01946258544922, + "loss": 0.5008, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3586204051971436, + "rewards/margins": -0.4552178382873535, + "rewards/rejected": 2.813838243484497, + "step": 5567 + }, + { + "epoch": 0.9, + "learning_rate": 6.015377813969925e-06, + "logits/chosen": -1.0651769638061523, + "logits/rejected": -0.9304618835449219, + "logps/chosen": -76.95191955566406, + "logps/rejected": -61.04814529418945, + "loss": 0.8439, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.4598894119262695, + "rewards/margins": 4.173529624938965, + "rewards/rejected": 3.2863597869873047, + "step": 5568 + }, + { + "epoch": 0.9, + "learning_rate": 6.014090909972333e-06, + "logits/chosen": -1.3492639064788818, + "logits/rejected": -1.4140794277191162, + "logps/chosen": -193.64114379882812, + "logps/rejected": -164.1099090576172, + "loss": 1.077, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.181938171386719, + "rewards/margins": -1.908212661743164, + "rewards/rejected": 8.090150833129883, + "step": 5569 + }, + { + "epoch": 0.9, + "learning_rate": 6.012803935910655e-06, + "logits/chosen": -1.171352505683899, + "logits/rejected": -1.1338131427764893, + "logps/chosen": -59.96519088745117, + "logps/rejected": -64.90372467041016, + "loss": 0.8934, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.835824966430664, + "rewards/margins": -0.003056764602661133, + "rewards/rejected": 2.838881731033325, + "step": 5570 + }, + { + "epoch": 0.9, + "learning_rate": 6.011516891873808e-06, + "logits/chosen": -1.2229069471359253, + "logits/rejected": -1.1540248394012451, + "logps/chosen": -143.97789001464844, + "logps/rejected": -110.91654968261719, + "loss": 0.3169, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9675888419151306, + "rewards/margins": 0.22066271305084229, + "rewards/rejected": 0.7469261288642883, + "step": 5571 + }, + { + "epoch": 0.9, + "learning_rate": 6.0102297779507136e-06, + "logits/chosen": -1.2881855964660645, + "logits/rejected": -1.2575377225875854, + "logps/chosen": -40.60685729980469, + "logps/rejected": -72.17493438720703, + "loss": 0.9475, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1148598194122314, + "rewards/margins": 0.04060983657836914, + "rewards/rejected": 2.0742499828338623, + "step": 5572 + }, + { + "epoch": 0.9, + "learning_rate": 6.008942594230303e-06, + "logits/chosen": -1.3436262607574463, + "logits/rejected": -1.334701418876648, + "logps/chosen": -67.6292724609375, + "logps/rejected": -125.67971801757812, + "loss": 0.3268, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.630258083343506, + "rewards/margins": 0.1377410888671875, + "rewards/rejected": 6.492516994476318, + "step": 5573 + }, + { + "epoch": 0.9, + "learning_rate": 6.0076553408015035e-06, + "logits/chosen": -1.249634027481079, + "logits/rejected": -1.245566964149475, + "logps/chosen": -56.11897659301758, + "logps/rejected": -52.02144241333008, + "loss": 0.4785, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3911685943603516, + "rewards/margins": -0.3462791442871094, + "rewards/rejected": 1.737447738647461, + "step": 5574 + }, + { + "epoch": 0.9, + "learning_rate": 6.0063680177532555e-06, + "logits/chosen": -1.2784953117370605, + "logits/rejected": -1.1263153553009033, + "logps/chosen": -47.88656997680664, + "logps/rejected": -15.50712776184082, + "loss": 0.0651, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9275805950164795, + "rewards/margins": 2.1129672527313232, + "rewards/rejected": 0.8146133422851562, + "step": 5575 + }, + { + "epoch": 0.91, + "learning_rate": 6.0050806251745e-06, + "logits/chosen": -1.2847710847854614, + "logits/rejected": -1.2355049848556519, + "logps/chosen": -52.31148910522461, + "logps/rejected": -18.28488540649414, + "loss": 0.1092, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7740436792373657, + "rewards/margins": 1.6502529382705688, + "rewards/rejected": 0.12379074096679688, + "step": 5576 + }, + { + "epoch": 0.91, + "learning_rate": 6.003793163154184e-06, + "logits/chosen": -1.121519923210144, + "logits/rejected": -1.1148831844329834, + "logps/chosen": -48.87914276123047, + "logps/rejected": -50.53708267211914, + "loss": 0.4843, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0222434997558594, + "rewards/margins": -0.42894792556762695, + "rewards/rejected": 2.4511914253234863, + "step": 5577 + }, + { + "epoch": 0.91, + "learning_rate": 6.002505631781257e-06, + "logits/chosen": -0.9827877879142761, + "logits/rejected": -1.0014855861663818, + "logps/chosen": -113.89521789550781, + "logps/rejected": -53.29169845581055, + "loss": 1.6603, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5751479864120483, + "rewards/margins": -0.6723896265029907, + "rewards/rejected": 2.247537612915039, + "step": 5578 + }, + { + "epoch": 0.91, + "learning_rate": 6.001218031144677e-06, + "logits/chosen": -1.0543233156204224, + "logits/rejected": -1.053303837776184, + "logps/chosen": -81.7132339477539, + "logps/rejected": -49.627647399902344, + "loss": 0.921, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9164962768554688, + "rewards/margins": -0.7289597988128662, + "rewards/rejected": 2.645456075668335, + "step": 5579 + }, + { + "epoch": 0.91, + "learning_rate": 5.999930361333405e-06, + "logits/chosen": -0.8428307771682739, + "logits/rejected": -0.7073342204093933, + "logps/chosen": -47.42759704589844, + "logps/rejected": -35.94107437133789, + "loss": 0.4488, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0653904676437378, + "rewards/margins": 1.2679566144943237, + "rewards/rejected": -0.20256614685058594, + "step": 5580 + }, + { + "epoch": 0.91, + "learning_rate": 5.998642622436406e-06, + "logits/chosen": -1.200819969177246, + "logits/rejected": -1.1359868049621582, + "logps/chosen": -58.99761199951172, + "logps/rejected": -8.611634254455566, + "loss": 0.1092, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0848045349121094, + "rewards/margins": 1.5923471450805664, + "rewards/rejected": 0.49245738983154297, + "step": 5581 + }, + { + "epoch": 0.91, + "learning_rate": 5.997354814542649e-06, + "logits/chosen": -0.8791155219078064, + "logits/rejected": -0.8807899951934814, + "logps/chosen": -58.096923828125, + "logps/rejected": -60.561302185058594, + "loss": 0.9904, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9018287658691406, + "rewards/margins": 0.21154165267944336, + "rewards/rejected": 2.6902871131896973, + "step": 5582 + }, + { + "epoch": 0.91, + "learning_rate": 5.996066937741114e-06, + "logits/chosen": -1.5721163749694824, + "logits/rejected": -1.2353895902633667, + "logps/chosen": -109.7331314086914, + "logps/rejected": -27.893978118896484, + "loss": 0.386, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.675579071044922, + "rewards/margins": 7.233826160430908, + "rewards/rejected": 0.44175300002098083, + "step": 5583 + }, + { + "epoch": 0.91, + "learning_rate": 5.994778992120779e-06, + "logits/chosen": -1.117460012435913, + "logits/rejected": -1.0973128080368042, + "logps/chosen": -49.42760467529297, + "logps/rejected": -87.7970962524414, + "loss": 0.2502, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7687249183654785, + "rewards/margins": 1.2736176252365112, + "rewards/rejected": 1.4951072931289673, + "step": 5584 + }, + { + "epoch": 0.91, + "learning_rate": 5.993490977770626e-06, + "logits/chosen": -0.481131911277771, + "logits/rejected": -0.481131911277771, + "logps/chosen": -34.738094329833984, + "logps/rejected": -34.738094329833984, + "loss": 0.3654, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4470329284667969, + "rewards/margins": 0.0, + "rewards/rejected": 1.4470329284667969, + "step": 5585 + }, + { + "epoch": 0.91, + "learning_rate": 5.9922028947796495e-06, + "logits/chosen": -1.1404333114624023, + "logits/rejected": -1.088268756866455, + "logps/chosen": -56.810760498046875, + "logps/rejected": -83.36103820800781, + "loss": 1.3498, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1910767555236816, + "rewards/margins": -0.7444427013397217, + "rewards/rejected": 2.9355194568634033, + "step": 5586 + }, + { + "epoch": 0.91, + "learning_rate": 5.99091474323684e-06, + "logits/chosen": -1.1711020469665527, + "logits/rejected": -1.2313342094421387, + "logps/chosen": -70.53395080566406, + "logps/rejected": -120.59075164794922, + "loss": 1.3937, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2755982875823975, + "rewards/margins": -2.521655797958374, + "rewards/rejected": 4.7972540855407715, + "step": 5587 + }, + { + "epoch": 0.91, + "learning_rate": 5.989626523231198e-06, + "logits/chosen": -1.1036970615386963, + "logits/rejected": -1.1031889915466309, + "logps/chosen": -4.703762531280518, + "logps/rejected": -14.08408260345459, + "loss": 1.1372, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2323325127363205, + "rewards/margins": -0.8028212189674377, + "rewards/rejected": 1.0351537466049194, + "step": 5588 + }, + { + "epoch": 0.91, + "learning_rate": 5.9883382348517275e-06, + "logits/chosen": -1.0777587890625, + "logits/rejected": -0.9396005272865295, + "logps/chosen": -254.45809936523438, + "logps/rejected": -48.54756164550781, + "loss": 0.086, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.991049289703369, + "rewards/margins": 6.738970756530762, + "rewards/rejected": 1.2520782947540283, + "step": 5589 + }, + { + "epoch": 0.91, + "learning_rate": 5.987049878187437e-06, + "logits/chosen": -1.4436688423156738, + "logits/rejected": -1.4631316661834717, + "logps/chosen": -170.405029296875, + "logps/rejected": -98.83934783935547, + "loss": 0.1901, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.356170654296875, + "rewards/margins": 0.7801047563552856, + "rewards/rejected": 1.5760658979415894, + "step": 5590 + }, + { + "epoch": 0.91, + "learning_rate": 5.985761453327338e-06, + "logits/chosen": -1.3476356267929077, + "logits/rejected": -1.3039512634277344, + "logps/chosen": -106.47047424316406, + "logps/rejected": -170.72103881835938, + "loss": 1.1683, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.542035102844238, + "rewards/margins": -2.2326841354370117, + "rewards/rejected": 7.77471923828125, + "step": 5591 + }, + { + "epoch": 0.91, + "learning_rate": 5.98447296036045e-06, + "logits/chosen": -1.5786863565444946, + "logits/rejected": -1.6073920726776123, + "logps/chosen": -81.72847747802734, + "logps/rejected": -141.65902709960938, + "loss": 0.507, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.736515045166016, + "rewards/margins": -0.5627374649047852, + "rewards/rejected": 6.299252510070801, + "step": 5592 + }, + { + "epoch": 0.91, + "learning_rate": 5.983184399375797e-06, + "logits/chosen": -1.2440159320831299, + "logits/rejected": -1.3259356021881104, + "logps/chosen": -189.13824462890625, + "logps/rejected": -114.21240234375, + "loss": 1.476, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.780621290206909, + "rewards/margins": -2.8736636638641357, + "rewards/rejected": 6.654284954071045, + "step": 5593 + }, + { + "epoch": 0.91, + "learning_rate": 5.9818957704624046e-06, + "logits/chosen": -1.4667412042617798, + "logits/rejected": -1.415407657623291, + "logps/chosen": -58.14220428466797, + "logps/rejected": -28.62787437438965, + "loss": 0.0429, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5666024684906006, + "rewards/margins": 2.9570600986480713, + "rewards/rejected": -0.39045754075050354, + "step": 5594 + }, + { + "epoch": 0.91, + "learning_rate": 5.980607073709305e-06, + "logits/chosen": -0.7084346413612366, + "logits/rejected": -0.7031826972961426, + "logps/chosen": -3.439410924911499, + "logps/rejected": -2.293285846710205, + "loss": 0.5745, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23763678967952728, + "rewards/margins": 0.02250450849533081, + "rewards/rejected": 0.21513228118419647, + "step": 5595 + }, + { + "epoch": 0.91, + "learning_rate": 5.979318309205535e-06, + "logits/chosen": -1.4974380731582642, + "logits/rejected": -1.4997326135635376, + "logps/chosen": -98.01937866210938, + "logps/rejected": -157.09185791015625, + "loss": 0.1372, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.829432964324951, + "rewards/margins": 1.1830124855041504, + "rewards/rejected": 4.646420478820801, + "step": 5596 + }, + { + "epoch": 0.91, + "learning_rate": 5.978029477040136e-06, + "logits/chosen": -0.9388998746871948, + "logits/rejected": -0.9388998746871948, + "logps/chosen": -34.92715072631836, + "logps/rejected": -34.92715072631836, + "loss": 0.7579, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0382003784179688, + "rewards/margins": 0.0, + "rewards/rejected": 1.0382003784179688, + "step": 5597 + }, + { + "epoch": 0.91, + "learning_rate": 5.9767405773021545e-06, + "logits/chosen": -1.3083910942077637, + "logits/rejected": -1.226090908050537, + "logps/chosen": -63.98500442504883, + "logps/rejected": -26.469215393066406, + "loss": 0.3172, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3301823139190674, + "rewards/margins": 0.404140830039978, + "rewards/rejected": 1.9260414838790894, + "step": 5598 + }, + { + "epoch": 0.91, + "learning_rate": 5.975451610080643e-06, + "logits/chosen": -1.1176190376281738, + "logits/rejected": -1.0545294284820557, + "logps/chosen": -50.577293395996094, + "logps/rejected": -37.91008377075195, + "loss": 0.675, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.250962972640991, + "rewards/margins": 0.6974278688430786, + "rewards/rejected": 1.5535351037979126, + "step": 5599 + }, + { + "epoch": 0.91, + "learning_rate": 5.974162575464652e-06, + "logits/chosen": -1.0844411849975586, + "logits/rejected": -1.1051757335662842, + "logps/chosen": -58.50437927246094, + "logps/rejected": -35.86907958984375, + "loss": 0.8561, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2289986610412598, + "rewards/margins": -0.2889838218688965, + "rewards/rejected": 2.5179824829101562, + "step": 5600 + }, + { + "epoch": 0.91, + "learning_rate": 5.972873473543247e-06, + "logits/chosen": -1.0990090370178223, + "logits/rejected": -0.8226578235626221, + "logps/chosen": -61.68126678466797, + "logps/rejected": -42.614559173583984, + "loss": 0.8808, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9292152523994446, + "rewards/margins": -1.2310841083526611, + "rewards/rejected": 2.160299301147461, + "step": 5601 + }, + { + "epoch": 0.91, + "learning_rate": 5.971584304405489e-06, + "logits/chosen": -0.7185521125793457, + "logits/rejected": -0.7185521125793457, + "logps/chosen": -37.81945037841797, + "logps/rejected": -37.81945037841797, + "loss": 0.5148, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.379097819328308, + "rewards/margins": 0.0, + "rewards/rejected": 1.379097819328308, + "step": 5602 + }, + { + "epoch": 0.91, + "learning_rate": 5.97029506814045e-06, + "logits/chosen": -1.3444573879241943, + "logits/rejected": -1.4108189344406128, + "logps/chosen": -77.08200073242188, + "logps/rejected": -103.18418884277344, + "loss": 1.8932, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3344886302948, + "rewards/margins": -0.5459632873535156, + "rewards/rejected": 2.8804519176483154, + "step": 5603 + }, + { + "epoch": 0.91, + "learning_rate": 5.9690057648372015e-06, + "logits/chosen": -1.1641185283660889, + "logits/rejected": -1.2736443281173706, + "logps/chosen": -58.718814849853516, + "logps/rejected": -83.39193725585938, + "loss": 1.1827, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5016323328018188, + "rewards/margins": -2.1329774856567383, + "rewards/rejected": 3.6346099376678467, + "step": 5604 + }, + { + "epoch": 0.91, + "learning_rate": 5.967716394584824e-06, + "logits/chosen": -0.8668118715286255, + "logits/rejected": -0.8652752041816711, + "logps/chosen": -2.127359390258789, + "logps/rejected": -8.071417808532715, + "loss": 0.4149, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.23068204522132874, + "rewards/margins": -0.15552926063537598, + "rewards/rejected": 0.3862113058567047, + "step": 5605 + }, + { + "epoch": 0.91, + "learning_rate": 5.9664269574724e-06, + "logits/chosen": -1.0860093832015991, + "logits/rejected": -0.8738396763801575, + "logps/chosen": -66.77770233154297, + "logps/rejected": -16.283401489257812, + "loss": 0.0586, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7198235988616943, + "rewards/margins": 2.5625736713409424, + "rewards/rejected": 0.1572500318288803, + "step": 5606 + }, + { + "epoch": 0.91, + "learning_rate": 5.965137453589018e-06, + "logits/chosen": -0.7386605739593506, + "logits/rejected": -0.7126845717430115, + "logps/chosen": -22.121530532836914, + "logps/rejected": -3.2477359771728516, + "loss": 1.7514, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7230057120323181, + "rewards/margins": 0.5049557089805603, + "rewards/rejected": 0.2180500030517578, + "step": 5607 + }, + { + "epoch": 0.91, + "learning_rate": 5.96384788302377e-06, + "logits/chosen": -1.3903254270553589, + "logits/rejected": -1.4097249507904053, + "logps/chosen": -94.35831451416016, + "logps/rejected": -59.587806701660156, + "loss": 0.5519, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3318917751312256, + "rewards/margins": -0.6406295299530029, + "rewards/rejected": 3.9725213050842285, + "step": 5608 + }, + { + "epoch": 0.91, + "learning_rate": 5.9625582458657536e-06, + "logits/chosen": -0.9704390168190002, + "logits/rejected": -0.9874462485313416, + "logps/chosen": -67.84317016601562, + "logps/rejected": -59.53730773925781, + "loss": 0.483, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.412966251373291, + "rewards/margins": 0.5167199373245239, + "rewards/rejected": 1.896246314048767, + "step": 5609 + }, + { + "epoch": 0.91, + "learning_rate": 5.96126854220407e-06, + "logits/chosen": -1.2535533905029297, + "logits/rejected": -1.297616958618164, + "logps/chosen": -42.88703536987305, + "logps/rejected": -116.00873565673828, + "loss": 3.3843, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.28094220161438, + "rewards/margins": -4.215980529785156, + "rewards/rejected": 6.496922969818115, + "step": 5610 + }, + { + "epoch": 0.91, + "learning_rate": 5.959978772127826e-06, + "logits/chosen": -1.3095892667770386, + "logits/rejected": -1.302124261856079, + "logps/chosen": -73.80062866210938, + "logps/rejected": -40.1152458190918, + "loss": 0.3045, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4367622137069702, + "rewards/margins": 0.19113194942474365, + "rewards/rejected": 1.2456302642822266, + "step": 5611 + }, + { + "epoch": 0.91, + "learning_rate": 5.958688935726132e-06, + "logits/chosen": -1.1800423860549927, + "logits/rejected": -1.2025686502456665, + "logps/chosen": -84.00737762451172, + "logps/rejected": -116.70753479003906, + "loss": 0.5243, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.845966339111328, + "rewards/margins": 0.40015172958374023, + "rewards/rejected": 2.445814609527588, + "step": 5612 + }, + { + "epoch": 0.91, + "learning_rate": 5.957399033088103e-06, + "logits/chosen": -1.3614245653152466, + "logits/rejected": -1.3511189222335815, + "logps/chosen": -222.8976287841797, + "logps/rejected": -125.75665283203125, + "loss": 0.2824, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8916901350021362, + "rewards/margins": 0.3245788812637329, + "rewards/rejected": 1.5671112537384033, + "step": 5613 + }, + { + "epoch": 0.91, + "learning_rate": 5.956109064302862e-06, + "logits/chosen": -1.3547719717025757, + "logits/rejected": -1.3627806901931763, + "logps/chosen": -112.04810333251953, + "logps/rejected": -115.40707397460938, + "loss": 1.7076, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.284442901611328, + "rewards/margins": -3.352301597595215, + "rewards/rejected": 8.636744499206543, + "step": 5614 + }, + { + "epoch": 0.91, + "learning_rate": 5.95481902945953e-06, + "logits/chosen": -1.570939064025879, + "logits/rejected": -1.5257599353790283, + "logps/chosen": -92.66223907470703, + "logps/rejected": -47.629661560058594, + "loss": 0.0862, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.069649696350098, + "rewards/margins": 2.4615540504455566, + "rewards/rejected": 2.608095645904541, + "step": 5615 + }, + { + "epoch": 0.91, + "learning_rate": 5.953528928647238e-06, + "logits/chosen": -1.0766371488571167, + "logits/rejected": -1.2230210304260254, + "logps/chosen": -57.554351806640625, + "logps/rejected": -117.33613586425781, + "loss": 1.9072, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6332275867462158, + "rewards/margins": -3.076045274734497, + "rewards/rejected": 4.709272861480713, + "step": 5616 + }, + { + "epoch": 0.91, + "learning_rate": 5.952238761955117e-06, + "logits/chosen": -1.3197669982910156, + "logits/rejected": -1.242322564125061, + "logps/chosen": -152.98899841308594, + "logps/rejected": -94.82756042480469, + "loss": 0.1471, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.378756999969482, + "rewards/margins": 1.0961685180664062, + "rewards/rejected": 6.282588481903076, + "step": 5617 + }, + { + "epoch": 0.91, + "learning_rate": 5.95094852947231e-06, + "logits/chosen": -1.1580919027328491, + "logits/rejected": -1.1358362436294556, + "logps/chosen": -48.0037841796875, + "logps/rejected": -8.12441349029541, + "loss": 0.5522, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3328903913497925, + "rewards/margins": 0.6086239218711853, + "rewards/rejected": 0.7242664694786072, + "step": 5618 + }, + { + "epoch": 0.91, + "learning_rate": 5.949658231287958e-06, + "logits/chosen": -1.1564908027648926, + "logits/rejected": -1.2284399271011353, + "logps/chosen": -113.46479034423828, + "logps/rejected": -43.99882507324219, + "loss": 0.683, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.448072075843811, + "rewards/margins": -0.6940690279006958, + "rewards/rejected": 2.142141103744507, + "step": 5619 + }, + { + "epoch": 0.91, + "learning_rate": 5.948367867491207e-06, + "logits/chosen": -0.9169005155563354, + "logits/rejected": -0.9169005155563354, + "logps/chosen": -1.2865889072418213, + "logps/rejected": -1.2865889072418213, + "loss": 0.6906, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2637427747249603, + "rewards/margins": 0.0, + "rewards/rejected": 0.2637427747249603, + "step": 5620 + }, + { + "epoch": 0.91, + "learning_rate": 5.947077438171211e-06, + "logits/chosen": -1.1151378154754639, + "logits/rejected": -1.1023117303848267, + "logps/chosen": -48.307586669921875, + "logps/rejected": -56.253211975097656, + "loss": 1.1866, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.884005069732666, + "rewards/margins": 1.1284394264221191, + "rewards/rejected": 1.7555656433105469, + "step": 5621 + }, + { + "epoch": 0.91, + "learning_rate": 5.9457869434171236e-06, + "logits/chosen": -1.2210428714752197, + "logits/rejected": -1.2210428714752197, + "logps/chosen": -53.950843811035156, + "logps/rejected": -53.950843811035156, + "loss": 0.3992, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.3994269371032715, + "rewards/margins": 0.0, + "rewards/rejected": 4.3994269371032715, + "step": 5622 + }, + { + "epoch": 0.91, + "learning_rate": 5.944496383318109e-06, + "logits/chosen": -1.2549370527267456, + "logits/rejected": -1.2770484685897827, + "logps/chosen": -90.17112731933594, + "logps/rejected": -80.51876831054688, + "loss": 0.5315, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.539630174636841, + "rewards/margins": -0.6299118995666504, + "rewards/rejected": 3.169542074203491, + "step": 5623 + }, + { + "epoch": 0.91, + "learning_rate": 5.94320575796333e-06, + "logits/chosen": -0.7213782072067261, + "logits/rejected": -0.7039673924446106, + "logps/chosen": -12.789265632629395, + "logps/rejected": -2.390519618988037, + "loss": 0.3943, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6462723612785339, + "rewards/margins": 0.2915956676006317, + "rewards/rejected": 0.3546766936779022, + "step": 5624 + }, + { + "epoch": 0.91, + "learning_rate": 5.941915067441959e-06, + "logits/chosen": -0.717890739440918, + "logits/rejected": -0.717890739440918, + "logps/chosen": -69.58969116210938, + "logps/rejected": -69.58969116210938, + "loss": 0.3698, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1090545654296875, + "rewards/margins": 0.0, + "rewards/rejected": 1.1090545654296875, + "step": 5625 + }, + { + "epoch": 0.91, + "learning_rate": 5.94062431184317e-06, + "logits/chosen": -1.5730617046356201, + "logits/rejected": -1.489418387413025, + "logps/chosen": -133.1445770263672, + "logps/rejected": -109.91817474365234, + "loss": 0.0915, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.8431077003479, + "rewards/margins": 2.6628167629241943, + "rewards/rejected": 2.180290937423706, + "step": 5626 + }, + { + "epoch": 0.91, + "learning_rate": 5.939333491256141e-06, + "logits/chosen": -1.0379422903060913, + "logits/rejected": -1.1165720224380493, + "logps/chosen": -57.82072830200195, + "logps/rejected": -138.62750244140625, + "loss": 1.4972, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.980280637741089, + "rewards/margins": -2.756516218185425, + "rewards/rejected": 6.736796855926514, + "step": 5627 + }, + { + "epoch": 0.91, + "learning_rate": 5.938042605770054e-06, + "logits/chosen": -1.1037935018539429, + "logits/rejected": -1.1383312940597534, + "logps/chosen": -66.81330108642578, + "logps/rejected": -128.02378845214844, + "loss": 0.3219, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.920356035232544, + "rewards/margins": 1.2386481761932373, + "rewards/rejected": 0.6817077994346619, + "step": 5628 + }, + { + "epoch": 0.91, + "learning_rate": 5.9367516554741e-06, + "logits/chosen": -1.3421162366867065, + "logits/rejected": -1.3558547496795654, + "logps/chosen": -105.72866821289062, + "logps/rejected": -157.25933837890625, + "loss": 2.8404, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0225465297698975, + "rewards/margins": -2.718945264816284, + "rewards/rejected": 5.741491794586182, + "step": 5629 + }, + { + "epoch": 0.91, + "learning_rate": 5.93546064045747e-06, + "logits/chosen": -1.3459552526474, + "logits/rejected": -1.3241660594940186, + "logps/chosen": -99.01569366455078, + "logps/rejected": -91.56583404541016, + "loss": 0.5812, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.216268301010132, + "rewards/margins": -0.6268644332885742, + "rewards/rejected": 3.843132734298706, + "step": 5630 + }, + { + "epoch": 0.91, + "learning_rate": 5.934169560809361e-06, + "logits/chosen": -1.133534550666809, + "logits/rejected": -0.979922354221344, + "logps/chosen": -58.14934158325195, + "logps/rejected": -24.880821228027344, + "loss": 0.241, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9705013036727905, + "rewards/margins": 2.267228603363037, + "rewards/rejected": -0.29672738909721375, + "step": 5631 + }, + { + "epoch": 0.91, + "learning_rate": 5.9328784166189744e-06, + "logits/chosen": -1.0637049674987793, + "logits/rejected": -1.0121667385101318, + "logps/chosen": -64.435791015625, + "logps/rejected": -49.954383850097656, + "loss": 0.285, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6363160610198975, + "rewards/margins": 0.8488458395004272, + "rewards/rejected": 1.7874702215194702, + "step": 5632 + }, + { + "epoch": 0.91, + "learning_rate": 5.931587207975517e-06, + "logits/chosen": -0.867501974105835, + "logits/rejected": -0.9456762671470642, + "logps/chosen": -71.22428894042969, + "logps/rejected": -92.72898864746094, + "loss": 0.3827, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8765504360198975, + "rewards/margins": -0.085113525390625, + "rewards/rejected": 2.9616639614105225, + "step": 5633 + }, + { + "epoch": 0.91, + "learning_rate": 5.930295934968197e-06, + "logits/chosen": -0.9210451245307922, + "logits/rejected": -0.9230759739875793, + "logps/chosen": -1.3881722688674927, + "logps/rejected": -2.107290506362915, + "loss": 2.5087, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3039540946483612, + "rewards/margins": 0.08477625250816345, + "rewards/rejected": 0.21917784214019775, + "step": 5634 + }, + { + "epoch": 0.91, + "learning_rate": 5.929004597686232e-06, + "logits/chosen": -1.3346019983291626, + "logits/rejected": -1.2396413087844849, + "logps/chosen": -88.00726318359375, + "logps/rejected": -59.35126495361328, + "loss": 0.0151, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.846850872039795, + "rewards/margins": 3.694870948791504, + "rewards/rejected": 3.151979923248291, + "step": 5635 + }, + { + "epoch": 0.91, + "learning_rate": 5.927713196218839e-06, + "logits/chosen": -1.5436561107635498, + "logits/rejected": -1.538849949836731, + "logps/chosen": -86.85467529296875, + "logps/rejected": -165.4555206298828, + "loss": 2.3245, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2298660278320312, + "rewards/margins": -3.7633438110351562, + "rewards/rejected": 5.9932098388671875, + "step": 5636 + }, + { + "epoch": 0.91, + "learning_rate": 5.926421730655242e-06, + "logits/chosen": -0.7791762948036194, + "logits/rejected": -0.7652865052223206, + "logps/chosen": -15.12014102935791, + "logps/rejected": -7.4224443435668945, + "loss": 0.5516, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0041377544403076, + "rewards/margins": 0.6794363260269165, + "rewards/rejected": 0.3247013986110687, + "step": 5637 + }, + { + "epoch": 0.92, + "learning_rate": 5.92513020108467e-06, + "logits/chosen": -1.0019341707229614, + "logits/rejected": -1.038151741027832, + "logps/chosen": -87.94190216064453, + "logps/rejected": -83.57366180419922, + "loss": 1.8895, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5929237604141235, + "rewards/margins": -3.7438769340515137, + "rewards/rejected": 5.336800575256348, + "step": 5638 + }, + { + "epoch": 0.92, + "learning_rate": 5.923838607596354e-06, + "logits/chosen": -1.3460057973861694, + "logits/rejected": -1.3460057973861694, + "logps/chosen": -29.976633071899414, + "logps/rejected": -29.976633071899414, + "loss": 0.3571, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8789784908294678, + "rewards/margins": 0.0, + "rewards/rejected": 2.8789784908294678, + "step": 5639 + }, + { + "epoch": 0.92, + "learning_rate": 5.922546950279532e-06, + "logits/chosen": -1.5783021450042725, + "logits/rejected": -1.4816557168960571, + "logps/chosen": -129.05014038085938, + "logps/rejected": -75.00132751464844, + "loss": 0.1027, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.605813503265381, + "rewards/margins": 1.5759825706481934, + "rewards/rejected": 4.0298309326171875, + "step": 5640 + }, + { + "epoch": 0.92, + "learning_rate": 5.9212552292234436e-06, + "logits/chosen": -1.1475493907928467, + "logits/rejected": -1.2078595161437988, + "logps/chosen": -73.69319152832031, + "logps/rejected": -128.50387573242188, + "loss": 0.6408, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.487323760986328, + "rewards/margins": 0.012415170669555664, + "rewards/rejected": 2.4749085903167725, + "step": 5641 + }, + { + "epoch": 0.92, + "learning_rate": 5.919963444517338e-06, + "logits/chosen": -0.8580256700515747, + "logits/rejected": -0.8758279085159302, + "logps/chosen": -66.41883850097656, + "logps/rejected": -104.06309509277344, + "loss": 0.1954, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.635969638824463, + "rewards/margins": 0.8707481622695923, + "rewards/rejected": 1.7652214765548706, + "step": 5642 + }, + { + "epoch": 0.92, + "learning_rate": 5.918671596250461e-06, + "logits/chosen": -0.9829667210578918, + "logits/rejected": -0.9863288402557373, + "logps/chosen": -26.64165496826172, + "logps/rejected": -82.44834899902344, + "loss": 1.2204, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2621742486953735, + "rewards/margins": -1.2509583234786987, + "rewards/rejected": 2.5131325721740723, + "step": 5643 + }, + { + "epoch": 0.92, + "learning_rate": 5.917379684512071e-06, + "logits/chosen": -1.3921127319335938, + "logits/rejected": -1.1548091173171997, + "logps/chosen": -92.85450744628906, + "logps/rejected": -76.13499450683594, + "loss": 0.0226, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.381068706512451, + "rewards/margins": 3.8555643558502197, + "rewards/rejected": 3.5255043506622314, + "step": 5644 + }, + { + "epoch": 0.92, + "learning_rate": 5.9160877093914234e-06, + "logits/chosen": -1.0346282720565796, + "logits/rejected": -1.078336477279663, + "logps/chosen": -82.25927734375, + "logps/rejected": -117.2392349243164, + "loss": 0.5354, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3865203857421875, + "rewards/margins": -0.28144311904907227, + "rewards/rejected": 2.6679635047912598, + "step": 5645 + }, + { + "epoch": 0.92, + "learning_rate": 5.914795670977785e-06, + "logits/chosen": -1.1173776388168335, + "logits/rejected": -1.1173776388168335, + "logps/chosen": -71.33645629882812, + "logps/rejected": -71.33645629882812, + "loss": 0.6783, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5015435218811035, + "rewards/margins": 0.0, + "rewards/rejected": 2.5015435218811035, + "step": 5646 + }, + { + "epoch": 0.92, + "learning_rate": 5.913503569360422e-06, + "logits/chosen": -1.0937144756317139, + "logits/rejected": -1.0357013940811157, + "logps/chosen": -62.523765563964844, + "logps/rejected": -106.8960952758789, + "loss": 1.8139, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7075157165527344, + "rewards/margins": 0.8151565790176392, + "rewards/rejected": 1.8923591375350952, + "step": 5647 + }, + { + "epoch": 0.92, + "learning_rate": 5.912211404628604e-06, + "logits/chosen": -0.7695914506912231, + "logits/rejected": -0.9566776156425476, + "logps/chosen": -42.49517059326172, + "logps/rejected": -251.35736083984375, + "loss": 1.0646, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9860950708389282, + "rewards/margins": -0.3080791234970093, + "rewards/rejected": 2.2941741943359375, + "step": 5648 + }, + { + "epoch": 0.92, + "learning_rate": 5.91091917687161e-06, + "logits/chosen": -1.6233378648757935, + "logits/rejected": -1.5401932001113892, + "logps/chosen": -113.50625610351562, + "logps/rejected": -111.00240325927734, + "loss": 0.1624, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.99223792552948, + "rewards/margins": 1.0621063709259033, + "rewards/rejected": 0.9301315546035767, + "step": 5649 + }, + { + "epoch": 0.92, + "learning_rate": 5.909626886178721e-06, + "logits/chosen": -0.7951138019561768, + "logits/rejected": -0.7946872115135193, + "logps/chosen": -25.4551944732666, + "logps/rejected": -18.110353469848633, + "loss": 0.8953, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.07353534549474716, + "rewards/margins": -0.523999035358429, + "rewards/rejected": 0.5975343585014343, + "step": 5650 + }, + { + "epoch": 0.92, + "learning_rate": 5.90833453263922e-06, + "logits/chosen": -1.038393259048462, + "logits/rejected": -1.038393259048462, + "logps/chosen": -115.37200164794922, + "logps/rejected": -115.37200164794922, + "loss": 0.3694, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8461556434631348, + "rewards/margins": 0.0, + "rewards/rejected": 2.8461556434631348, + "step": 5651 + }, + { + "epoch": 0.92, + "learning_rate": 5.907042116342399e-06, + "logits/chosen": -1.3249869346618652, + "logits/rejected": -1.2741937637329102, + "logps/chosen": -71.41117095947266, + "logps/rejected": -93.13285064697266, + "loss": 0.3677, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4868186712265015, + "rewards/margins": 0.03208613395690918, + "rewards/rejected": 1.4547325372695923, + "step": 5652 + }, + { + "epoch": 0.92, + "learning_rate": 5.905749637377549e-06, + "logits/chosen": -1.1830253601074219, + "logits/rejected": -1.1238975524902344, + "logps/chosen": -133.86294555664062, + "logps/rejected": -133.04454040527344, + "loss": 0.5345, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.796063423156738, + "rewards/margins": -0.3964247703552246, + "rewards/rejected": 5.192488193511963, + "step": 5653 + }, + { + "epoch": 0.92, + "learning_rate": 5.90445709583397e-06, + "logits/chosen": -1.1540406942367554, + "logits/rejected": -1.10901939868927, + "logps/chosen": -87.95449829101562, + "logps/rejected": -53.668582916259766, + "loss": 0.6544, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3514000177383423, + "rewards/margins": -0.18288159370422363, + "rewards/rejected": 1.534281611442566, + "step": 5654 + }, + { + "epoch": 0.92, + "learning_rate": 5.903164491800963e-06, + "logits/chosen": -1.1874852180480957, + "logits/rejected": -1.1741631031036377, + "logps/chosen": -52.1249885559082, + "logps/rejected": -93.61067962646484, + "loss": 0.3645, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.754638671875, + "rewards/margins": -0.06445235013961792, + "rewards/rejected": 0.8190910220146179, + "step": 5655 + }, + { + "epoch": 0.92, + "learning_rate": 5.901871825367835e-06, + "logits/chosen": -1.1621273756027222, + "logits/rejected": -1.1322206258773804, + "logps/chosen": -68.91610717773438, + "logps/rejected": -114.86145782470703, + "loss": 0.0507, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7135987281799316, + "rewards/margins": 2.3561835289001465, + "rewards/rejected": 1.3574150800704956, + "step": 5656 + }, + { + "epoch": 0.92, + "learning_rate": 5.900579096623899e-06, + "logits/chosen": -0.7270837426185608, + "logits/rejected": -0.7285206317901611, + "logps/chosen": -48.987388610839844, + "logps/rejected": -39.025699615478516, + "loss": 0.8151, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5787239074707031, + "rewards/margins": -0.14360356330871582, + "rewards/rejected": 1.722327470779419, + "step": 5657 + }, + { + "epoch": 0.92, + "learning_rate": 5.899286305658468e-06, + "logits/chosen": -1.0094547271728516, + "logits/rejected": -0.9851205348968506, + "logps/chosen": -49.43736267089844, + "logps/rejected": -54.26303482055664, + "loss": 1.0594, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.652437686920166, + "rewards/margins": -0.4200136661529541, + "rewards/rejected": 3.07245135307312, + "step": 5658 + }, + { + "epoch": 0.92, + "learning_rate": 5.8979934525608625e-06, + "logits/chosen": -1.413729190826416, + "logits/rejected": -1.3539906740188599, + "logps/chosen": -91.8740234375, + "logps/rejected": -50.57041931152344, + "loss": 0.0689, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.377109050750732, + "rewards/margins": 2.0246188640594482, + "rewards/rejected": 2.352490186691284, + "step": 5659 + }, + { + "epoch": 0.92, + "learning_rate": 5.8967005374204065e-06, + "logits/chosen": -1.2740706205368042, + "logits/rejected": -1.2360177040100098, + "logps/chosen": -85.71290588378906, + "logps/rejected": -88.44879913330078, + "loss": 0.966, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4233672618865967, + "rewards/margins": 0.18547511100769043, + "rewards/rejected": 2.2378921508789062, + "step": 5660 + }, + { + "epoch": 0.92, + "learning_rate": 5.8954075603264285e-06, + "logits/chosen": -1.180696725845337, + "logits/rejected": -1.1533992290496826, + "logps/chosen": -52.740840911865234, + "logps/rejected": -57.1838264465332, + "loss": 0.7678, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.945608139038086, + "rewards/margins": -0.5868301391601562, + "rewards/rejected": 2.532438278198242, + "step": 5661 + }, + { + "epoch": 0.92, + "learning_rate": 5.894114521368259e-06, + "logits/chosen": -1.5340898036956787, + "logits/rejected": -1.5457271337509155, + "logps/chosen": -145.08306884765625, + "logps/rejected": -73.01130676269531, + "loss": 0.0315, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.087323188781738, + "rewards/margins": 4.1845245361328125, + "rewards/rejected": 2.9027984142303467, + "step": 5662 + }, + { + "epoch": 0.92, + "learning_rate": 5.892821420635237e-06, + "logits/chosen": -1.5881248712539673, + "logits/rejected": -1.5881248712539673, + "logps/chosen": -45.1701545715332, + "logps/rejected": -45.1701545715332, + "loss": 0.4421, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.364841938018799, + "rewards/margins": 0.0, + "rewards/rejected": 4.364841938018799, + "step": 5663 + }, + { + "epoch": 0.92, + "learning_rate": 5.891528258216703e-06, + "logits/chosen": -1.4439516067504883, + "logits/rejected": -1.429481029510498, + "logps/chosen": -71.653076171875, + "logps/rejected": -51.12543487548828, + "loss": 0.8262, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5640838146209717, + "rewards/margins": -1.1705071926116943, + "rewards/rejected": 4.734591007232666, + "step": 5664 + }, + { + "epoch": 0.92, + "learning_rate": 5.890235034202002e-06, + "logits/chosen": -0.9972056150436401, + "logits/rejected": -0.8954959511756897, + "logps/chosen": -68.81024169921875, + "logps/rejected": -64.70802307128906, + "loss": 0.1193, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.683723449707031, + "rewards/margins": 2.500986337661743, + "rewards/rejected": 3.182737112045288, + "step": 5665 + }, + { + "epoch": 0.92, + "learning_rate": 5.888941748680484e-06, + "logits/chosen": -1.0494967699050903, + "logits/rejected": -1.0494967699050903, + "logps/chosen": -54.27039337158203, + "logps/rejected": -54.27039337158203, + "loss": 0.3533, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8047112226486206, + "rewards/margins": 0.0, + "rewards/rejected": 1.8047112226486206, + "step": 5666 + }, + { + "epoch": 0.92, + "learning_rate": 5.887648401741503e-06, + "logits/chosen": -1.2245627641677856, + "logits/rejected": -1.2245627641677856, + "logps/chosen": -89.7236328125, + "logps/rejected": -89.7236328125, + "loss": 0.357, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.888679504394531, + "rewards/margins": 0.0, + "rewards/rejected": 4.888679504394531, + "step": 5667 + }, + { + "epoch": 0.92, + "learning_rate": 5.8863549934744145e-06, + "logits/chosen": -1.0010477304458618, + "logits/rejected": -0.9643590450286865, + "logps/chosen": -39.32012176513672, + "logps/rejected": -35.46551513671875, + "loss": 1.4297, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8099159002304077, + "rewards/margins": -1.614569067955017, + "rewards/rejected": 3.424484968185425, + "step": 5668 + }, + { + "epoch": 0.92, + "learning_rate": 5.885061523968584e-06, + "logits/chosen": -1.1249257326126099, + "logits/rejected": -1.0202105045318604, + "logps/chosen": -92.77902221679688, + "logps/rejected": -37.643035888671875, + "loss": 0.2561, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9245223999023438, + "rewards/margins": 0.8488799333572388, + "rewards/rejected": 1.075642466545105, + "step": 5669 + }, + { + "epoch": 0.92, + "learning_rate": 5.8837679933133765e-06, + "logits/chosen": -1.3087215423583984, + "logits/rejected": -0.9944373369216919, + "logps/chosen": -9.754150390625, + "logps/rejected": -174.06935119628906, + "loss": 2.5491, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.07023363560438156, + "rewards/margins": -5.080031394958496, + "rewards/rejected": 5.0097975730896, + "step": 5670 + }, + { + "epoch": 0.92, + "learning_rate": 5.882474401598163e-06, + "logits/chosen": -1.066911220550537, + "logits/rejected": -1.1096585988998413, + "logps/chosen": -56.70911407470703, + "logps/rejected": -76.93292999267578, + "loss": 1.8369, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1146066188812256, + "rewards/margins": -0.41652965545654297, + "rewards/rejected": 2.5311362743377686, + "step": 5671 + }, + { + "epoch": 0.92, + "learning_rate": 5.881180748912319e-06, + "logits/chosen": -1.2221906185150146, + "logits/rejected": -1.2221906185150146, + "logps/chosen": -75.55670928955078, + "logps/rejected": -75.55670928955078, + "loss": 0.4032, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1065468788146973, + "rewards/margins": 0.0, + "rewards/rejected": 3.1065468788146973, + "step": 5672 + }, + { + "epoch": 0.92, + "learning_rate": 5.879887035345221e-06, + "logits/chosen": -1.0294908285140991, + "logits/rejected": -1.0531691312789917, + "logps/chosen": -57.68579864501953, + "logps/rejected": -57.847251892089844, + "loss": 0.1817, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.778398871421814, + "rewards/margins": 0.8893806338310242, + "rewards/rejected": 0.8890182375907898, + "step": 5673 + }, + { + "epoch": 0.92, + "learning_rate": 5.878593260986256e-06, + "logits/chosen": -1.1285417079925537, + "logits/rejected": -1.1285417079925537, + "logps/chosen": -35.86920166015625, + "logps/rejected": -35.86920166015625, + "loss": 0.649, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.374215126037598, + "rewards/margins": 0.0, + "rewards/rejected": 4.374215126037598, + "step": 5674 + }, + { + "epoch": 0.92, + "learning_rate": 5.87729942592481e-06, + "logits/chosen": -1.0047686100006104, + "logits/rejected": -1.0029051303863525, + "logps/chosen": -58.93753433227539, + "logps/rejected": -59.80724334716797, + "loss": 0.914, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0957005023956299, + "rewards/margins": -1.4552288055419922, + "rewards/rejected": 2.550929307937622, + "step": 5675 + }, + { + "epoch": 0.92, + "learning_rate": 5.876005530250274e-06, + "logits/chosen": -1.145836353302002, + "logits/rejected": -1.0666275024414062, + "logps/chosen": -54.377220153808594, + "logps/rejected": -23.280620574951172, + "loss": 0.2667, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.475332021713257, + "rewards/margins": 0.8148709535598755, + "rewards/rejected": 1.6604610681533813, + "step": 5676 + }, + { + "epoch": 0.92, + "learning_rate": 5.874711574052046e-06, + "logits/chosen": -1.2816671133041382, + "logits/rejected": -1.257787823677063, + "logps/chosen": -51.57551956176758, + "logps/rejected": -42.23418426513672, + "loss": 0.193, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5218425989151, + "rewards/margins": 0.7542530298233032, + "rewards/rejected": 0.7675895690917969, + "step": 5677 + }, + { + "epoch": 0.92, + "learning_rate": 5.873417557419523e-06, + "logits/chosen": -1.3191437721252441, + "logits/rejected": -1.3282653093338013, + "logps/chosen": -106.79426574707031, + "logps/rejected": -44.780601501464844, + "loss": 0.9366, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.05069732666015625, + "rewards/margins": -1.6800880432128906, + "rewards/rejected": 1.7307853698730469, + "step": 5678 + }, + { + "epoch": 0.92, + "learning_rate": 5.872123480442112e-06, + "logits/chosen": -1.2913856506347656, + "logits/rejected": -1.3063806295394897, + "logps/chosen": -99.76570129394531, + "logps/rejected": -111.55265808105469, + "loss": 0.5679, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.950463891029358, + "rewards/margins": -0.2591797113418579, + "rewards/rejected": 2.209643602371216, + "step": 5679 + }, + { + "epoch": 0.92, + "learning_rate": 5.870829343209221e-06, + "logits/chosen": -0.8703230619430542, + "logits/rejected": -0.8703230619430542, + "logps/chosen": -1.8788214921951294, + "logps/rejected": -1.8788214921951294, + "loss": 0.5705, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3756698668003082, + "rewards/margins": 0.0, + "rewards/rejected": 0.3756698668003082, + "step": 5680 + }, + { + "epoch": 0.92, + "learning_rate": 5.869535145810263e-06, + "logits/chosen": -1.5523600578308105, + "logits/rejected": -1.5388779640197754, + "logps/chosen": -148.6033935546875, + "logps/rejected": -132.5625457763672, + "loss": 1.8718, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.407727241516113, + "rewards/margins": -2.0813446044921875, + "rewards/rejected": 9.4890718460083, + "step": 5681 + }, + { + "epoch": 0.92, + "learning_rate": 5.8682408883346535e-06, + "logits/chosen": -0.7640133500099182, + "logits/rejected": -0.7640133500099182, + "logps/chosen": -84.31663513183594, + "logps/rejected": -84.31663513183594, + "loss": 2.1201, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1388185024261475, + "rewards/margins": 0.0, + "rewards/rejected": 2.1388185024261475, + "step": 5682 + }, + { + "epoch": 0.92, + "learning_rate": 5.866946570871812e-06, + "logits/chosen": -0.9165662527084351, + "logits/rejected": -0.9165662527084351, + "logps/chosen": -60.168128967285156, + "logps/rejected": -60.168128967285156, + "loss": 0.9296, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.378435492515564, + "rewards/margins": 0.0, + "rewards/rejected": 1.378435492515564, + "step": 5683 + }, + { + "epoch": 0.92, + "learning_rate": 5.8656521935111676e-06, + "logits/chosen": -1.1330742835998535, + "logits/rejected": -1.1973754167556763, + "logps/chosen": -72.42373657226562, + "logps/rejected": -114.11294555664062, + "loss": 2.2519, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.891801595687866, + "rewards/margins": -0.6522383689880371, + "rewards/rejected": 3.5440399646759033, + "step": 5684 + }, + { + "epoch": 0.92, + "learning_rate": 5.864357756342147e-06, + "logits/chosen": -1.4137933254241943, + "logits/rejected": -1.2142030000686646, + "logps/chosen": -81.48985290527344, + "logps/rejected": -69.46602630615234, + "loss": 0.4357, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.528149604797363, + "rewards/margins": 1.1395468711853027, + "rewards/rejected": 5.3886027336120605, + "step": 5685 + }, + { + "epoch": 0.92, + "learning_rate": 5.863063259454185e-06, + "logits/chosen": -0.8752212524414062, + "logits/rejected": -0.8649414777755737, + "logps/chosen": -77.16651916503906, + "logps/rejected": -69.87687683105469, + "loss": 0.4288, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.70402991771698, + "rewards/margins": -0.08391261100769043, + "rewards/rejected": 1.7879425287246704, + "step": 5686 + }, + { + "epoch": 0.92, + "learning_rate": 5.8617687029367165e-06, + "logits/chosen": -0.9656317830085754, + "logits/rejected": -0.9933422207832336, + "logps/chosen": -90.50463104248047, + "logps/rejected": -72.74111938476562, + "loss": 0.763, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.842501163482666, + "rewards/margins": 1.1596627235412598, + "rewards/rejected": 1.6828384399414062, + "step": 5687 + }, + { + "epoch": 0.92, + "learning_rate": 5.860474086879186e-06, + "logits/chosen": -1.1854575872421265, + "logits/rejected": -1.1503140926361084, + "logps/chosen": -80.06250762939453, + "logps/rejected": -52.107269287109375, + "loss": 0.4718, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3633811473846436, + "rewards/margins": -0.4409456253051758, + "rewards/rejected": 3.8043267726898193, + "step": 5688 + }, + { + "epoch": 0.92, + "learning_rate": 5.859179411371037e-06, + "logits/chosen": -0.9687523245811462, + "logits/rejected": -0.9590966105461121, + "logps/chosen": -45.55828094482422, + "logps/rejected": -42.37031555175781, + "loss": 1.0398, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.634615421295166, + "rewards/margins": 0.057744741439819336, + "rewards/rejected": 3.5768706798553467, + "step": 5689 + }, + { + "epoch": 0.92, + "learning_rate": 5.857884676501721e-06, + "logits/chosen": -1.6381728649139404, + "logits/rejected": -1.6590639352798462, + "logps/chosen": -87.7374038696289, + "logps/rejected": -127.47564697265625, + "loss": 1.4507, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.454866886138916, + "rewards/margins": -1.6904611587524414, + "rewards/rejected": 6.145328044891357, + "step": 5690 + }, + { + "epoch": 0.92, + "learning_rate": 5.8565898823606915e-06, + "logits/chosen": -1.1101946830749512, + "logits/rejected": -1.129402995109558, + "logps/chosen": -53.80872344970703, + "logps/rejected": -68.87797546386719, + "loss": 0.6646, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8136284351348877, + "rewards/margins": 0.029819488525390625, + "rewards/rejected": 1.783808946609497, + "step": 5691 + }, + { + "epoch": 0.92, + "learning_rate": 5.855295029037405e-06, + "logits/chosen": -1.243481993675232, + "logits/rejected": -1.230827808380127, + "logps/chosen": -24.788867950439453, + "logps/rejected": -5.201663017272949, + "loss": 1.079, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46933290362358093, + "rewards/margins": 0.14190006256103516, + "rewards/rejected": 0.3274328410625458, + "step": 5692 + }, + { + "epoch": 0.92, + "learning_rate": 5.854000116621326e-06, + "logits/chosen": -1.362781047821045, + "logits/rejected": -1.3696463108062744, + "logps/chosen": -113.95089721679688, + "logps/rejected": -85.25402069091797, + "loss": 1.4843, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.451586961746216, + "rewards/margins": 0.4983680248260498, + "rewards/rejected": 2.953218936920166, + "step": 5693 + }, + { + "epoch": 0.92, + "learning_rate": 5.852705145201919e-06, + "logits/chosen": -1.186913013458252, + "logits/rejected": -1.1926639080047607, + "logps/chosen": -67.00030517578125, + "logps/rejected": -64.20814514160156, + "loss": 0.5784, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2946152687072754, + "rewards/margins": 1.0729111433029175, + "rewards/rejected": 1.221704125404358, + "step": 5694 + }, + { + "epoch": 0.92, + "learning_rate": 5.851410114868656e-06, + "logits/chosen": -0.9071053266525269, + "logits/rejected": -0.7863842844963074, + "logps/chosen": -63.748291015625, + "logps/rejected": -76.08808898925781, + "loss": 0.0589, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.091933727264404, + "rewards/margins": 2.9379191398620605, + "rewards/rejected": 3.1540145874023438, + "step": 5695 + }, + { + "epoch": 0.92, + "learning_rate": 5.850115025711009e-06, + "logits/chosen": -0.8486274480819702, + "logits/rejected": -0.7889973521232605, + "logps/chosen": -100.13761901855469, + "logps/rejected": -38.44879913330078, + "loss": 0.6506, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0861740112304688, + "rewards/margins": -0.9795920848846436, + "rewards/rejected": 2.0657660961151123, + "step": 5696 + }, + { + "epoch": 0.92, + "learning_rate": 5.848819877818458e-06, + "logits/chosen": -0.9402439594268799, + "logits/rejected": -0.9571753740310669, + "logps/chosen": -73.88165283203125, + "logps/rejected": -87.05354309082031, + "loss": 0.1381, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8661346435546875, + "rewards/margins": 1.1633018255233765, + "rewards/rejected": 1.702832818031311, + "step": 5697 + }, + { + "epoch": 0.92, + "learning_rate": 5.8475246712804845e-06, + "logits/chosen": -1.0072462558746338, + "logits/rejected": -0.9859216213226318, + "logps/chosen": -68.57572937011719, + "logps/rejected": -70.17749786376953, + "loss": 1.2228, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3470399379730225, + "rewards/margins": -2.2302277088165283, + "rewards/rejected": 4.577267646789551, + "step": 5698 + }, + { + "epoch": 0.93, + "learning_rate": 5.846229406186575e-06, + "logits/chosen": -1.1188011169433594, + "logits/rejected": -1.1028642654418945, + "logps/chosen": -93.57005310058594, + "logps/rejected": -65.2896957397461, + "loss": 0.3005, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5927817821502686, + "rewards/margins": 0.4239013195037842, + "rewards/rejected": 2.1688804626464844, + "step": 5699 + }, + { + "epoch": 0.93, + "learning_rate": 5.844934082626223e-06, + "logits/chosen": -1.124049186706543, + "logits/rejected": -1.1177005767822266, + "logps/chosen": -30.7034912109375, + "logps/rejected": -5.201205253601074, + "loss": 1.057, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.21842651069164276, + "rewards/margins": -0.033108338713645935, + "rewards/rejected": 0.2515348494052887, + "step": 5700 + }, + { + "epoch": 0.93, + "learning_rate": 5.843638700688919e-06, + "logits/chosen": -0.5592150092124939, + "logits/rejected": -0.5560063123703003, + "logps/chosen": -1.8000696897506714, + "logps/rejected": -4.052732467651367, + "loss": 0.5904, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.24890652298927307, + "rewards/margins": -0.05776134133338928, + "rewards/rejected": 0.30666786432266235, + "step": 5701 + }, + { + "epoch": 0.93, + "learning_rate": 5.842343260464164e-06, + "logits/chosen": -1.0500048398971558, + "logits/rejected": -1.066092848777771, + "logps/chosen": -0.6307562589645386, + "logps/rejected": -16.60138702392578, + "loss": 0.4925, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19862788915634155, + "rewards/margins": 0.15797236561775208, + "rewards/rejected": 0.04065551981329918, + "step": 5702 + }, + { + "epoch": 0.93, + "learning_rate": 5.84104776204146e-06, + "logits/chosen": -1.0781856775283813, + "logits/rejected": -1.1120166778564453, + "logps/chosen": -57.594825744628906, + "logps/rejected": -70.32068634033203, + "loss": 0.7545, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1636528968811035, + "rewards/margins": -0.4954817295074463, + "rewards/rejected": 2.65913462638855, + "step": 5703 + }, + { + "epoch": 0.93, + "learning_rate": 5.8397522055103125e-06, + "logits/chosen": -1.179964542388916, + "logits/rejected": -1.28785240650177, + "logps/chosen": -115.69963073730469, + "logps/rejected": -152.39205932617188, + "loss": 0.5354, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.501762390136719, + "rewards/margins": -0.6275496482849121, + "rewards/rejected": 5.129312038421631, + "step": 5704 + }, + { + "epoch": 0.93, + "learning_rate": 5.838456590960234e-06, + "logits/chosen": -1.4759154319763184, + "logits/rejected": -1.4266434907913208, + "logps/chosen": -172.15463256835938, + "logps/rejected": -110.26072692871094, + "loss": 0.2158, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0779571533203125, + "rewards/margins": 0.63568115234375, + "rewards/rejected": 1.4422760009765625, + "step": 5705 + }, + { + "epoch": 0.93, + "learning_rate": 5.83716091848074e-06, + "logits/chosen": -1.4787651300430298, + "logits/rejected": -1.4187946319580078, + "logps/chosen": -82.84419250488281, + "logps/rejected": -73.36031341552734, + "loss": 3.2127, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0934044122695923, + "rewards/margins": -2.799464225769043, + "rewards/rejected": 3.892868757247925, + "step": 5706 + }, + { + "epoch": 0.93, + "learning_rate": 5.835865188161346e-06, + "logits/chosen": -1.1685985326766968, + "logits/rejected": -1.1463472843170166, + "logps/chosen": -42.44731521606445, + "logps/rejected": -65.79979705810547, + "loss": 0.1013, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0563747882843018, + "rewards/margins": 2.170286178588867, + "rewards/rejected": 0.8860885500907898, + "step": 5707 + }, + { + "epoch": 0.93, + "learning_rate": 5.834569400091578e-06, + "logits/chosen": -1.2750431299209595, + "logits/rejected": -1.1757643222808838, + "logps/chosen": -230.852294921875, + "logps/rejected": -67.49015045166016, + "loss": 0.6619, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.427227973937988, + "rewards/margins": 3.2078371047973633, + "rewards/rejected": 4.219390869140625, + "step": 5708 + }, + { + "epoch": 0.93, + "learning_rate": 5.833273554360959e-06, + "logits/chosen": -1.3760401010513306, + "logits/rejected": -1.4319266080856323, + "logps/chosen": -165.01612854003906, + "logps/rejected": -119.15798950195312, + "loss": 0.7867, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.961830139160156, + "rewards/margins": -1.3244309425354004, + "rewards/rejected": 7.286261081695557, + "step": 5709 + }, + { + "epoch": 0.93, + "learning_rate": 5.8319776510590235e-06, + "logits/chosen": -1.0277866125106812, + "logits/rejected": -1.0266484022140503, + "logps/chosen": -41.55305480957031, + "logps/rejected": -99.41822814941406, + "loss": 0.5357, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0595500469207764, + "rewards/margins": 0.31135666370391846, + "rewards/rejected": 1.748193383216858, + "step": 5710 + }, + { + "epoch": 0.93, + "learning_rate": 5.830681690275304e-06, + "logits/chosen": -1.274551510810852, + "logits/rejected": -1.3514446020126343, + "logps/chosen": -114.78518676757812, + "logps/rejected": -67.68827056884766, + "loss": 0.1489, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5870423316955566, + "rewards/margins": 1.5437240600585938, + "rewards/rejected": 2.043318271636963, + "step": 5711 + }, + { + "epoch": 0.93, + "learning_rate": 5.82938567209934e-06, + "logits/chosen": -1.181894063949585, + "logits/rejected": -0.8109516501426697, + "logps/chosen": -92.58328247070312, + "logps/rejected": -32.040740966796875, + "loss": 0.0501, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9957001209259033, + "rewards/margins": 3.417267322540283, + "rewards/rejected": 0.5784328579902649, + "step": 5712 + }, + { + "epoch": 0.93, + "learning_rate": 5.828089596620674e-06, + "logits/chosen": -1.0382689237594604, + "logits/rejected": -1.1760395765304565, + "logps/chosen": -52.779537200927734, + "logps/rejected": -95.30618286132812, + "loss": 1.4553, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9406696557998657, + "rewards/margins": -2.8531627655029297, + "rewards/rejected": 4.793832302093506, + "step": 5713 + }, + { + "epoch": 0.93, + "learning_rate": 5.8267934639288525e-06, + "logits/chosen": -0.7219550013542175, + "logits/rejected": -0.7219550013542175, + "logps/chosen": -39.85304641723633, + "logps/rejected": -39.85304641723633, + "loss": 0.3881, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.49481463432312, + "rewards/margins": 0.0, + "rewards/rejected": 2.49481463432312, + "step": 5714 + }, + { + "epoch": 0.93, + "learning_rate": 5.825497274113425e-06, + "logits/chosen": -1.2157288789749146, + "logits/rejected": -1.1296366453170776, + "logps/chosen": -56.862125396728516, + "logps/rejected": -12.57036018371582, + "loss": 0.1704, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6410138607025146, + "rewards/margins": 1.5794752836227417, + "rewards/rejected": 1.061538577079773, + "step": 5715 + }, + { + "epoch": 0.93, + "learning_rate": 5.824201027263948e-06, + "logits/chosen": -1.005233645439148, + "logits/rejected": -1.0449739694595337, + "logps/chosen": -152.389404296875, + "logps/rejected": -93.29989624023438, + "loss": 1.033, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.782657146453857, + "rewards/margins": -1.389613151550293, + "rewards/rejected": 6.17227029800415, + "step": 5716 + }, + { + "epoch": 0.93, + "learning_rate": 5.822904723469979e-06, + "logits/chosen": -1.1869596242904663, + "logits/rejected": -0.7645111083984375, + "logps/chosen": -44.18421173095703, + "logps/rejected": -78.20282745361328, + "loss": 1.0352, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.080548048019409, + "rewards/margins": -1.5612220764160156, + "rewards/rejected": 3.641770124435425, + "step": 5717 + }, + { + "epoch": 0.93, + "learning_rate": 5.821608362821078e-06, + "logits/chosen": -1.1356096267700195, + "logits/rejected": -1.1719558238983154, + "logps/chosen": -115.84375, + "logps/rejected": -144.90704345703125, + "loss": 1.3354, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7336838245391846, + "rewards/margins": -2.5779573917388916, + "rewards/rejected": 4.311641216278076, + "step": 5718 + }, + { + "epoch": 0.93, + "learning_rate": 5.820311945406814e-06, + "logits/chosen": -1.3532356023788452, + "logits/rejected": -1.5380616188049316, + "logps/chosen": -84.01101684570312, + "logps/rejected": -183.40774536132812, + "loss": 2.8123, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1745948791503906, + "rewards/margins": -4.623746395111084, + "rewards/rejected": 5.798341274261475, + "step": 5719 + }, + { + "epoch": 0.93, + "learning_rate": 5.819015471316758e-06, + "logits/chosen": -0.971947431564331, + "logits/rejected": -0.9558452367782593, + "logps/chosen": -81.45452880859375, + "logps/rejected": -72.65252685546875, + "loss": 0.2839, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4611587524414062, + "rewards/margins": 0.7894363403320312, + "rewards/rejected": 1.671722412109375, + "step": 5720 + }, + { + "epoch": 0.93, + "learning_rate": 5.817718940640481e-06, + "logits/chosen": -1.3912748098373413, + "logits/rejected": -1.3157517910003662, + "logps/chosen": -47.11188507080078, + "logps/rejected": -58.515419006347656, + "loss": 0.8546, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9661468267440796, + "rewards/margins": 1.0048339366912842, + "rewards/rejected": 0.9613128900527954, + "step": 5721 + }, + { + "epoch": 0.93, + "learning_rate": 5.816422353467562e-06, + "logits/chosen": -0.7579830884933472, + "logits/rejected": -0.7487499117851257, + "logps/chosen": -6.714301109313965, + "logps/rejected": -4.425975322723389, + "loss": 1.1591, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.702267587184906, + "rewards/margins": -0.20585602521896362, + "rewards/rejected": 0.9081236124038696, + "step": 5722 + }, + { + "epoch": 0.93, + "learning_rate": 5.815125709887584e-06, + "logits/chosen": -1.1340755224227905, + "logits/rejected": -0.9078862071037292, + "logps/chosen": -174.28524780273438, + "logps/rejected": -159.85879516601562, + "loss": 1.346, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.520936489105225, + "rewards/margins": -1.8531908988952637, + "rewards/rejected": 7.374127388000488, + "step": 5723 + }, + { + "epoch": 0.93, + "learning_rate": 5.813829009990133e-06, + "logits/chosen": -1.4013890027999878, + "logits/rejected": -1.3001446723937988, + "logps/chosen": -112.88542938232422, + "logps/rejected": -167.89707946777344, + "loss": 0.6864, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.379096508026123, + "rewards/margins": -1.0590157508850098, + "rewards/rejected": 8.438112258911133, + "step": 5724 + }, + { + "epoch": 0.93, + "learning_rate": 5.812532253864798e-06, + "logits/chosen": -1.0275022983551025, + "logits/rejected": -0.9678757786750793, + "logps/chosen": -49.433868408203125, + "logps/rejected": -48.68268585205078, + "loss": 0.9016, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0834624767303467, + "rewards/margins": -1.4167718887329102, + "rewards/rejected": 3.500234365463257, + "step": 5725 + }, + { + "epoch": 0.93, + "learning_rate": 5.811235441601172e-06, + "logits/chosen": -1.0960261821746826, + "logits/rejected": -1.1178234815597534, + "logps/chosen": -67.66976165771484, + "logps/rejected": -88.19850158691406, + "loss": 0.6998, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.356189012527466, + "rewards/margins": -0.8301758766174316, + "rewards/rejected": 3.1863648891448975, + "step": 5726 + }, + { + "epoch": 0.93, + "learning_rate": 5.809938573288853e-06, + "logits/chosen": -1.1346924304962158, + "logits/rejected": -1.0864603519439697, + "logps/chosen": -53.12397384643555, + "logps/rejected": -24.840288162231445, + "loss": 0.5532, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0756337642669678, + "rewards/margins": 1.5828086137771606, + "rewards/rejected": 1.4928251504898071, + "step": 5727 + }, + { + "epoch": 0.93, + "learning_rate": 5.808641649017442e-06, + "logits/chosen": -1.2233672142028809, + "logits/rejected": -1.2402410507202148, + "logps/chosen": -61.02583694458008, + "logps/rejected": -53.8553581237793, + "loss": 3.4126, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.084347128868103, + "rewards/margins": -2.0475687980651855, + "rewards/rejected": 3.131915807723999, + "step": 5728 + }, + { + "epoch": 0.93, + "learning_rate": 5.8073446688765445e-06, + "logits/chosen": -1.1896135807037354, + "logits/rejected": -1.1789129972457886, + "logps/chosen": -64.35641479492188, + "logps/rejected": -59.87640380859375, + "loss": 0.2439, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4926880598068237, + "rewards/margins": 0.7750763297080994, + "rewards/rejected": 0.7176117300987244, + "step": 5729 + }, + { + "epoch": 0.93, + "learning_rate": 5.80604763295577e-06, + "logits/chosen": -1.061389684677124, + "logits/rejected": -1.0168038606643677, + "logps/chosen": -120.64542388916016, + "logps/rejected": -79.80232238769531, + "loss": 0.361, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.594258785247803, + "rewards/margins": -0.011648178100585938, + "rewards/rejected": 5.605906963348389, + "step": 5730 + }, + { + "epoch": 0.93, + "learning_rate": 5.8047505413447305e-06, + "logits/chosen": -0.5737369060516357, + "logits/rejected": -0.5737369060516357, + "logps/chosen": -4.25459098815918, + "logps/rejected": -4.25459098815918, + "loss": 0.4121, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5855751037597656, + "rewards/margins": 0.0, + "rewards/rejected": 0.5855751037597656, + "step": 5731 + }, + { + "epoch": 0.93, + "learning_rate": 5.803453394133043e-06, + "logits/chosen": -1.256155014038086, + "logits/rejected": -1.1118253469467163, + "logps/chosen": -82.530029296875, + "logps/rejected": -13.982532501220703, + "loss": 0.0712, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9942519664764404, + "rewards/margins": 1.8825815916061401, + "rewards/rejected": 1.1116703748703003, + "step": 5732 + }, + { + "epoch": 0.93, + "learning_rate": 5.802156191410331e-06, + "logits/chosen": -1.2900539636611938, + "logits/rejected": -1.300355315208435, + "logps/chosen": -102.45108032226562, + "logps/rejected": -104.64295959472656, + "loss": 1.0317, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.259669780731201, + "rewards/margins": -1.6238188743591309, + "rewards/rejected": 6.883488655090332, + "step": 5733 + }, + { + "epoch": 0.93, + "learning_rate": 5.800858933266214e-06, + "logits/chosen": -1.0698336362838745, + "logits/rejected": -1.0829983949661255, + "logps/chosen": -78.66416931152344, + "logps/rejected": -60.5196418762207, + "loss": 1.6712, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6981163024902344, + "rewards/margins": -2.5475759506225586, + "rewards/rejected": 5.245692253112793, + "step": 5734 + }, + { + "epoch": 0.93, + "learning_rate": 5.799561619790322e-06, + "logits/chosen": -0.8712919354438782, + "logits/rejected": -0.8712919354438782, + "logps/chosen": -1.1688164472579956, + "logps/rejected": -1.1688164472579956, + "loss": 0.5391, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.15996955335140228, + "rewards/margins": 0.0, + "rewards/rejected": 0.15996955335140228, + "step": 5735 + }, + { + "epoch": 0.93, + "learning_rate": 5.7982642510722875e-06, + "logits/chosen": -1.3400369882583618, + "logits/rejected": -1.4874898195266724, + "logps/chosen": -138.3396453857422, + "logps/rejected": -124.3580322265625, + "loss": 1.6531, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.777507305145264, + "rewards/margins": -3.2657055854797363, + "rewards/rejected": 8.043212890625, + "step": 5736 + }, + { + "epoch": 0.93, + "learning_rate": 5.796966827201747e-06, + "logits/chosen": -0.7486514449119568, + "logits/rejected": -0.7983995676040649, + "logps/chosen": -51.888999938964844, + "logps/rejected": -64.70712280273438, + "loss": 1.058, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5654624700546265, + "rewards/margins": -0.43551862239837646, + "rewards/rejected": 2.000981092453003, + "step": 5737 + }, + { + "epoch": 0.93, + "learning_rate": 5.795669348268339e-06, + "logits/chosen": -0.950405478477478, + "logits/rejected": -0.9588391780853271, + "logps/chosen": -2.121279239654541, + "logps/rejected": -14.641292572021484, + "loss": 1.7624, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.26521602272987366, + "rewards/margins": -0.4413069188594818, + "rewards/rejected": 0.7065229415893555, + "step": 5738 + }, + { + "epoch": 0.93, + "learning_rate": 5.794371814361709e-06, + "logits/chosen": -1.2933368682861328, + "logits/rejected": -1.3109387159347534, + "logps/chosen": -54.721038818359375, + "logps/rejected": -507.34478759765625, + "loss": 70.5105, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.349891662597656, + "rewards/margins": -141.00537109375, + "rewards/rejected": 145.3552703857422, + "step": 5739 + }, + { + "epoch": 0.93, + "learning_rate": 5.793074225571502e-06, + "logits/chosen": -1.3498767614364624, + "logits/rejected": -1.3209302425384521, + "logps/chosen": -31.848167419433594, + "logps/rejected": -12.594511985778809, + "loss": 1.3014, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.75387042760849, + "rewards/margins": -0.17595529556274414, + "rewards/rejected": 0.9298257231712341, + "step": 5740 + }, + { + "epoch": 0.93, + "learning_rate": 5.791776581987371e-06, + "logits/chosen": -0.8710134029388428, + "logits/rejected": -0.8524168133735657, + "logps/chosen": -40.63983154296875, + "logps/rejected": -48.02235412597656, + "loss": 0.2718, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2083213329315186, + "rewards/margins": 0.4255537986755371, + "rewards/rejected": 2.7827675342559814, + "step": 5741 + }, + { + "epoch": 0.93, + "learning_rate": 5.79047888369897e-06, + "logits/chosen": -0.9118155837059021, + "logits/rejected": -0.9335992336273193, + "logps/chosen": -71.24057006835938, + "logps/rejected": -88.11384582519531, + "loss": 0.515, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9027450680732727, + "rewards/margins": 0.48766326904296875, + "rewards/rejected": 0.41508179903030396, + "step": 5742 + }, + { + "epoch": 0.93, + "learning_rate": 5.789181130795958e-06, + "logits/chosen": -1.50007164478302, + "logits/rejected": -1.4737861156463623, + "logps/chosen": -53.09239959716797, + "logps/rejected": -42.65981674194336, + "loss": 0.5005, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3205139636993408, + "rewards/margins": -0.5348705053329468, + "rewards/rejected": 1.8553844690322876, + "step": 5743 + }, + { + "epoch": 0.93, + "learning_rate": 5.787883323367995e-06, + "logits/chosen": -0.5862500667572021, + "logits/rejected": -0.5899673700332642, + "logps/chosen": -2.372117280960083, + "logps/rejected": -5.4735426902771, + "loss": 0.7527, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.27719923853874207, + "rewards/margins": -0.07358092069625854, + "rewards/rejected": 0.3507801592350006, + "step": 5744 + }, + { + "epoch": 0.93, + "learning_rate": 5.786585461504752e-06, + "logits/chosen": -0.5862078070640564, + "logits/rejected": -0.5795658826828003, + "logps/chosen": -4.286812782287598, + "logps/rejected": -18.20138931274414, + "loss": 1.6956, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1410154402256012, + "rewards/margins": -0.2763395309448242, + "rewards/rejected": 0.4173549711704254, + "step": 5745 + }, + { + "epoch": 0.93, + "learning_rate": 5.785287545295895e-06, + "logits/chosen": -1.2517085075378418, + "logits/rejected": -1.2321367263793945, + "logps/chosen": -57.23588180541992, + "logps/rejected": -56.09133529663086, + "loss": 0.797, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3458385467529297, + "rewards/margins": 0.7470871210098267, + "rewards/rejected": 1.598751425743103, + "step": 5746 + }, + { + "epoch": 0.93, + "learning_rate": 5.783989574831099e-06, + "logits/chosen": -1.223411202430725, + "logits/rejected": -0.9110434055328369, + "logps/chosen": -109.75045776367188, + "logps/rejected": -23.933368682861328, + "loss": 0.022, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.197999477386475, + "rewards/margins": 5.174021244049072, + "rewards/rejected": 1.0239781141281128, + "step": 5747 + }, + { + "epoch": 0.93, + "learning_rate": 5.782691550200043e-06, + "logits/chosen": -1.356345295906067, + "logits/rejected": -1.458771824836731, + "logps/chosen": -197.52061462402344, + "logps/rejected": -81.4309310913086, + "loss": 1.1181, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.542140483856201, + "rewards/margins": 4.603394508361816, + "rewards/rejected": 1.9387458562850952, + "step": 5748 + }, + { + "epoch": 0.93, + "learning_rate": 5.781393471492406e-06, + "logits/chosen": -1.2889931201934814, + "logits/rejected": -1.3007539510726929, + "logps/chosen": -96.32366180419922, + "logps/rejected": -220.6670684814453, + "loss": 5.1157, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.738909125328064, + "rewards/margins": -4.655394554138184, + "rewards/rejected": 6.394303798675537, + "step": 5749 + }, + { + "epoch": 0.93, + "learning_rate": 5.7800953387978735e-06, + "logits/chosen": -0.8060465455055237, + "logits/rejected": -0.799640953540802, + "logps/chosen": -1.0111052989959717, + "logps/rejected": -14.796721458435059, + "loss": 0.4411, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.40325927734375, + "rewards/margins": -0.08898115158081055, + "rewards/rejected": 0.49224042892456055, + "step": 5750 + }, + { + "epoch": 0.93, + "learning_rate": 5.778797152206133e-06, + "logits/chosen": -0.9679501056671143, + "logits/rejected": -0.9629113674163818, + "logps/chosen": -3.1126410961151123, + "logps/rejected": -4.071091651916504, + "loss": 0.8765, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2544860243797302, + "rewards/margins": -0.030387938022613525, + "rewards/rejected": 0.28487396240234375, + "step": 5751 + }, + { + "epoch": 0.93, + "learning_rate": 5.77749891180688e-06, + "logits/chosen": -1.3413238525390625, + "logits/rejected": -1.246332049369812, + "logps/chosen": -92.76551818847656, + "logps/rejected": -57.9935188293457, + "loss": 0.2903, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4733123779296875, + "rewards/margins": 0.24613213539123535, + "rewards/rejected": 2.227180242538452, + "step": 5752 + }, + { + "epoch": 0.93, + "learning_rate": 5.776200617689809e-06, + "logits/chosen": -1.352177619934082, + "logits/rejected": -1.3939725160598755, + "logps/chosen": -171.03721618652344, + "logps/rejected": -68.13612365722656, + "loss": 2.2076, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.549009799957275, + "rewards/margins": 4.095478057861328, + "rewards/rejected": 2.4535317420959473, + "step": 5753 + }, + { + "epoch": 0.93, + "learning_rate": 5.77490226994462e-06, + "logits/chosen": -0.9488040208816528, + "logits/rejected": -0.8851050734519958, + "logps/chosen": -81.35591125488281, + "logps/rejected": -74.40350341796875, + "loss": 1.639, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.7344765663146973, + "rewards/margins": -1.0658888816833496, + "rewards/rejected": 4.800365447998047, + "step": 5754 + }, + { + "epoch": 0.93, + "learning_rate": 5.773603868661015e-06, + "logits/chosen": -0.8060684204101562, + "logits/rejected": -0.7618247270584106, + "logps/chosen": -94.38514709472656, + "logps/rejected": -73.93875122070312, + "loss": 0.0899, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.778218984603882, + "rewards/margins": 1.6861871480941772, + "rewards/rejected": 1.0920318365097046, + "step": 5755 + }, + { + "epoch": 0.93, + "learning_rate": 5.772305413928704e-06, + "logits/chosen": -0.9303279519081116, + "logits/rejected": -0.7756215333938599, + "logps/chosen": -101.56765747070312, + "logps/rejected": -21.009429931640625, + "loss": 0.1808, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8615020513534546, + "rewards/margins": 0.8388583660125732, + "rewards/rejected": 1.0226436853408813, + "step": 5756 + }, + { + "epoch": 0.93, + "learning_rate": 5.7710069058373965e-06, + "logits/chosen": -1.3687270879745483, + "logits/rejected": -1.3837389945983887, + "logps/chosen": -78.97920227050781, + "logps/rejected": -114.71269226074219, + "loss": 1.0861, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7805496454238892, + "rewards/margins": -1.2204025983810425, + "rewards/rejected": 3.0009522438049316, + "step": 5757 + }, + { + "epoch": 0.93, + "learning_rate": 5.7697083444768065e-06, + "logits/chosen": -1.0053074359893799, + "logits/rejected": -0.8394204378128052, + "logps/chosen": -69.19573974609375, + "logps/rejected": -29.78365707397461, + "loss": 1.284, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.231494188308716, + "rewards/margins": 2.914039373397827, + "rewards/rejected": -0.6825451254844666, + "step": 5758 + }, + { + "epoch": 0.93, + "learning_rate": 5.768409729936653e-06, + "logits/chosen": -0.9213220477104187, + "logits/rejected": -0.9345649480819702, + "logps/chosen": -113.83544921875, + "logps/rejected": -204.08438110351562, + "loss": 0.3127, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0356369018554688, + "rewards/margins": 0.9137725830078125, + "rewards/rejected": 1.1218643188476562, + "step": 5759 + }, + { + "epoch": 0.93, + "learning_rate": 5.767111062306657e-06, + "logits/chosen": -1.4745835065841675, + "logits/rejected": -1.4850318431854248, + "logps/chosen": -117.58305358886719, + "logps/rejected": -93.36642456054688, + "loss": 1.042, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7502152919769287, + "rewards/margins": -1.0561933517456055, + "rewards/rejected": 3.806408643722534, + "step": 5760 + }, + { + "epoch": 0.94, + "learning_rate": 5.765812341676547e-06, + "logits/chosen": -1.0065088272094727, + "logits/rejected": -1.0065088272094727, + "logps/chosen": -17.389083862304688, + "logps/rejected": -17.389083862304688, + "loss": 0.3499, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7635940313339233, + "rewards/margins": 0.0, + "rewards/rejected": 1.7635940313339233, + "step": 5761 + }, + { + "epoch": 0.94, + "learning_rate": 5.76451356813605e-06, + "logits/chosen": -1.1242318153381348, + "logits/rejected": -1.082322120666504, + "logps/chosen": -64.37106323242188, + "logps/rejected": -53.750205993652344, + "loss": 0.7598, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.050457715988159, + "rewards/margins": -0.6970758438110352, + "rewards/rejected": 2.7475335597991943, + "step": 5762 + }, + { + "epoch": 0.94, + "learning_rate": 5.763214741774898e-06, + "logits/chosen": -1.3875418901443481, + "logits/rejected": -1.252457857131958, + "logps/chosen": -90.14988708496094, + "logps/rejected": -64.85015106201172, + "loss": 0.0902, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.4534454345703125, + "rewards/margins": 1.6501579284667969, + "rewards/rejected": 2.8032875061035156, + "step": 5763 + }, + { + "epoch": 0.94, + "learning_rate": 5.76191586268283e-06, + "logits/chosen": -1.1687158346176147, + "logits/rejected": -1.1417042016983032, + "logps/chosen": -26.981937408447266, + "logps/rejected": -32.64307403564453, + "loss": 0.3142, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1510555744171143, + "rewards/margins": 0.18409764766693115, + "rewards/rejected": 1.966957926750183, + "step": 5764 + }, + { + "epoch": 0.94, + "learning_rate": 5.760616930949584e-06, + "logits/chosen": -1.2354583740234375, + "logits/rejected": -1.1885660886764526, + "logps/chosen": -98.5732421875, + "logps/rejected": -128.19488525390625, + "loss": 1.0791, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6262924671173096, + "rewards/margins": -0.6028532981872559, + "rewards/rejected": 2.2291457653045654, + "step": 5765 + }, + { + "epoch": 0.94, + "learning_rate": 5.759317946664906e-06, + "logits/chosen": -1.2192928791046143, + "logits/rejected": -1.21817946434021, + "logps/chosen": -34.02271270751953, + "logps/rejected": -50.412254333496094, + "loss": 0.4627, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5527352094650269, + "rewards/margins": -0.4180412292480469, + "rewards/rejected": 1.9707764387130737, + "step": 5766 + }, + { + "epoch": 0.94, + "learning_rate": 5.758018909918543e-06, + "logits/chosen": -1.3133668899536133, + "logits/rejected": -1.2386372089385986, + "logps/chosen": -82.29312133789062, + "logps/rejected": -75.95550537109375, + "loss": 1.3782, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9376800656318665, + "rewards/margins": -2.0892343521118164, + "rewards/rejected": 3.026914358139038, + "step": 5767 + }, + { + "epoch": 0.94, + "learning_rate": 5.756719820800245e-06, + "logits/chosen": -1.1281100511550903, + "logits/rejected": -1.2120801210403442, + "logps/chosen": -86.82725524902344, + "logps/rejected": -107.27957153320312, + "loss": 1.1955, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.681860327720642, + "rewards/margins": -2.2714343070983887, + "rewards/rejected": 3.953294515609741, + "step": 5768 + }, + { + "epoch": 0.94, + "learning_rate": 5.755420679399768e-06, + "logits/chosen": -1.3677260875701904, + "logits/rejected": -1.307654857635498, + "logps/chosen": -120.44119262695312, + "logps/rejected": -127.41487121582031, + "loss": 0.1011, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.2731781005859375, + "rewards/margins": 1.4977903366088867, + "rewards/rejected": 4.775387763977051, + "step": 5769 + }, + { + "epoch": 0.94, + "learning_rate": 5.7541214858068705e-06, + "logits/chosen": -1.1838001012802124, + "logits/rejected": -1.3885542154312134, + "logps/chosen": -168.012939453125, + "logps/rejected": -224.73236083984375, + "loss": 1.8951, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.1230926513671875, + "rewards/margins": -3.7422914505004883, + "rewards/rejected": 8.865384101867676, + "step": 5770 + }, + { + "epoch": 0.94, + "learning_rate": 5.752822240111313e-06, + "logits/chosen": -1.3051331043243408, + "logits/rejected": -1.2828013896942139, + "logps/chosen": -158.26773071289062, + "logps/rejected": -68.77946472167969, + "loss": 0.0379, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.171288967132568, + "rewards/margins": 3.9363951683044434, + "rewards/rejected": 1.234893798828125, + "step": 5771 + }, + { + "epoch": 0.94, + "learning_rate": 5.751522942402861e-06, + "logits/chosen": -1.3113611936569214, + "logits/rejected": -1.0261726379394531, + "logps/chosen": -137.40577697753906, + "logps/rejected": -30.11881446838379, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.842357158660889, + "rewards/margins": 5.0246381759643555, + "rewards/rejected": 0.8177191019058228, + "step": 5772 + }, + { + "epoch": 0.94, + "learning_rate": 5.750223592771286e-06, + "logits/chosen": -1.1811867952346802, + "logits/rejected": -1.190412998199463, + "logps/chosen": -58.641605377197266, + "logps/rejected": -54.090091705322266, + "loss": 1.6522, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.208529233932495, + "rewards/margins": -3.265996217727661, + "rewards/rejected": 6.474525451660156, + "step": 5773 + }, + { + "epoch": 0.94, + "learning_rate": 5.748924191306359e-06, + "logits/chosen": -1.0320709943771362, + "logits/rejected": -1.0479522943496704, + "logps/chosen": -66.28982543945312, + "logps/rejected": -62.30559158325195, + "loss": 0.2714, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5136094093322754, + "rewards/margins": 0.3700854778289795, + "rewards/rejected": 2.143523931503296, + "step": 5774 + }, + { + "epoch": 0.94, + "learning_rate": 5.7476247380978564e-06, + "logits/chosen": -1.1393263339996338, + "logits/rejected": -1.120469570159912, + "logps/chosen": -65.02655792236328, + "logps/rejected": -46.074092864990234, + "loss": 0.2638, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.172415256500244, + "rewards/margins": 0.3905818462371826, + "rewards/rejected": 2.7818334102630615, + "step": 5775 + }, + { + "epoch": 0.94, + "learning_rate": 5.74632523323556e-06, + "logits/chosen": -1.5068594217300415, + "logits/rejected": -1.583209753036499, + "logps/chosen": -127.85784149169922, + "logps/rejected": -91.74136352539062, + "loss": 3.2325, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5812294483184814, + "rewards/margins": -3.0461556911468506, + "rewards/rejected": 4.627385139465332, + "step": 5776 + }, + { + "epoch": 0.94, + "learning_rate": 5.745025676809251e-06, + "logits/chosen": -1.39198637008667, + "logits/rejected": -1.4167194366455078, + "logps/chosen": -84.30282592773438, + "logps/rejected": -116.68144989013672, + "loss": 0.386, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.013754367828369, + "rewards/margins": -0.1383962631225586, + "rewards/rejected": 7.152150630950928, + "step": 5777 + }, + { + "epoch": 0.94, + "learning_rate": 5.743726068908717e-06, + "logits/chosen": -1.135713815689087, + "logits/rejected": -0.9877013564109802, + "logps/chosen": -38.453269958496094, + "logps/rejected": -10.096973419189453, + "loss": 0.5133, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.730913519859314, + "rewards/margins": 1.1679587364196777, + "rewards/rejected": 0.562954843044281, + "step": 5778 + }, + { + "epoch": 0.94, + "learning_rate": 5.742426409623749e-06, + "logits/chosen": -1.4510455131530762, + "logits/rejected": -1.2079808712005615, + "logps/chosen": -82.42829895019531, + "logps/rejected": -64.66549682617188, + "loss": 0.6315, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4278243780136108, + "rewards/margins": -0.582351803779602, + "rewards/rejected": 2.010176181793213, + "step": 5779 + }, + { + "epoch": 0.94, + "learning_rate": 5.741126699044141e-06, + "logits/chosen": -1.2667807340621948, + "logits/rejected": -1.2508471012115479, + "logps/chosen": -71.69287109375, + "logps/rejected": -128.10421752929688, + "loss": 0.1821, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7962188720703125, + "rewards/margins": 0.9825240969657898, + "rewards/rejected": 0.8136947751045227, + "step": 5780 + }, + { + "epoch": 0.94, + "learning_rate": 5.73982693725969e-06, + "logits/chosen": -1.5159012079238892, + "logits/rejected": -1.3499740362167358, + "logps/chosen": -89.75196838378906, + "logps/rejected": -45.94046401977539, + "loss": 0.3921, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.398872375488281, + "rewards/margins": 2.3513522148132324, + "rewards/rejected": 2.047520160675049, + "step": 5781 + }, + { + "epoch": 0.94, + "learning_rate": 5.738527124360199e-06, + "logits/chosen": -1.451524257659912, + "logits/rejected": -1.5344501733779907, + "logps/chosen": -65.57109069824219, + "logps/rejected": -141.09637451171875, + "loss": 1.7856, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.990680694580078, + "rewards/margins": -3.352682113647461, + "rewards/rejected": 9.343362808227539, + "step": 5782 + }, + { + "epoch": 0.94, + "learning_rate": 5.737227260435472e-06, + "logits/chosen": -1.251049280166626, + "logits/rejected": -1.1909701824188232, + "logps/chosen": -132.76129150390625, + "logps/rejected": -132.70704650878906, + "loss": 0.1456, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.432347297668457, + "rewards/margins": 1.1283464431762695, + "rewards/rejected": 3.3040008544921875, + "step": 5783 + }, + { + "epoch": 0.94, + "learning_rate": 5.735927345575316e-06, + "logits/chosen": -1.3182190656661987, + "logits/rejected": -1.3654332160949707, + "logps/chosen": -244.45376586914062, + "logps/rejected": -104.29924011230469, + "loss": 0.0299, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.871066570281982, + "rewards/margins": 2.788717746734619, + "rewards/rejected": 5.082348823547363, + "step": 5784 + }, + { + "epoch": 0.94, + "learning_rate": 5.734627379869544e-06, + "logits/chosen": -1.0248373746871948, + "logits/rejected": -1.0269057750701904, + "logps/chosen": -20.58576202392578, + "logps/rejected": -61.31003189086914, + "loss": 0.4571, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.607304096221924, + "rewards/margins": 0.8523415327072144, + "rewards/rejected": 1.7549625635147095, + "step": 5785 + }, + { + "epoch": 0.94, + "learning_rate": 5.733327363407973e-06, + "logits/chosen": -1.2359235286712646, + "logits/rejected": -1.2944124937057495, + "logps/chosen": -65.75303649902344, + "logps/rejected": -71.15251159667969, + "loss": 0.1981, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.014143466949463, + "rewards/margins": 0.9228196144104004, + "rewards/rejected": 1.0913238525390625, + "step": 5786 + }, + { + "epoch": 0.94, + "learning_rate": 5.732027296280418e-06, + "logits/chosen": -1.4364877939224243, + "logits/rejected": -1.3324533700942993, + "logps/chosen": -50.2562255859375, + "logps/rejected": -20.678646087646484, + "loss": 0.0483, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.44771146774292, + "rewards/margins": 2.9292659759521484, + "rewards/rejected": 1.518445611000061, + "step": 5787 + }, + { + "epoch": 0.94, + "learning_rate": 5.730727178576703e-06, + "logits/chosen": -0.9894852042198181, + "logits/rejected": -0.9103178381919861, + "logps/chosen": -47.738765716552734, + "logps/rejected": -60.10873794555664, + "loss": 0.6512, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.656097888946533, + "rewards/margins": -0.5644752979278564, + "rewards/rejected": 3.2205731868743896, + "step": 5788 + }, + { + "epoch": 0.94, + "learning_rate": 5.729427010386655e-06, + "logits/chosen": -1.0219767093658447, + "logits/rejected": -0.9854946136474609, + "logps/chosen": -75.52928924560547, + "logps/rejected": -51.672237396240234, + "loss": 0.2224, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.297849416732788, + "rewards/margins": 0.8702489137649536, + "rewards/rejected": 1.4276005029678345, + "step": 5789 + }, + { + "epoch": 0.94, + "learning_rate": 5.728126791800103e-06, + "logits/chosen": -1.0390831232070923, + "logits/rejected": -0.9893187880516052, + "logps/chosen": -69.6385498046875, + "logps/rejected": -43.163997650146484, + "loss": 0.3009, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9261512756347656, + "rewards/margins": 0.23844337463378906, + "rewards/rejected": 1.6877079010009766, + "step": 5790 + }, + { + "epoch": 0.94, + "learning_rate": 5.726826522906879e-06, + "logits/chosen": -1.2165465354919434, + "logits/rejected": -1.1629817485809326, + "logps/chosen": -59.97221374511719, + "logps/rejected": -44.08812713623047, + "loss": 0.3802, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.059166669845581, + "rewards/margins": 0.22692406177520752, + "rewards/rejected": 1.8322426080703735, + "step": 5791 + }, + { + "epoch": 0.94, + "learning_rate": 5.725526203796818e-06, + "logits/chosen": -1.0438309907913208, + "logits/rejected": -1.1285172700881958, + "logps/chosen": -176.80612182617188, + "logps/rejected": -116.16897583007812, + "loss": 0.1844, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.881887912750244, + "rewards/margins": 0.9149963855743408, + "rewards/rejected": 3.9668915271759033, + "step": 5792 + }, + { + "epoch": 0.94, + "learning_rate": 5.724225834559762e-06, + "logits/chosen": -1.3675203323364258, + "logits/rejected": -1.1534287929534912, + "logps/chosen": -53.53261184692383, + "logps/rejected": -23.548555374145508, + "loss": 0.214, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.621805191040039, + "rewards/margins": 1.2012929916381836, + "rewards/rejected": 1.4205121994018555, + "step": 5793 + }, + { + "epoch": 0.94, + "learning_rate": 5.722925415285555e-06, + "logits/chosen": -1.083200216293335, + "logits/rejected": -1.293501377105713, + "logps/chosen": -50.93767547607422, + "logps/rejected": -104.76477813720703, + "loss": 5.3905, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6249237060546875, + "rewards/margins": -6.075509071350098, + "rewards/rejected": 7.700432777404785, + "step": 5794 + }, + { + "epoch": 0.94, + "learning_rate": 5.721624946064042e-06, + "logits/chosen": -1.3727691173553467, + "logits/rejected": -1.5166503190994263, + "logps/chosen": -97.42655181884766, + "logps/rejected": -33.58138656616211, + "loss": 0.3524, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9014534950256348, + "rewards/margins": 3.4940102100372314, + "rewards/rejected": 0.40744325518608093, + "step": 5795 + }, + { + "epoch": 0.94, + "learning_rate": 5.720324426985071e-06, + "logits/chosen": -1.1556237936019897, + "logits/rejected": -1.127289056777954, + "logps/chosen": -51.468414306640625, + "logps/rejected": -48.61810302734375, + "loss": 0.3364, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3563613891601562, + "rewards/margins": 0.7697708606719971, + "rewards/rejected": 2.586590528488159, + "step": 5796 + }, + { + "epoch": 0.94, + "learning_rate": 5.7190238581385e-06, + "logits/chosen": -1.1274127960205078, + "logits/rejected": -1.1288613080978394, + "logps/chosen": -77.23519897460938, + "logps/rejected": -58.674827575683594, + "loss": 0.3868, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8993446826934814, + "rewards/margins": 0.46466064453125, + "rewards/rejected": 2.4346840381622314, + "step": 5797 + }, + { + "epoch": 0.94, + "learning_rate": 5.717723239614183e-06, + "logits/chosen": -1.5118283033370972, + "logits/rejected": -1.436572551727295, + "logps/chosen": -83.09540557861328, + "logps/rejected": -81.87467193603516, + "loss": 0.2892, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9858384132385254, + "rewards/margins": 0.2501046657562256, + "rewards/rejected": 2.7357337474823, + "step": 5798 + }, + { + "epoch": 0.94, + "learning_rate": 5.716422571501982e-06, + "logits/chosen": -0.911521315574646, + "logits/rejected": -0.9107020497322083, + "logps/chosen": -10.763907432556152, + "logps/rejected": -1.4818800687789917, + "loss": 0.4409, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.11125393211841583, + "rewards/margins": -0.11445403099060059, + "rewards/rejected": 0.22570796310901642, + "step": 5799 + }, + { + "epoch": 0.94, + "learning_rate": 5.715121853891759e-06, + "logits/chosen": -1.2708089351654053, + "logits/rejected": -1.3167890310287476, + "logps/chosen": -56.667823791503906, + "logps/rejected": -96.30038452148438, + "loss": 2.4233, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7594573497772217, + "rewards/margins": -1.4902575016021729, + "rewards/rejected": 4.2497148513793945, + "step": 5800 + }, + { + "epoch": 0.94, + "learning_rate": 5.713821086873382e-06, + "logits/chosen": -1.588690996170044, + "logits/rejected": -1.5749130249023438, + "logps/chosen": -246.85968017578125, + "logps/rejected": -127.0557861328125, + "loss": 0.0922, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.225415229797363, + "rewards/margins": 1.74729323387146, + "rewards/rejected": 2.4781219959259033, + "step": 5801 + }, + { + "epoch": 0.94, + "learning_rate": 5.712520270536723e-06, + "logits/chosen": -0.8137173652648926, + "logits/rejected": -0.8496047854423523, + "logps/chosen": -43.16614532470703, + "logps/rejected": -105.3947982788086, + "loss": 0.0826, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8005623817443848, + "rewards/margins": 1.7262588739395142, + "rewards/rejected": 1.0743035078048706, + "step": 5802 + }, + { + "epoch": 0.94, + "learning_rate": 5.711219404971656e-06, + "logits/chosen": -0.9083017110824585, + "logits/rejected": -0.9112316370010376, + "logps/chosen": -78.81831359863281, + "logps/rejected": -51.98711013793945, + "loss": 0.3605, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1367509365081787, + "rewards/margins": 1.568199634552002, + "rewards/rejected": 1.5685513019561768, + "step": 5803 + }, + { + "epoch": 0.94, + "learning_rate": 5.709918490268057e-06, + "logits/chosen": -1.2664337158203125, + "logits/rejected": -1.1916084289550781, + "logps/chosen": -149.83987426757812, + "logps/rejected": -56.551353454589844, + "loss": 0.4255, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.061911106109619, + "rewards/margins": 3.1845123767852783, + "rewards/rejected": 1.8773987293243408, + "step": 5804 + }, + { + "epoch": 0.94, + "learning_rate": 5.7086175265158085e-06, + "logits/chosen": -0.9955212473869324, + "logits/rejected": -1.0006133317947388, + "logps/chosen": -58.54534149169922, + "logps/rejected": -40.48155975341797, + "loss": 0.7892, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0895881652832031, + "rewards/margins": 0.03643035888671875, + "rewards/rejected": 1.0531578063964844, + "step": 5805 + }, + { + "epoch": 0.94, + "learning_rate": 5.707316513804793e-06, + "logits/chosen": -1.27994966506958, + "logits/rejected": -1.2312750816345215, + "logps/chosen": -46.201148986816406, + "logps/rejected": -63.12555694580078, + "loss": 1.2042, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.756063222885132, + "rewards/margins": -2.0102598667144775, + "rewards/rejected": 4.766323089599609, + "step": 5806 + }, + { + "epoch": 0.94, + "learning_rate": 5.7060154522249e-06, + "logits/chosen": -1.2802338600158691, + "logits/rejected": -1.2211889028549194, + "logps/chosen": -160.00682067871094, + "logps/rejected": -124.0953598022461, + "loss": 0.5185, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.570143222808838, + "rewards/margins": -0.5445733070373535, + "rewards/rejected": 7.114716529846191, + "step": 5807 + }, + { + "epoch": 0.94, + "learning_rate": 5.704714341866019e-06, + "logits/chosen": -1.3201539516448975, + "logits/rejected": -1.4273313283920288, + "logps/chosen": -80.09928894042969, + "logps/rejected": -92.4443359375, + "loss": 2.352, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.477545976638794, + "rewards/margins": -2.9692084789276123, + "rewards/rejected": 5.446754455566406, + "step": 5808 + }, + { + "epoch": 0.94, + "learning_rate": 5.703413182818045e-06, + "logits/chosen": -1.146267056465149, + "logits/rejected": -1.1648157835006714, + "logps/chosen": -80.36540222167969, + "logps/rejected": -132.5853271484375, + "loss": 0.7693, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.237252950668335, + "rewards/margins": 0.6045717000961304, + "rewards/rejected": 1.6326812505722046, + "step": 5809 + }, + { + "epoch": 0.94, + "learning_rate": 5.702111975170875e-06, + "logits/chosen": -0.9945421814918518, + "logits/rejected": -0.9945421814918518, + "logps/chosen": -3.6875107288360596, + "logps/rejected": -3.6875107288360596, + "loss": 0.3558, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.314374327659607, + "rewards/margins": 0.0, + "rewards/rejected": 1.314374327659607, + "step": 5810 + }, + { + "epoch": 0.94, + "learning_rate": 5.700810719014412e-06, + "logits/chosen": -1.1611329317092896, + "logits/rejected": -0.7503260970115662, + "logps/chosen": -160.15345764160156, + "logps/rejected": -120.2564697265625, + "loss": 0.8104, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0066680908203125, + "rewards/margins": -1.236973524093628, + "rewards/rejected": 3.2436416149139404, + "step": 5811 + }, + { + "epoch": 0.94, + "learning_rate": 5.69950941443856e-06, + "logits/chosen": -0.8403092622756958, + "logits/rejected": -0.762965202331543, + "logps/chosen": -77.43840026855469, + "logps/rejected": -11.738442420959473, + "loss": 0.6319, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6187843680381775, + "rewards/margins": 0.6977465152740479, + "rewards/rejected": -0.07896213978528976, + "step": 5812 + }, + { + "epoch": 0.94, + "learning_rate": 5.6982080615332244e-06, + "logits/chosen": -1.270484209060669, + "logits/rejected": -1.1090031862258911, + "logps/chosen": -122.11360931396484, + "logps/rejected": -53.45680236816406, + "loss": 0.0682, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.606044769287109, + "rewards/margins": 1.9330687522888184, + "rewards/rejected": 3.672976016998291, + "step": 5813 + }, + { + "epoch": 0.94, + "learning_rate": 5.69690666038832e-06, + "logits/chosen": -1.31551194190979, + "logits/rejected": -1.2902408838272095, + "logps/chosen": -46.83604431152344, + "logps/rejected": -59.44797134399414, + "loss": 0.3259, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.106774091720581, + "rewards/margins": 0.28427767753601074, + "rewards/rejected": 2.8224964141845703, + "step": 5814 + }, + { + "epoch": 0.94, + "learning_rate": 5.695605211093759e-06, + "logits/chosen": -1.2246484756469727, + "logits/rejected": -1.2451399564743042, + "logps/chosen": -42.38862609863281, + "logps/rejected": -42.294559478759766, + "loss": 0.8685, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2804977893829346, + "rewards/margins": -0.9022603034973145, + "rewards/rejected": 3.182758092880249, + "step": 5815 + }, + { + "epoch": 0.94, + "learning_rate": 5.69430371373946e-06, + "logits/chosen": -1.2611010074615479, + "logits/rejected": -1.2298805713653564, + "logps/chosen": -119.42448425292969, + "logps/rejected": -119.49901580810547, + "loss": 0.482, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2872025966644287, + "rewards/margins": 1.033850908279419, + "rewards/rejected": 2.2533516883850098, + "step": 5816 + }, + { + "epoch": 0.94, + "learning_rate": 5.693002168415343e-06, + "logits/chosen": -1.3022987842559814, + "logits/rejected": -1.302781581878662, + "logps/chosen": -62.80632781982422, + "logps/rejected": -59.371437072753906, + "loss": 1.1047, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3593071699142456, + "rewards/margins": -2.059697151184082, + "rewards/rejected": 3.419004201889038, + "step": 5817 + }, + { + "epoch": 0.94, + "learning_rate": 5.691700575211335e-06, + "logits/chosen": -1.3590402603149414, + "logits/rejected": -1.3443719148635864, + "logps/chosen": -130.0184326171875, + "logps/rejected": -90.90731811523438, + "loss": 0.304, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8394227027893066, + "rewards/margins": 0.3498108386993408, + "rewards/rejected": 2.489611864089966, + "step": 5818 + }, + { + "epoch": 0.94, + "learning_rate": 5.690398934217362e-06, + "logits/chosen": -1.150676965713501, + "logits/rejected": -1.136779546737671, + "logps/chosen": -46.11550521850586, + "logps/rejected": -85.52137756347656, + "loss": 0.202, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8990329504013062, + "rewards/margins": 2.012073040008545, + "rewards/rejected": -0.11304016411304474, + "step": 5819 + }, + { + "epoch": 0.94, + "learning_rate": 5.689097245523354e-06, + "logits/chosen": -1.099587082862854, + "logits/rejected": -1.0376949310302734, + "logps/chosen": -43.4130859375, + "logps/rejected": -35.0888671875, + "loss": 1.102, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6114776134490967, + "rewards/margins": 0.6345709562301636, + "rewards/rejected": 1.976906657218933, + "step": 5820 + }, + { + "epoch": 0.94, + "learning_rate": 5.687795509219247e-06, + "logits/chosen": -0.9989797472953796, + "logits/rejected": -1.032914400100708, + "logps/chosen": -122.23873138427734, + "logps/rejected": -108.30827331542969, + "loss": 3.1523, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0761817693710327, + "rewards/margins": -4.596851348876953, + "rewards/rejected": 5.673033237457275, + "step": 5821 + }, + { + "epoch": 0.94, + "learning_rate": 5.686493725394978e-06, + "logits/chosen": -1.1245063543319702, + "logits/rejected": -0.8397395014762878, + "logps/chosen": -84.46111297607422, + "logps/rejected": -42.158721923828125, + "loss": 2.4593, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7103424072265625, + "rewards/margins": 0.5827414989471436, + "rewards/rejected": 2.127600908279419, + "step": 5822 + }, + { + "epoch": 0.95, + "learning_rate": 5.6851918941404896e-06, + "logits/chosen": -1.1743018627166748, + "logits/rejected": -1.1379953622817993, + "logps/chosen": -98.99806213378906, + "logps/rejected": -77.87478637695312, + "loss": 0.6007, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0442612171173096, + "rewards/margins": 2.3257155418395996, + "rewards/rejected": 0.7185455560684204, + "step": 5823 + }, + { + "epoch": 0.95, + "learning_rate": 5.683890015545724e-06, + "logits/chosen": -1.1022627353668213, + "logits/rejected": -1.0847232341766357, + "logps/chosen": -65.49546813964844, + "logps/rejected": -44.09685516357422, + "loss": 0.5284, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7212471961975098, + "rewards/margins": -0.6146330833435059, + "rewards/rejected": 3.3358802795410156, + "step": 5824 + }, + { + "epoch": 0.95, + "learning_rate": 5.682588089700629e-06, + "logits/chosen": -1.23738694190979, + "logits/rejected": -1.0875601768493652, + "logps/chosen": -119.49642944335938, + "logps/rejected": -43.59264373779297, + "loss": 0.05, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.741092205047607, + "rewards/margins": 4.407742500305176, + "rewards/rejected": 2.3333497047424316, + "step": 5825 + }, + { + "epoch": 0.95, + "learning_rate": 5.681286116695155e-06, + "logits/chosen": -0.6712246537208557, + "logits/rejected": -0.6146290898323059, + "logps/chosen": -17.975048065185547, + "logps/rejected": -12.85837173461914, + "loss": 1.0275, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0264781713485718, + "rewards/margins": 0.2990201711654663, + "rewards/rejected": 0.7274580001831055, + "step": 5826 + }, + { + "epoch": 0.95, + "learning_rate": 5.679984096619258e-06, + "logits/chosen": -1.18174409866333, + "logits/rejected": -1.0995122194290161, + "logps/chosen": -30.787635803222656, + "logps/rejected": -18.81073760986328, + "loss": 0.0303, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.821901798248291, + "rewards/margins": 2.7998743057250977, + "rewards/rejected": 1.0220273733139038, + "step": 5827 + }, + { + "epoch": 0.95, + "learning_rate": 5.678682029562894e-06, + "logits/chosen": -1.047215461730957, + "logits/rejected": -1.0474554300308228, + "logps/chosen": -3.9295079708099365, + "logps/rejected": -6.553755760192871, + "loss": 0.4894, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.29873502254486084, + "rewards/margins": -0.08251088857650757, + "rewards/rejected": 0.3812459111213684, + "step": 5828 + }, + { + "epoch": 0.95, + "learning_rate": 5.677379915616023e-06, + "logits/chosen": -1.2971765995025635, + "logits/rejected": -1.2998528480529785, + "logps/chosen": -62.579254150390625, + "logps/rejected": -55.99855041503906, + "loss": 0.4426, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0689361095428467, + "rewards/margins": 0.3987898826599121, + "rewards/rejected": 1.6701462268829346, + "step": 5829 + }, + { + "epoch": 0.95, + "learning_rate": 5.6760777548686096e-06, + "logits/chosen": -1.3748277425765991, + "logits/rejected": -1.3992756605148315, + "logps/chosen": -52.110862731933594, + "logps/rejected": -56.62806701660156, + "loss": 0.5279, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0182130336761475, + "rewards/margins": -0.536916971206665, + "rewards/rejected": 2.5551300048828125, + "step": 5830 + }, + { + "epoch": 0.95, + "learning_rate": 5.674775547410619e-06, + "logits/chosen": -0.8469188809394836, + "logits/rejected": -0.8347278833389282, + "logps/chosen": -14.519495010375977, + "logps/rejected": -12.252731323242188, + "loss": 2.7563, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8065196871757507, + "rewards/margins": -0.2079898715019226, + "rewards/rejected": 1.0145095586776733, + "step": 5831 + }, + { + "epoch": 0.95, + "learning_rate": 5.673473293332025e-06, + "logits/chosen": -1.1202329397201538, + "logits/rejected": -1.1125622987747192, + "logps/chosen": -95.67868041992188, + "logps/rejected": -59.74125289916992, + "loss": 3.0512, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3648757934570312, + "rewards/margins": 0.3400142192840576, + "rewards/rejected": 2.0248615741729736, + "step": 5832 + }, + { + "epoch": 0.95, + "learning_rate": 5.672170992722797e-06, + "logits/chosen": -1.1962810754776, + "logits/rejected": -1.2013136148452759, + "logps/chosen": -72.39220428466797, + "logps/rejected": -87.21055603027344, + "loss": 0.2857, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7208480834960938, + "rewards/margins": 0.28001248836517334, + "rewards/rejected": 1.4408355951309204, + "step": 5833 + }, + { + "epoch": 0.95, + "learning_rate": 5.670868645672916e-06, + "logits/chosen": -1.007234811782837, + "logits/rejected": -0.9692358374595642, + "logps/chosen": -22.7725772857666, + "logps/rejected": -7.921059608459473, + "loss": 0.7401, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4999847412109375, + "rewards/margins": 0.13530606031417847, + "rewards/rejected": 0.36467868089675903, + "step": 5834 + }, + { + "epoch": 0.95, + "learning_rate": 5.669566252272357e-06, + "logits/chosen": -1.3416271209716797, + "logits/rejected": -1.3563565015792847, + "logps/chosen": -69.97821044921875, + "logps/rejected": -191.8649139404297, + "loss": 0.4047, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.376183986663818, + "rewards/margins": 0.0599212646484375, + "rewards/rejected": 5.316262722015381, + "step": 5835 + }, + { + "epoch": 0.95, + "learning_rate": 5.6682638126111055e-06, + "logits/chosen": -1.3337767124176025, + "logits/rejected": -1.2030013799667358, + "logps/chosen": -77.814697265625, + "logps/rejected": -129.27023315429688, + "loss": 0.0529, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.870915412902832, + "rewards/margins": 2.1980714797973633, + "rewards/rejected": 5.672843933105469, + "step": 5836 + }, + { + "epoch": 0.95, + "learning_rate": 5.666961326779148e-06, + "logits/chosen": -1.3604031801223755, + "logits/rejected": -1.3172473907470703, + "logps/chosen": -60.28124237060547, + "logps/rejected": -57.38423156738281, + "loss": 0.5073, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2814438343048096, + "rewards/margins": -0.12740707397460938, + "rewards/rejected": 2.408850908279419, + "step": 5837 + }, + { + "epoch": 0.95, + "learning_rate": 5.665658794866474e-06, + "logits/chosen": -1.0040723085403442, + "logits/rejected": -1.0352596044540405, + "logps/chosen": -103.93716430664062, + "logps/rejected": -119.46485900878906, + "loss": 0.4772, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1389167308807373, + "rewards/margins": 0.1528998613357544, + "rewards/rejected": 1.986016869544983, + "step": 5838 + }, + { + "epoch": 0.95, + "learning_rate": 5.664356216963076e-06, + "logits/chosen": -1.119340181350708, + "logits/rejected": -1.1228183507919312, + "logps/chosen": -114.19427490234375, + "logps/rejected": -43.502593994140625, + "loss": 0.1747, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.562739610671997, + "rewards/margins": 1.7056012153625488, + "rewards/rejected": 0.857138454914093, + "step": 5839 + }, + { + "epoch": 0.95, + "learning_rate": 5.663053593158951e-06, + "logits/chosen": -1.2101688385009766, + "logits/rejected": -0.9653652310371399, + "logps/chosen": -103.69406127929688, + "logps/rejected": -55.87982940673828, + "loss": 0.1089, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.6633453369140625, + "rewards/margins": 1.4662039279937744, + "rewards/rejected": 3.197141408920288, + "step": 5840 + }, + { + "epoch": 0.95, + "learning_rate": 5.6617509235440955e-06, + "logits/chosen": -1.2623099088668823, + "logits/rejected": -1.2414196729660034, + "logps/chosen": -92.93229675292969, + "logps/rejected": -114.78164672851562, + "loss": 0.6714, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0177536010742188, + "rewards/margins": 0.5799834728240967, + "rewards/rejected": 0.4377700984477997, + "step": 5841 + }, + { + "epoch": 0.95, + "learning_rate": 5.660448208208513e-06, + "logits/chosen": -1.4122058153152466, + "logits/rejected": -1.137558102607727, + "logps/chosen": -88.30271911621094, + "logps/rejected": -63.70903015136719, + "loss": 0.2889, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5176475048065186, + "rewards/margins": 0.32965755462646484, + "rewards/rejected": 2.1879899501800537, + "step": 5842 + }, + { + "epoch": 0.95, + "learning_rate": 5.659145447242208e-06, + "logits/chosen": -1.3522831201553345, + "logits/rejected": -1.464229702949524, + "logps/chosen": -350.5638732910156, + "logps/rejected": -53.276939392089844, + "loss": 1.6114, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.254892349243164, + "rewards/margins": 5.930375099182129, + "rewards/rejected": 3.324517011642456, + "step": 5843 + }, + { + "epoch": 0.95, + "learning_rate": 5.65784264073519e-06, + "logits/chosen": -1.1648293733596802, + "logits/rejected": -1.122592806816101, + "logps/chosen": -34.265872955322266, + "logps/rejected": -37.089439392089844, + "loss": 0.6526, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.407810688018799, + "rewards/margins": 0.625944972038269, + "rewards/rejected": 1.7818657159805298, + "step": 5844 + }, + { + "epoch": 0.95, + "learning_rate": 5.6565397887774686e-06, + "logits/chosen": -1.0479310750961304, + "logits/rejected": -0.9225828647613525, + "logps/chosen": -68.52876281738281, + "logps/rejected": -17.038299560546875, + "loss": 0.5791, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.98136305809021, + "rewards/margins": 2.843310594558716, + "rewards/rejected": 0.13805237412452698, + "step": 5845 + }, + { + "epoch": 0.95, + "learning_rate": 5.655236891459062e-06, + "logits/chosen": -1.052167296409607, + "logits/rejected": -1.0742427110671997, + "logps/chosen": -71.3816146850586, + "logps/rejected": -120.83602142333984, + "loss": 1.8464, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.481903076171875, + "rewards/margins": -3.4681100845336914, + "rewards/rejected": 5.950013160705566, + "step": 5846 + }, + { + "epoch": 0.95, + "learning_rate": 5.653933948869984e-06, + "logits/chosen": -1.2062792778015137, + "logits/rejected": -1.1393446922302246, + "logps/chosen": -53.145015716552734, + "logps/rejected": -53.49644088745117, + "loss": 0.2239, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.23754620552063, + "rewards/margins": 0.5841445922851562, + "rewards/rejected": 1.6534016132354736, + "step": 5847 + }, + { + "epoch": 0.95, + "learning_rate": 5.65263096110026e-06, + "logits/chosen": -1.4006152153015137, + "logits/rejected": -1.4007612466812134, + "logps/chosen": -87.33914947509766, + "logps/rejected": -81.35736083984375, + "loss": 1.1783, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6631721258163452, + "rewards/margins": -0.43316590785980225, + "rewards/rejected": 2.0963380336761475, + "step": 5848 + }, + { + "epoch": 0.95, + "learning_rate": 5.6513279282399095e-06, + "logits/chosen": -0.9091323614120483, + "logits/rejected": -0.9681212902069092, + "logps/chosen": -92.56148529052734, + "logps/rejected": -69.4682846069336, + "loss": 0.164, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1307685375213623, + "rewards/margins": 1.173966884613037, + "rewards/rejected": 1.9568016529083252, + "step": 5849 + }, + { + "epoch": 0.95, + "learning_rate": 5.650024850378964e-06, + "logits/chosen": -1.1251620054244995, + "logits/rejected": -1.0379188060760498, + "logps/chosen": -101.10847473144531, + "logps/rejected": -43.97165298461914, + "loss": 0.725, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.099992275238037, + "rewards/margins": 1.3917460441589355, + "rewards/rejected": 3.7082462310791016, + "step": 5850 + }, + { + "epoch": 0.95, + "learning_rate": 5.64872172760745e-06, + "logits/chosen": -1.181295394897461, + "logits/rejected": -0.8740751147270203, + "logps/chosen": -132.03448486328125, + "logps/rejected": -28.60643768310547, + "loss": 0.1104, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.053143501281738, + "rewards/margins": 1.5859224796295166, + "rewards/rejected": 3.4672210216522217, + "step": 5851 + }, + { + "epoch": 0.95, + "learning_rate": 5.647418560015405e-06, + "logits/chosen": -1.2011083364486694, + "logits/rejected": -1.1615628004074097, + "logps/chosen": -24.4132022857666, + "logps/rejected": -45.82206726074219, + "loss": 0.7008, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0296308994293213, + "rewards/margins": 0.48596668243408203, + "rewards/rejected": 1.5436642169952393, + "step": 5852 + }, + { + "epoch": 0.95, + "learning_rate": 5.646115347692862e-06, + "logits/chosen": -1.045231580734253, + "logits/rejected": -0.9006931781768799, + "logps/chosen": -56.23055648803711, + "logps/rejected": -16.619043350219727, + "loss": 0.5385, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6746959686279297, + "rewards/margins": 0.7591851353645325, + "rewards/rejected": -0.08448915928602219, + "step": 5853 + }, + { + "epoch": 0.95, + "learning_rate": 5.644812090729863e-06, + "logits/chosen": -1.4529021978378296, + "logits/rejected": -1.553553581237793, + "logps/chosen": -70.7301025390625, + "logps/rejected": -100.43896484375, + "loss": 0.5725, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.085839748382568, + "rewards/margins": -0.7291855812072754, + "rewards/rejected": 5.815025329589844, + "step": 5854 + }, + { + "epoch": 0.95, + "learning_rate": 5.64350878921645e-06, + "logits/chosen": -1.2021942138671875, + "logits/rejected": -1.1409088373184204, + "logps/chosen": -43.408233642578125, + "logps/rejected": -49.58016586303711, + "loss": 0.1533, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.058750867843628, + "rewards/margins": 1.0312602519989014, + "rewards/rejected": 2.0274906158447266, + "step": 5855 + }, + { + "epoch": 0.95, + "learning_rate": 5.642205443242668e-06, + "logits/chosen": -1.2303534746170044, + "logits/rejected": -1.0846277475357056, + "logps/chosen": -85.07568359375, + "logps/rejected": -72.65189361572266, + "loss": 0.404, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7526214122772217, + "rewards/margins": 0.4751930236816406, + "rewards/rejected": 2.277428388595581, + "step": 5856 + }, + { + "epoch": 0.95, + "learning_rate": 5.6409020528985655e-06, + "logits/chosen": -1.7067197561264038, + "logits/rejected": -1.739084005355835, + "logps/chosen": -153.61004638671875, + "logps/rejected": -157.9822998046875, + "loss": 0.4083, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.60321044921875, + "rewards/margins": -0.22301626205444336, + "rewards/rejected": 6.826226711273193, + "step": 5857 + }, + { + "epoch": 0.95, + "learning_rate": 5.6395986182741965e-06, + "logits/chosen": -1.3555630445480347, + "logits/rejected": -1.4726731777191162, + "logps/chosen": -302.2497253417969, + "logps/rejected": -75.45579528808594, + "loss": 0.2443, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.6369781494140625, + "rewards/margins": 0.47637462615966797, + "rewards/rejected": 6.1606035232543945, + "step": 5858 + }, + { + "epoch": 0.95, + "learning_rate": 5.6382951394596155e-06, + "logits/chosen": -1.152795433998108, + "logits/rejected": -1.0809487104415894, + "logps/chosen": -62.48961639404297, + "logps/rejected": -42.54096984863281, + "loss": 0.4315, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.485800266265869, + "rewards/margins": 0.03797459602355957, + "rewards/rejected": 2.4478256702423096, + "step": 5859 + }, + { + "epoch": 0.95, + "learning_rate": 5.636991616544878e-06, + "logits/chosen": -1.2503859996795654, + "logits/rejected": -1.2472736835479736, + "logps/chosen": -49.351165771484375, + "logps/rejected": -52.513980865478516, + "loss": 2.1456, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5950868129730225, + "rewards/margins": -0.6133241653442383, + "rewards/rejected": 3.2084109783172607, + "step": 5860 + }, + { + "epoch": 0.95, + "learning_rate": 5.635688049620049e-06, + "logits/chosen": -1.3632830381393433, + "logits/rejected": -1.247679352760315, + "logps/chosen": -44.49281311035156, + "logps/rejected": -43.742408752441406, + "loss": 0.4513, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7171380519866943, + "rewards/margins": 0.36774444580078125, + "rewards/rejected": 2.349393606185913, + "step": 5861 + }, + { + "epoch": 0.95, + "learning_rate": 5.6343844387751905e-06, + "logits/chosen": -1.23091721534729, + "logits/rejected": -1.3691251277923584, + "logps/chosen": -53.02149200439453, + "logps/rejected": -116.7694091796875, + "loss": 1.2327, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9846222400665283, + "rewards/margins": -0.3621490001678467, + "rewards/rejected": 3.346771240234375, + "step": 5862 + }, + { + "epoch": 0.95, + "learning_rate": 5.633080784100368e-06, + "logits/chosen": -1.000106692314148, + "logits/rejected": -0.995389997959137, + "logps/chosen": -66.51731872558594, + "logps/rejected": -48.041961669921875, + "loss": 0.8856, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4324318170547485, + "rewards/margins": -0.6311119794845581, + "rewards/rejected": 2.0635437965393066, + "step": 5863 + }, + { + "epoch": 0.95, + "learning_rate": 5.631777085685654e-06, + "logits/chosen": -1.0291179418563843, + "logits/rejected": -1.0294770002365112, + "logps/chosen": -74.48941802978516, + "logps/rejected": -83.32647705078125, + "loss": 0.7032, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.48955613374710083, + "rewards/margins": -0.03963470458984375, + "rewards/rejected": 0.5291908383369446, + "step": 5864 + }, + { + "epoch": 0.95, + "learning_rate": 5.6304733436211234e-06, + "logits/chosen": -1.1266906261444092, + "logits/rejected": -1.0700629949569702, + "logps/chosen": -80.9441146850586, + "logps/rejected": -26.822830200195312, + "loss": 0.8011, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.721459984779358, + "rewards/margins": -0.704196572303772, + "rewards/rejected": 2.42565655708313, + "step": 5865 + }, + { + "epoch": 0.95, + "learning_rate": 5.629169557996848e-06, + "logits/chosen": -1.0726381540298462, + "logits/rejected": -1.0281063318252563, + "logps/chosen": -49.411407470703125, + "logps/rejected": -37.122928619384766, + "loss": 0.5895, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6059623956680298, + "rewards/margins": -0.6950465440750122, + "rewards/rejected": 2.301008939743042, + "step": 5866 + }, + { + "epoch": 0.95, + "learning_rate": 5.627865728902912e-06, + "logits/chosen": -0.733674168586731, + "logits/rejected": -1.0143754482269287, + "logps/chosen": -105.28887939453125, + "logps/rejected": -151.9201202392578, + "loss": 1.7093, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.045309543609619, + "rewards/margins": -2.979658603668213, + "rewards/rejected": 5.024968147277832, + "step": 5867 + }, + { + "epoch": 0.95, + "learning_rate": 5.626561856429393e-06, + "logits/chosen": -1.1732120513916016, + "logits/rejected": -1.1333249807357788, + "logps/chosen": -56.21701431274414, + "logps/rejected": -70.39602661132812, + "loss": 0.9532, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4490063190460205, + "rewards/margins": 1.16115140914917, + "rewards/rejected": 2.2878549098968506, + "step": 5868 + }, + { + "epoch": 0.95, + "learning_rate": 5.625257940666379e-06, + "logits/chosen": -0.7577453255653381, + "logits/rejected": -0.6927330493927002, + "logps/chosen": -42.720760345458984, + "logps/rejected": -67.18909454345703, + "loss": 0.088, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0365102291107178, + "rewards/margins": 2.098505973815918, + "rewards/rejected": 0.9380043148994446, + "step": 5869 + }, + { + "epoch": 0.95, + "learning_rate": 5.623953981703958e-06, + "logits/chosen": -1.1210131645202637, + "logits/rejected": -1.0774966478347778, + "logps/chosen": -52.3330078125, + "logps/rejected": -53.826290130615234, + "loss": 0.6094, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9982277154922485, + "rewards/margins": -0.8020884990692139, + "rewards/rejected": 1.8003162145614624, + "step": 5870 + }, + { + "epoch": 0.95, + "learning_rate": 5.62264997963222e-06, + "logits/chosen": -1.0496652126312256, + "logits/rejected": -0.9993926882743835, + "logps/chosen": -36.24037170410156, + "logps/rejected": -12.898738861083984, + "loss": 0.3715, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1147960424423218, + "rewards/margins": 0.338523805141449, + "rewards/rejected": 0.7762722373008728, + "step": 5871 + }, + { + "epoch": 0.95, + "learning_rate": 5.62134593454126e-06, + "logits/chosen": -1.4496675729751587, + "logits/rejected": -1.3869268894195557, + "logps/chosen": -100.50390625, + "logps/rejected": -87.48330688476562, + "loss": 0.3162, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.448327541351318, + "rewards/margins": 0.4455413818359375, + "rewards/rejected": 4.002786159515381, + "step": 5872 + }, + { + "epoch": 0.95, + "learning_rate": 5.620041846521176e-06, + "logits/chosen": -1.230855941772461, + "logits/rejected": -1.309976577758789, + "logps/chosen": -40.99135208129883, + "logps/rejected": -154.0465087890625, + "loss": 0.7666, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.022418737411499, + "rewards/margins": -0.9275691509246826, + "rewards/rejected": 2.9499878883361816, + "step": 5873 + }, + { + "epoch": 0.95, + "learning_rate": 5.618737715662067e-06, + "logits/chosen": -1.1248444318771362, + "logits/rejected": -1.161072015762329, + "logps/chosen": -71.92190551757812, + "logps/rejected": -67.01901245117188, + "loss": 0.8126, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5072433948516846, + "rewards/margins": -0.7819352149963379, + "rewards/rejected": 2.2891786098480225, + "step": 5874 + }, + { + "epoch": 0.95, + "learning_rate": 5.617433542054036e-06, + "logits/chosen": -1.4960092306137085, + "logits/rejected": -1.5840635299682617, + "logps/chosen": -89.40575408935547, + "logps/rejected": -103.34626770019531, + "loss": 0.9902, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.983200788497925, + "rewards/margins": -1.8258049488067627, + "rewards/rejected": 4.8090057373046875, + "step": 5875 + }, + { + "epoch": 0.95, + "learning_rate": 5.61612932578719e-06, + "logits/chosen": -1.2675645351409912, + "logits/rejected": -1.2210556268692017, + "logps/chosen": -134.85580444335938, + "logps/rejected": -80.98033142089844, + "loss": 0.0531, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.534759521484375, + "rewards/margins": 2.5054619312286377, + "rewards/rejected": 2.0292975902557373, + "step": 5876 + }, + { + "epoch": 0.95, + "learning_rate": 5.6148250669516365e-06, + "logits/chosen": -0.9779187440872192, + "logits/rejected": -1.0471817255020142, + "logps/chosen": -92.49336242675781, + "logps/rejected": -43.72832489013672, + "loss": 0.516, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4629898071289062, + "rewards/margins": -0.34760355949401855, + "rewards/rejected": 2.810593366622925, + "step": 5877 + }, + { + "epoch": 0.95, + "learning_rate": 5.613520765637489e-06, + "logits/chosen": -1.363956332206726, + "logits/rejected": -1.0370478630065918, + "logps/chosen": -97.70149993896484, + "logps/rejected": -34.88140106201172, + "loss": 0.1187, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.889766693115234, + "rewards/margins": 3.713552951812744, + "rewards/rejected": 1.1762138605117798, + "step": 5878 + }, + { + "epoch": 0.95, + "learning_rate": 5.612216421934862e-06, + "logits/chosen": -1.093004584312439, + "logits/rejected": -1.0442711114883423, + "logps/chosen": -65.09651947021484, + "logps/rejected": -69.44960021972656, + "loss": 0.3356, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0593955516815186, + "rewards/margins": 0.044597625732421875, + "rewards/rejected": 2.0147979259490967, + "step": 5879 + }, + { + "epoch": 0.95, + "learning_rate": 5.610912035933872e-06, + "logits/chosen": -1.362518310546875, + "logits/rejected": -1.3436816930770874, + "logps/chosen": -113.68980407714844, + "logps/rejected": -64.52264404296875, + "loss": 0.6279, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7267807722091675, + "rewards/margins": -0.9093643426895142, + "rewards/rejected": 2.6361451148986816, + "step": 5880 + }, + { + "epoch": 0.95, + "learning_rate": 5.6096076077246416e-06, + "logits/chosen": -1.187007188796997, + "logits/rejected": -1.1295875310897827, + "logps/chosen": -70.90081024169922, + "logps/rejected": -65.9388198852539, + "loss": 0.5566, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.943714141845703, + "rewards/margins": 3.955435276031494, + "rewards/rejected": 0.9882789850234985, + "step": 5881 + }, + { + "epoch": 0.95, + "learning_rate": 5.608303137397294e-06, + "logits/chosen": -1.0856739282608032, + "logits/rejected": -1.2125959396362305, + "logps/chosen": -135.60293579101562, + "logps/rejected": -98.66921997070312, + "loss": 0.4844, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.444714546203613, + "rewards/margins": -0.15648794174194336, + "rewards/rejected": 5.601202487945557, + "step": 5882 + }, + { + "epoch": 0.95, + "learning_rate": 5.606998625041955e-06, + "logits/chosen": -1.5855059623718262, + "logits/rejected": -1.639827013015747, + "logps/chosen": -159.5576629638672, + "logps/rejected": -74.283203125, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.743872165679932, + "rewards/margins": 4.885195732116699, + "rewards/rejected": 1.8586761951446533, + "step": 5883 + }, + { + "epoch": 0.96, + "learning_rate": 5.605694070748755e-06, + "logits/chosen": -1.5489445924758911, + "logits/rejected": -1.4547537565231323, + "logps/chosen": -102.09349060058594, + "logps/rejected": -100.25745391845703, + "loss": 0.495, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.135575771331787, + "rewards/margins": 0.9192161560058594, + "rewards/rejected": 5.216359615325928, + "step": 5884 + }, + { + "epoch": 0.96, + "learning_rate": 5.6043894746078256e-06, + "logits/chosen": -0.8258888125419617, + "logits/rejected": -0.847307562828064, + "logps/chosen": -62.979183197021484, + "logps/rejected": -93.11410522460938, + "loss": 1.4952, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4971240758895874, + "rewards/margins": -1.4149304628372192, + "rewards/rejected": 2.9120545387268066, + "step": 5885 + }, + { + "epoch": 0.96, + "learning_rate": 5.603084836709301e-06, + "logits/chosen": -0.6204648017883301, + "logits/rejected": -0.6204648017883301, + "logps/chosen": -0.38442736864089966, + "logps/rejected": -0.38442736864089966, + "loss": 0.4049, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.14449289441108704, + "rewards/margins": 0.0, + "rewards/rejected": 0.14449289441108704, + "step": 5886 + }, + { + "epoch": 0.96, + "learning_rate": 5.6017801571433224e-06, + "logits/chosen": -1.1570221185684204, + "logits/rejected": -1.046083688735962, + "logps/chosen": -76.92488861083984, + "logps/rejected": -32.38251495361328, + "loss": 3.1944, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0066566467285156, + "rewards/margins": 1.6377251148223877, + "rewards/rejected": 0.3689315915107727, + "step": 5887 + }, + { + "epoch": 0.96, + "learning_rate": 5.600475436000029e-06, + "logits/chosen": -1.1875061988830566, + "logits/rejected": -1.1626332998275757, + "logps/chosen": -73.35763549804688, + "logps/rejected": -40.62287521362305, + "loss": 0.7309, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2739776372909546, + "rewards/margins": -0.21553850173950195, + "rewards/rejected": 1.4895161390304565, + "step": 5888 + }, + { + "epoch": 0.96, + "learning_rate": 5.599170673369564e-06, + "logits/chosen": -1.1467655897140503, + "logits/rejected": -1.1475474834442139, + "logps/chosen": -3.7078330516815186, + "logps/rejected": -1.1882164478302002, + "loss": 0.4081, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.12032931298017502, + "rewards/margins": -0.2024100124835968, + "rewards/rejected": 0.3227393329143524, + "step": 5889 + }, + { + "epoch": 0.96, + "learning_rate": 5.597865869342075e-06, + "logits/chosen": -1.2726550102233887, + "logits/rejected": -1.2726550102233887, + "logps/chosen": -63.097412109375, + "logps/rejected": -63.097412109375, + "loss": 0.3535, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.9254348278045654, + "rewards/margins": 0.0, + "rewards/rejected": 3.9254348278045654, + "step": 5890 + }, + { + "epoch": 0.96, + "learning_rate": 5.596561024007711e-06, + "logits/chosen": -1.2736053466796875, + "logits/rejected": -1.3010755777359009, + "logps/chosen": -173.57626342773438, + "logps/rejected": -61.20146942138672, + "loss": 0.1862, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.658532619476318, + "rewards/margins": 1.1008434295654297, + "rewards/rejected": 4.557689189910889, + "step": 5891 + }, + { + "epoch": 0.96, + "learning_rate": 5.595256137456626e-06, + "logits/chosen": -1.1120017766952515, + "logits/rejected": -1.1405266523361206, + "logps/chosen": -116.94239807128906, + "logps/rejected": -105.94798278808594, + "loss": 0.3742, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4554460048675537, + "rewards/margins": -0.06471395492553711, + "rewards/rejected": 2.520159959793091, + "step": 5892 + }, + { + "epoch": 0.96, + "learning_rate": 5.5939512097789735e-06, + "logits/chosen": -1.1697590351104736, + "logits/rejected": -1.0683308839797974, + "logps/chosen": -145.09840393066406, + "logps/rejected": -39.52881622314453, + "loss": 0.123, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.543125867843628, + "rewards/margins": 1.2820746898651123, + "rewards/rejected": 1.2610511779785156, + "step": 5893 + }, + { + "epoch": 0.96, + "learning_rate": 5.592646241064913e-06, + "logits/chosen": -1.185147762298584, + "logits/rejected": -1.1820296049118042, + "logps/chosen": -62.75605010986328, + "logps/rejected": -73.86314392089844, + "loss": 1.1049, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7893829345703125, + "rewards/margins": -2.089550018310547, + "rewards/rejected": 3.8789329528808594, + "step": 5894 + }, + { + "epoch": 0.96, + "learning_rate": 5.591341231404604e-06, + "logits/chosen": -1.0064038038253784, + "logits/rejected": -1.053187370300293, + "logps/chosen": -63.09260940551758, + "logps/rejected": -97.6207504272461, + "loss": 0.5141, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0045101642608643, + "rewards/margins": 0.944382905960083, + "rewards/rejected": 2.0601272583007812, + "step": 5895 + }, + { + "epoch": 0.96, + "learning_rate": 5.590036180888212e-06, + "logits/chosen": -0.917547881603241, + "logits/rejected": -0.9183656573295593, + "logps/chosen": -36.9069709777832, + "logps/rejected": -78.43659210205078, + "loss": 0.5401, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4656574726104736, + "rewards/margins": -0.6634707450866699, + "rewards/rejected": 3.1291282176971436, + "step": 5896 + }, + { + "epoch": 0.96, + "learning_rate": 5.588731089605903e-06, + "logits/chosen": -1.1294021606445312, + "logits/rejected": -1.1021045446395874, + "logps/chosen": -66.72820281982422, + "logps/rejected": -72.01543426513672, + "loss": 0.5785, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6524658203125, + "rewards/margins": 1.1494652032852173, + "rewards/rejected": 1.5030006170272827, + "step": 5897 + }, + { + "epoch": 0.96, + "learning_rate": 5.5874259576478465e-06, + "logits/chosen": -1.2341728210449219, + "logits/rejected": -1.3007746934890747, + "logps/chosen": -75.47476196289062, + "logps/rejected": -94.13518524169922, + "loss": 1.5571, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0123679637908936, + "rewards/margins": -1.7509384155273438, + "rewards/rejected": 3.7633063793182373, + "step": 5898 + }, + { + "epoch": 0.96, + "learning_rate": 5.586120785104213e-06, + "logits/chosen": -1.0228511095046997, + "logits/rejected": -0.9589236974716187, + "logps/chosen": -39.92676544189453, + "logps/rejected": -15.975863456726074, + "loss": 2.1122, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.11323618888855, + "rewards/margins": 2.247581958770752, + "rewards/rejected": 0.8656542897224426, + "step": 5899 + }, + { + "epoch": 0.96, + "learning_rate": 5.5848155720651805e-06, + "logits/chosen": -1.221658706665039, + "logits/rejected": -1.116544485092163, + "logps/chosen": -102.35630798339844, + "logps/rejected": -93.02397918701172, + "loss": 0.7835, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.115593671798706, + "rewards/margins": 1.436159372329712, + "rewards/rejected": 0.6794342398643494, + "step": 5900 + }, + { + "epoch": 0.96, + "learning_rate": 5.583510318620926e-06, + "logits/chosen": -1.0917960405349731, + "logits/rejected": -1.0883324146270752, + "logps/chosen": -52.98229217529297, + "logps/rejected": -101.61808013916016, + "loss": 1.8249, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.098989248275757, + "rewards/margins": -1.1378614902496338, + "rewards/rejected": 3.2368507385253906, + "step": 5901 + }, + { + "epoch": 0.96, + "learning_rate": 5.582205024861629e-06, + "logits/chosen": -1.4912352561950684, + "logits/rejected": -1.563408613204956, + "logps/chosen": -132.234375, + "logps/rejected": -101.58515930175781, + "loss": 0.7807, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.3056230545043945, + "rewards/margins": -1.3134064674377441, + "rewards/rejected": 7.619029521942139, + "step": 5902 + }, + { + "epoch": 0.96, + "learning_rate": 5.580899690877473e-06, + "logits/chosen": -0.8351227045059204, + "logits/rejected": -0.8430628180503845, + "logps/chosen": -45.614967346191406, + "logps/rejected": -77.04765319824219, + "loss": 0.7389, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5656380653381348, + "rewards/margins": -1.1525108814239502, + "rewards/rejected": 3.718148946762085, + "step": 5903 + }, + { + "epoch": 0.96, + "learning_rate": 5.579594316758647e-06, + "logits/chosen": -1.2859418392181396, + "logits/rejected": -1.241513729095459, + "logps/chosen": -101.36215209960938, + "logps/rejected": -101.27272033691406, + "loss": 0.1795, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.465719699859619, + "rewards/margins": 1.3310956954956055, + "rewards/rejected": 4.134624004364014, + "step": 5904 + }, + { + "epoch": 0.96, + "learning_rate": 5.578288902595336e-06, + "logits/chosen": -1.0261342525482178, + "logits/rejected": -1.0106863975524902, + "logps/chosen": -57.01362609863281, + "logps/rejected": -76.94017791748047, + "loss": 1.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1787773370742798, + "rewards/margins": 0.33921587467193604, + "rewards/rejected": 0.8395614624023438, + "step": 5905 + }, + { + "epoch": 0.96, + "learning_rate": 5.5769834484777344e-06, + "logits/chosen": -1.1440309286117554, + "logits/rejected": -1.2201443910598755, + "logps/chosen": -59.90447998046875, + "logps/rejected": -64.48517608642578, + "loss": 1.0401, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5032639503479004, + "rewards/margins": -1.7293319702148438, + "rewards/rejected": 5.232595920562744, + "step": 5906 + }, + { + "epoch": 0.96, + "learning_rate": 5.5756779544960345e-06, + "logits/chosen": -1.3056093454360962, + "logits/rejected": -1.2207177877426147, + "logps/chosen": -107.26046752929688, + "logps/rejected": -69.87581634521484, + "loss": 0.0906, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.690582275390625, + "rewards/margins": 1.6551430225372314, + "rewards/rejected": 2.0354392528533936, + "step": 5907 + }, + { + "epoch": 0.96, + "learning_rate": 5.574372420740437e-06, + "logits/chosen": -1.4114770889282227, + "logits/rejected": -1.277268886566162, + "logps/chosen": -170.1945037841797, + "logps/rejected": -65.78297424316406, + "loss": 0.2906, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.595396518707275, + "rewards/margins": 1.156367301940918, + "rewards/rejected": 5.439029216766357, + "step": 5908 + }, + { + "epoch": 0.96, + "learning_rate": 5.573066847301139e-06, + "logits/chosen": -1.2406539916992188, + "logits/rejected": -1.0751197338104248, + "logps/chosen": -124.1653823852539, + "logps/rejected": -88.66607666015625, + "loss": 0.2154, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.823842525482178, + "rewards/margins": 0.7475805282592773, + "rewards/rejected": 4.0762619972229, + "step": 5909 + }, + { + "epoch": 0.96, + "learning_rate": 5.571761234268345e-06, + "logits/chosen": -0.7430779337882996, + "logits/rejected": -0.6562444567680359, + "logps/chosen": -45.543701171875, + "logps/rejected": -45.25769805908203, + "loss": 0.2607, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9967918395996094, + "rewards/margins": 0.5597419738769531, + "rewards/rejected": 1.4370498657226562, + "step": 5910 + }, + { + "epoch": 0.96, + "learning_rate": 5.570455581732259e-06, + "logits/chosen": -1.3368372917175293, + "logits/rejected": -1.3365137577056885, + "logps/chosen": -81.24462127685547, + "logps/rejected": -87.47393798828125, + "loss": 3.0366, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5012366771698, + "rewards/margins": 0.7849448919296265, + "rewards/rejected": 1.7162917852401733, + "step": 5911 + }, + { + "epoch": 0.96, + "learning_rate": 5.56914988978309e-06, + "logits/chosen": -1.1634763479232788, + "logits/rejected": -1.135885238647461, + "logps/chosen": -78.99720764160156, + "logps/rejected": -51.594337463378906, + "loss": 1.6622, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5554863214492798, + "rewards/margins": 0.14468073844909668, + "rewards/rejected": 1.410805583000183, + "step": 5912 + }, + { + "epoch": 0.96, + "learning_rate": 5.567844158511049e-06, + "logits/chosen": -1.3606281280517578, + "logits/rejected": -1.3153986930847168, + "logps/chosen": -86.79363250732422, + "logps/rejected": -78.55707550048828, + "loss": 0.442, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.993317604064941, + "rewards/margins": 0.31605100631713867, + "rewards/rejected": 4.677266597747803, + "step": 5913 + }, + { + "epoch": 0.96, + "learning_rate": 5.566538388006351e-06, + "logits/chosen": -0.9070197939872742, + "logits/rejected": -0.9216752052307129, + "logps/chosen": -77.84923553466797, + "logps/rejected": -31.946367263793945, + "loss": 1.7176, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.182549238204956, + "rewards/margins": -0.32892441749572754, + "rewards/rejected": 2.5114736557006836, + "step": 5914 + }, + { + "epoch": 0.96, + "learning_rate": 5.565232578359209e-06, + "logits/chosen": -1.2036856412887573, + "logits/rejected": -1.2036856412887573, + "logps/chosen": -66.26761627197266, + "logps/rejected": -66.26761627197266, + "loss": 0.3714, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.200705051422119, + "rewards/margins": 0.0, + "rewards/rejected": 2.200705051422119, + "step": 5915 + }, + { + "epoch": 0.96, + "learning_rate": 5.5639267296598455e-06, + "logits/chosen": -1.3389391899108887, + "logits/rejected": -1.312656044960022, + "logps/chosen": -149.37545776367188, + "logps/rejected": -58.839752197265625, + "loss": 0.1207, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.8450212478637695, + "rewards/margins": 1.3920326232910156, + "rewards/rejected": 4.452988624572754, + "step": 5916 + }, + { + "epoch": 0.96, + "learning_rate": 5.56262084199848e-06, + "logits/chosen": -0.9794439673423767, + "logits/rejected": -1.1734261512756348, + "logps/chosen": -69.87786865234375, + "logps/rejected": -164.51963806152344, + "loss": 2.0495, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7367721796035767, + "rewards/margins": -2.409170627593994, + "rewards/rejected": 4.145942687988281, + "step": 5917 + }, + { + "epoch": 0.96, + "learning_rate": 5.5613149154653375e-06, + "logits/chosen": -0.7605395317077637, + "logits/rejected": -0.7522594332695007, + "logps/chosen": -54.68938446044922, + "logps/rejected": -100.40449523925781, + "loss": 0.1398, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9964149594306946, + "rewards/margins": 1.3560974597930908, + "rewards/rejected": -0.35968247056007385, + "step": 5918 + }, + { + "epoch": 0.96, + "learning_rate": 5.560008950150647e-06, + "logits/chosen": -1.186853051185608, + "logits/rejected": -1.0878008604049683, + "logps/chosen": -136.9476318359375, + "logps/rejected": -51.22352600097656, + "loss": 0.2514, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.688045024871826, + "rewards/margins": 0.8275065422058105, + "rewards/rejected": 3.8605384826660156, + "step": 5919 + }, + { + "epoch": 0.96, + "learning_rate": 5.558702946144636e-06, + "logits/chosen": -0.8450395464897156, + "logits/rejected": -0.8201104998588562, + "logps/chosen": -77.63209533691406, + "logps/rejected": -15.959550857543945, + "loss": 0.5348, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3265891969203949, + "rewards/margins": -0.457937628030777, + "rewards/rejected": 0.7845268249511719, + "step": 5920 + }, + { + "epoch": 0.96, + "learning_rate": 5.557396903537539e-06, + "logits/chosen": -1.258460283279419, + "logits/rejected": -1.3059130907058716, + "logps/chosen": -80.76209259033203, + "logps/rejected": -102.93054962158203, + "loss": 1.4835, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3924949169158936, + "rewards/margins": -2.769984483718872, + "rewards/rejected": 5.162479400634766, + "step": 5921 + }, + { + "epoch": 0.96, + "learning_rate": 5.556090822419589e-06, + "logits/chosen": -1.3284375667572021, + "logits/rejected": -1.2329734563827515, + "logps/chosen": -73.28337097167969, + "logps/rejected": -53.24824523925781, + "loss": 1.9504, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6320831775665283, + "rewards/margins": -1.288604736328125, + "rewards/rejected": 3.9206879138946533, + "step": 5922 + }, + { + "epoch": 0.96, + "learning_rate": 5.554784702881026e-06, + "logits/chosen": -1.3560194969177246, + "logits/rejected": -1.3047908544540405, + "logps/chosen": -75.32038879394531, + "logps/rejected": -41.61716079711914, + "loss": 0.5539, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9549896717071533, + "rewards/margins": 0.6655895709991455, + "rewards/rejected": 2.289400100708008, + "step": 5923 + }, + { + "epoch": 0.96, + "learning_rate": 5.553478545012088e-06, + "logits/chosen": -1.0745036602020264, + "logits/rejected": -1.0947375297546387, + "logps/chosen": -56.7608528137207, + "logps/rejected": -69.90000915527344, + "loss": 1.1594, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.101990222930908, + "rewards/margins": 0.3008037805557251, + "rewards/rejected": 1.801186442375183, + "step": 5924 + }, + { + "epoch": 0.96, + "learning_rate": 5.55217234890302e-06, + "logits/chosen": -0.8227748274803162, + "logits/rejected": -0.8035446405410767, + "logps/chosen": -58.14604949951172, + "logps/rejected": -106.04835510253906, + "loss": 2.2401, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7120102643966675, + "rewards/margins": 1.2692116498947144, + "rewards/rejected": 0.4427986145019531, + "step": 5925 + }, + { + "epoch": 0.96, + "learning_rate": 5.550866114644068e-06, + "logits/chosen": -1.225638747215271, + "logits/rejected": -1.419566035270691, + "logps/chosen": -89.03109741210938, + "logps/rejected": -36.025184631347656, + "loss": 0.7307, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2578818798065186, + "rewards/margins": 2.9876952171325684, + "rewards/rejected": 0.2701866328716278, + "step": 5926 + }, + { + "epoch": 0.96, + "learning_rate": 5.549559842325478e-06, + "logits/chosen": -0.9772109985351562, + "logits/rejected": -1.0270382165908813, + "logps/chosen": -112.36514282226562, + "logps/rejected": -124.15814971923828, + "loss": 0.272, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5661301612854004, + "rewards/margins": 0.920752763748169, + "rewards/rejected": 1.6453773975372314, + "step": 5927 + }, + { + "epoch": 0.96, + "learning_rate": 5.548253532037504e-06, + "logits/chosen": -1.1158093214035034, + "logits/rejected": -1.192991018295288, + "logps/chosen": -74.4146728515625, + "logps/rejected": -109.35820007324219, + "loss": 0.798, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5871903896331787, + "rewards/margins": 0.06725621223449707, + "rewards/rejected": 2.5199341773986816, + "step": 5928 + }, + { + "epoch": 0.96, + "learning_rate": 5.546947183870399e-06, + "logits/chosen": -1.1218574047088623, + "logits/rejected": -1.1095430850982666, + "logps/chosen": -131.3394775390625, + "logps/rejected": -76.35250091552734, + "loss": 0.1201, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1436691284179688, + "rewards/margins": 1.4149872064590454, + "rewards/rejected": 1.7286819219589233, + "step": 5929 + }, + { + "epoch": 0.96, + "learning_rate": 5.54564079791442e-06, + "logits/chosen": -1.1725168228149414, + "logits/rejected": -1.1955738067626953, + "logps/chosen": -143.49473571777344, + "logps/rejected": -40.09193420410156, + "loss": 0.5678, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0992143154144287, + "rewards/margins": 0.09038734436035156, + "rewards/rejected": 2.008826971054077, + "step": 5930 + }, + { + "epoch": 0.96, + "learning_rate": 5.544334374259823e-06, + "logits/chosen": -0.9750431180000305, + "logits/rejected": -0.9441259503364563, + "logps/chosen": -55.95641326904297, + "logps/rejected": -70.31381225585938, + "loss": 0.8447, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2276099920272827, + "rewards/margins": -0.7803550958633423, + "rewards/rejected": 2.007965087890625, + "step": 5931 + }, + { + "epoch": 0.96, + "learning_rate": 5.543027912996872e-06, + "logits/chosen": -1.3097394704818726, + "logits/rejected": -1.0956918001174927, + "logps/chosen": -73.97543334960938, + "logps/rejected": -31.612098693847656, + "loss": 0.219, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4899604320526123, + "rewards/margins": 3.245112895965576, + "rewards/rejected": -0.7551525235176086, + "step": 5932 + }, + { + "epoch": 0.96, + "learning_rate": 5.5417214142158305e-06, + "logits/chosen": -1.2209616899490356, + "logits/rejected": -1.1349129676818848, + "logps/chosen": -45.97462463378906, + "logps/rejected": -45.045135498046875, + "loss": 0.8403, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3696563243865967, + "rewards/margins": -0.9870812892913818, + "rewards/rejected": 3.3567376136779785, + "step": 5933 + }, + { + "epoch": 0.96, + "learning_rate": 5.540414878006965e-06, + "logits/chosen": -0.9386519193649292, + "logits/rejected": -0.8800901770591736, + "logps/chosen": -47.37263488769531, + "logps/rejected": -37.570919036865234, + "loss": 0.4771, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1105804443359375, + "rewards/margins": 0.17158091068267822, + "rewards/rejected": 1.9389995336532593, + "step": 5934 + }, + { + "epoch": 0.96, + "learning_rate": 5.539108304460544e-06, + "logits/chosen": -1.4417765140533447, + "logits/rejected": -1.2802131175994873, + "logps/chosen": -102.17225646972656, + "logps/rejected": -50.63597869873047, + "loss": 0.163, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.800744533538818, + "rewards/margins": 1.572965145111084, + "rewards/rejected": 3.2277793884277344, + "step": 5935 + }, + { + "epoch": 0.96, + "learning_rate": 5.537801693666841e-06, + "logits/chosen": -1.1351675987243652, + "logits/rejected": -1.0378037691116333, + "logps/chosen": -82.26113891601562, + "logps/rejected": -22.602767944335938, + "loss": 0.0457, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.297149181365967, + "rewards/margins": 2.5035691261291504, + "rewards/rejected": 1.7935799360275269, + "step": 5936 + }, + { + "epoch": 0.96, + "learning_rate": 5.536495045716129e-06, + "logits/chosen": -1.067407250404358, + "logits/rejected": -1.1474394798278809, + "logps/chosen": -61.84935760498047, + "logps/rejected": -59.95088195800781, + "loss": 0.6099, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5968620777130127, + "rewards/margins": -0.5755248069763184, + "rewards/rejected": 3.172386884689331, + "step": 5937 + }, + { + "epoch": 0.96, + "learning_rate": 5.535188360698687e-06, + "logits/chosen": -1.4639759063720703, + "logits/rejected": -1.388893961906433, + "logps/chosen": -75.09044647216797, + "logps/rejected": -27.60919189453125, + "loss": 0.5548, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4473098516464233, + "rewards/margins": 1.8018627166748047, + "rewards/rejected": -0.35455283522605896, + "step": 5938 + }, + { + "epoch": 0.96, + "learning_rate": 5.5338816387047926e-06, + "logits/chosen": -1.0641766786575317, + "logits/rejected": -1.0641766786575317, + "logps/chosen": -45.3249397277832, + "logps/rejected": -45.3249397277832, + "loss": 0.7044, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.509368658065796, + "rewards/margins": 0.0, + "rewards/rejected": 2.509368658065796, + "step": 5939 + }, + { + "epoch": 0.96, + "learning_rate": 5.532574879824729e-06, + "logits/chosen": -0.8333189487457275, + "logits/rejected": -0.821418046951294, + "logps/chosen": -75.92430877685547, + "logps/rejected": -45.08631896972656, + "loss": 0.3453, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.025011420249939, + "rewards/margins": 0.046920716762542725, + "rewards/rejected": 0.9780907034873962, + "step": 5940 + }, + { + "epoch": 0.96, + "learning_rate": 5.53126808414878e-06, + "logits/chosen": -1.3018256425857544, + "logits/rejected": -1.183341145515442, + "logps/chosen": -110.4543228149414, + "logps/rejected": -99.06975555419922, + "loss": 0.1005, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.785982608795166, + "rewards/margins": 4.635454177856445, + "rewards/rejected": 3.1505286693573, + "step": 5941 + }, + { + "epoch": 0.96, + "learning_rate": 5.529961251767233e-06, + "logits/chosen": -1.092328667640686, + "logits/rejected": -1.040123462677002, + "logps/chosen": -65.03314208984375, + "logps/rejected": -113.97073364257812, + "loss": 0.2491, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.461514472961426, + "rewards/margins": 0.49149179458618164, + "rewards/rejected": 4.970022678375244, + "step": 5942 + }, + { + "epoch": 0.96, + "learning_rate": 5.528654382770379e-06, + "logits/chosen": -1.2164686918258667, + "logits/rejected": -1.2097283601760864, + "logps/chosen": -70.77723693847656, + "logps/rejected": -76.68851470947266, + "loss": 1.6204, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4279006719589233, + "rewards/margins": -0.20962905883789062, + "rewards/rejected": 1.637529730796814, + "step": 5943 + }, + { + "epoch": 0.96, + "learning_rate": 5.527347477248508e-06, + "logits/chosen": -0.8723950386047363, + "logits/rejected": -0.9341862201690674, + "logps/chosen": -65.86146545410156, + "logps/rejected": -78.79124450683594, + "loss": 0.7001, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4819228649139404, + "rewards/margins": -1.1050736904144287, + "rewards/rejected": 3.586996555328369, + "step": 5944 + }, + { + "epoch": 0.96, + "learning_rate": 5.5260405352919175e-06, + "logits/chosen": -1.3679428100585938, + "logits/rejected": -1.2841559648513794, + "logps/chosen": -146.1544189453125, + "logps/rejected": -62.469356536865234, + "loss": 0.401, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.153912544250488, + "rewards/margins": 3.968810796737671, + "rewards/rejected": 1.1851017475128174, + "step": 5945 + }, + { + "epoch": 0.97, + "learning_rate": 5.524733556990904e-06, + "logits/chosen": -0.9703583121299744, + "logits/rejected": -0.9929102659225464, + "logps/chosen": -74.41728973388672, + "logps/rejected": -130.6270294189453, + "loss": 1.3444, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.82524573802948, + "rewards/margins": 0.5314011573791504, + "rewards/rejected": 1.2938445806503296, + "step": 5946 + }, + { + "epoch": 0.97, + "learning_rate": 5.523426542435766e-06, + "logits/chosen": -1.3700318336486816, + "logits/rejected": -1.3632595539093018, + "logps/chosen": -237.51593017578125, + "logps/rejected": -67.01456451416016, + "loss": 0.6842, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5302094221115112, + "rewards/margins": -0.156829833984375, + "rewards/rejected": 1.6870392560958862, + "step": 5947 + }, + { + "epoch": 0.97, + "learning_rate": 5.522119491716806e-06, + "logits/chosen": -0.982105016708374, + "logits/rejected": -0.9840127825737, + "logps/chosen": -40.811859130859375, + "logps/rejected": -122.24706268310547, + "loss": 0.4263, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.081313371658325, + "rewards/margins": -0.16939997673034668, + "rewards/rejected": 2.250713348388672, + "step": 5948 + }, + { + "epoch": 0.97, + "learning_rate": 5.520812404924329e-06, + "logits/chosen": -1.0644841194152832, + "logits/rejected": -1.1121152639389038, + "logps/chosen": -141.80752563476562, + "logps/rejected": -87.64763641357422, + "loss": 0.5096, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6655030250549316, + "rewards/margins": -0.26712632179260254, + "rewards/rejected": 2.932629346847534, + "step": 5949 + }, + { + "epoch": 0.97, + "learning_rate": 5.519505282148644e-06, + "logits/chosen": -1.3256789445877075, + "logits/rejected": -1.3256789445877075, + "logps/chosen": -45.10223388671875, + "logps/rejected": -45.10223388671875, + "loss": 0.3478, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8065025806427, + "rewards/margins": 0.0, + "rewards/rejected": 2.8065025806427, + "step": 5950 + }, + { + "epoch": 0.97, + "learning_rate": 5.518198123480059e-06, + "logits/chosen": -0.768071174621582, + "logits/rejected": -0.766343891620636, + "logps/chosen": -5.328221321105957, + "logps/rejected": -1.1625925302505493, + "loss": 1.1079, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.025687361136078835, + "rewards/margins": -0.1320396363735199, + "rewards/rejected": 0.10635226964950562, + "step": 5951 + }, + { + "epoch": 0.97, + "learning_rate": 5.516890929008887e-06, + "logits/chosen": -1.012661337852478, + "logits/rejected": -1.010057806968689, + "logps/chosen": -84.83525085449219, + "logps/rejected": -79.6612319946289, + "loss": 0.9087, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8361930847167969, + "rewards/margins": -0.7462601661682129, + "rewards/rejected": 2.5824532508850098, + "step": 5952 + }, + { + "epoch": 0.97, + "learning_rate": 5.515583698825443e-06, + "logits/chosen": -0.8710480332374573, + "logits/rejected": -0.8474709391593933, + "logps/chosen": -95.65925598144531, + "logps/rejected": -61.43055725097656, + "loss": 0.2392, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7854843139648438, + "rewards/margins": 0.6841316223144531, + "rewards/rejected": 1.1013526916503906, + "step": 5953 + }, + { + "epoch": 0.97, + "learning_rate": 5.514276433020044e-06, + "logits/chosen": -0.6912994384765625, + "logits/rejected": -0.6447054743766785, + "logps/chosen": -38.67251968383789, + "logps/rejected": -66.19813537597656, + "loss": 0.1212, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8147106170654297, + "rewards/margins": 1.4086947441101074, + "rewards/rejected": 2.4060158729553223, + "step": 5954 + }, + { + "epoch": 0.97, + "learning_rate": 5.512969131683008e-06, + "logits/chosen": -0.9785550832748413, + "logits/rejected": -0.9896940588951111, + "logps/chosen": -4.326839923858643, + "logps/rejected": -0.6647501587867737, + "loss": 0.4356, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.13879084587097168, + "rewards/margins": -0.13890504837036133, + "rewards/rejected": 0.277695894241333, + "step": 5955 + }, + { + "epoch": 0.97, + "learning_rate": 5.511661794904659e-06, + "logits/chosen": -1.09529447555542, + "logits/rejected": -1.1553964614868164, + "logps/chosen": -187.82411193847656, + "logps/rejected": -131.48529052734375, + "loss": 0.2859, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.361036777496338, + "rewards/margins": 0.39638209342956543, + "rewards/rejected": 2.9646546840667725, + "step": 5956 + }, + { + "epoch": 0.97, + "learning_rate": 5.510354422775324e-06, + "logits/chosen": -1.421220064163208, + "logits/rejected": -1.4168208837509155, + "logps/chosen": -109.43997955322266, + "logps/rejected": -90.84031677246094, + "loss": 1.9448, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4301795959472656, + "rewards/margins": -1.1710305213928223, + "rewards/rejected": 2.601210117340088, + "step": 5957 + }, + { + "epoch": 0.97, + "learning_rate": 5.509047015385325e-06, + "logits/chosen": -1.0427618026733398, + "logits/rejected": -1.019545316696167, + "logps/chosen": -57.976966857910156, + "logps/rejected": -49.80662536621094, + "loss": 1.495, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3758506774902344, + "rewards/margins": -2.1699540615081787, + "rewards/rejected": 3.545804738998413, + "step": 5958 + }, + { + "epoch": 0.97, + "learning_rate": 5.507739572824995e-06, + "logits/chosen": -1.277095913887024, + "logits/rejected": -1.2509243488311768, + "logps/chosen": -88.13520812988281, + "logps/rejected": -91.42008972167969, + "loss": 0.596, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.614617943763733, + "rewards/margins": 0.13714599609375, + "rewards/rejected": 1.477471947669983, + "step": 5959 + }, + { + "epoch": 0.97, + "learning_rate": 5.506432095184664e-06, + "logits/chosen": -1.482454538345337, + "logits/rejected": -1.4433351755142212, + "logps/chosen": -68.02129364013672, + "logps/rejected": -76.86652374267578, + "loss": 0.5752, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.9623870849609375, + "rewards/margins": -0.7622122764587402, + "rewards/rejected": 4.724599361419678, + "step": 5960 + }, + { + "epoch": 0.97, + "learning_rate": 5.505124582554667e-06, + "logits/chosen": -0.9633169770240784, + "logits/rejected": -1.0188112258911133, + "logps/chosen": -66.06317138671875, + "logps/rejected": -115.00432586669922, + "loss": 0.7097, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.959893822669983, + "rewards/margins": -0.15664136409759521, + "rewards/rejected": 2.116535186767578, + "step": 5961 + }, + { + "epoch": 0.97, + "learning_rate": 5.503817035025341e-06, + "logits/chosen": -1.2687970399856567, + "logits/rejected": -1.4544731378555298, + "logps/chosen": -106.37088012695312, + "logps/rejected": -34.809288024902344, + "loss": 0.2845, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7130355834960938, + "rewards/margins": 2.2990424633026123, + "rewards/rejected": 0.41399309039115906, + "step": 5962 + }, + { + "epoch": 0.97, + "learning_rate": 5.502509452687026e-06, + "logits/chosen": -0.7190617918968201, + "logits/rejected": -1.0568422079086304, + "logps/chosen": -61.564945220947266, + "logps/rejected": -58.267303466796875, + "loss": 0.5147, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.191126585006714, + "rewards/margins": 0.04861259460449219, + "rewards/rejected": 2.1425139904022217, + "step": 5963 + }, + { + "epoch": 0.97, + "learning_rate": 5.501201835630062e-06, + "logits/chosen": -1.3383435010910034, + "logits/rejected": -1.392716884613037, + "logps/chosen": -96.81883239746094, + "logps/rejected": -80.59423828125, + "loss": 0.3294, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7872047424316406, + "rewards/margins": 0.3420677185058594, + "rewards/rejected": 1.4451370239257812, + "step": 5964 + }, + { + "epoch": 0.97, + "learning_rate": 5.4998941839447935e-06, + "logits/chosen": -1.3130372762680054, + "logits/rejected": -1.2841410636901855, + "logps/chosen": -108.64292907714844, + "logps/rejected": -41.433502197265625, + "loss": 0.6752, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6109650135040283, + "rewards/margins": -0.9054412841796875, + "rewards/rejected": 2.516406297683716, + "step": 5965 + }, + { + "epoch": 0.97, + "learning_rate": 5.498586497721568e-06, + "logits/chosen": -1.0881412029266357, + "logits/rejected": -1.1602245569229126, + "logps/chosen": -95.81048583984375, + "logps/rejected": -100.07173156738281, + "loss": 0.4629, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.271094560623169, + "rewards/margins": -0.3992295265197754, + "rewards/rejected": 2.6703240871429443, + "step": 5966 + }, + { + "epoch": 0.97, + "learning_rate": 5.497278777050732e-06, + "logits/chosen": -1.0084636211395264, + "logits/rejected": -1.003039836883545, + "logps/chosen": -18.746259689331055, + "logps/rejected": -34.46778106689453, + "loss": 0.2808, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37723466753959656, + "rewards/margins": 0.3876190483570099, + "rewards/rejected": -0.01038436871021986, + "step": 5967 + }, + { + "epoch": 0.97, + "learning_rate": 5.495971022022638e-06, + "logits/chosen": -0.9505341053009033, + "logits/rejected": -0.933872640132904, + "logps/chosen": -43.47161865234375, + "logps/rejected": -43.84152603149414, + "loss": 0.3558, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8020260334014893, + "rewards/margins": 0.3108714818954468, + "rewards/rejected": 1.4911545515060425, + "step": 5968 + }, + { + "epoch": 0.97, + "learning_rate": 5.494663232727639e-06, + "logits/chosen": -0.9895955324172974, + "logits/rejected": -1.036380648612976, + "logps/chosen": -64.23432922363281, + "logps/rejected": -65.86918640136719, + "loss": 0.9104, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8604980707168579, + "rewards/margins": -0.5283203125, + "rewards/rejected": 1.388818383216858, + "step": 5969 + }, + { + "epoch": 0.97, + "learning_rate": 5.493355409256091e-06, + "logits/chosen": -1.176964282989502, + "logits/rejected": -1.1439266204833984, + "logps/chosen": -48.12853240966797, + "logps/rejected": -55.44877624511719, + "loss": 1.8102, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2458428144454956, + "rewards/margins": -0.5425170660018921, + "rewards/rejected": 1.7883598804473877, + "step": 5970 + }, + { + "epoch": 0.97, + "learning_rate": 5.4920475516983544e-06, + "logits/chosen": -0.6982525587081909, + "logits/rejected": -0.7019922137260437, + "logps/chosen": -104.0809326171875, + "logps/rejected": -46.5018196105957, + "loss": 0.9029, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0867745876312256, + "rewards/margins": -0.7137141227722168, + "rewards/rejected": 2.8004887104034424, + "step": 5971 + }, + { + "epoch": 0.97, + "learning_rate": 5.490739660144786e-06, + "logits/chosen": -1.2692911624908447, + "logits/rejected": -1.3369085788726807, + "logps/chosen": -82.74624633789062, + "logps/rejected": -100.92796325683594, + "loss": 1.5687, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8914124965667725, + "rewards/margins": -3.078474283218384, + "rewards/rejected": 5.969886779785156, + "step": 5972 + }, + { + "epoch": 0.97, + "learning_rate": 5.489431734685751e-06, + "logits/chosen": -1.0225297212600708, + "logits/rejected": -1.0853053331375122, + "logps/chosen": -70.1325454711914, + "logps/rejected": -93.19148254394531, + "loss": 0.5455, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6721482276916504, + "rewards/margins": -0.48578333854675293, + "rewards/rejected": 3.1579315662384033, + "step": 5973 + }, + { + "epoch": 0.97, + "learning_rate": 5.4881237754116135e-06, + "logits/chosen": -0.9644213914871216, + "logits/rejected": -0.958611011505127, + "logps/chosen": -25.031322479248047, + "logps/rejected": -51.6622314453125, + "loss": 0.5159, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9195820093154907, + "rewards/margins": 0.31577563285827637, + "rewards/rejected": 1.6038063764572144, + "step": 5974 + }, + { + "epoch": 0.97, + "learning_rate": 5.486815782412742e-06, + "logits/chosen": -1.307169795036316, + "logits/rejected": -1.3103569746017456, + "logps/chosen": -63.36250686645508, + "logps/rejected": -76.7121353149414, + "loss": 0.3398, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5370242595672607, + "rewards/margins": 0.3512873649597168, + "rewards/rejected": 2.185736894607544, + "step": 5975 + }, + { + "epoch": 0.97, + "learning_rate": 5.485507755779506e-06, + "logits/chosen": -1.2575868368148804, + "logits/rejected": -1.0139139890670776, + "logps/chosen": -115.44779968261719, + "logps/rejected": -49.395713806152344, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.152227878570557, + "rewards/margins": 6.167992115020752, + "rewards/rejected": 0.9842357635498047, + "step": 5976 + }, + { + "epoch": 0.97, + "learning_rate": 5.484199695602279e-06, + "logits/chosen": -1.3795664310455322, + "logits/rejected": -1.2879399061203003, + "logps/chosen": -92.90412902832031, + "logps/rejected": -120.88252258300781, + "loss": 0.1555, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.722550868988037, + "rewards/margins": 1.1844542026519775, + "rewards/rejected": 3.5380966663360596, + "step": 5977 + }, + { + "epoch": 0.97, + "learning_rate": 5.482891601971434e-06, + "logits/chosen": -1.0733143091201782, + "logits/rejected": -1.05094313621521, + "logps/chosen": -245.64022827148438, + "logps/rejected": -90.56771850585938, + "loss": 0.444, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.627636909484863, + "rewards/margins": 2.994826555252075, + "rewards/rejected": 3.632810354232788, + "step": 5978 + }, + { + "epoch": 0.97, + "learning_rate": 5.481583474977349e-06, + "logits/chosen": -1.5961493253707886, + "logits/rejected": -1.5588980913162231, + "logps/chosen": -81.40652465820312, + "logps/rejected": -83.23468017578125, + "loss": 0.1468, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.918870687484741, + "rewards/margins": 1.3983758687973022, + "rewards/rejected": 1.520494818687439, + "step": 5979 + }, + { + "epoch": 0.97, + "learning_rate": 5.480275314710401e-06, + "logits/chosen": -1.1265970468521118, + "logits/rejected": -1.1411361694335938, + "logps/chosen": -45.63969039916992, + "logps/rejected": -90.78498840332031, + "loss": 0.3458, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1413605213165283, + "rewards/margins": 0.2869507074356079, + "rewards/rejected": 0.8544098138809204, + "step": 5980 + }, + { + "epoch": 0.97, + "learning_rate": 5.478967121260975e-06, + "logits/chosen": -1.6824772357940674, + "logits/rejected": -1.7487397193908691, + "logps/chosen": -121.52626037597656, + "logps/rejected": -146.75949096679688, + "loss": 1.1597, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3278579711914062, + "rewards/margins": -2.1835570335388184, + "rewards/rejected": 4.511415004730225, + "step": 5981 + }, + { + "epoch": 0.97, + "learning_rate": 5.477658894719453e-06, + "logits/chosen": -1.2051036357879639, + "logits/rejected": -1.24346125125885, + "logps/chosen": -83.3016357421875, + "logps/rejected": -123.86860656738281, + "loss": 2.1762, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5662872791290283, + "rewards/margins": -0.8308517932891846, + "rewards/rejected": 2.397139072418213, + "step": 5982 + }, + { + "epoch": 0.97, + "learning_rate": 5.47635063517622e-06, + "logits/chosen": -0.7571152448654175, + "logits/rejected": -0.8080089688301086, + "logps/chosen": -114.42869567871094, + "logps/rejected": -81.51002502441406, + "loss": 0.9871, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.34494948387146, + "rewards/margins": -0.8471190929412842, + "rewards/rejected": 3.192068576812744, + "step": 5983 + }, + { + "epoch": 0.97, + "learning_rate": 5.475042342721666e-06, + "logits/chosen": -1.3863584995269775, + "logits/rejected": -1.3009991645812988, + "logps/chosen": -145.2691650390625, + "logps/rejected": -123.54936218261719, + "loss": 0.728, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.391607761383057, + "rewards/margins": -1.1193861961364746, + "rewards/rejected": 7.510993957519531, + "step": 5984 + }, + { + "epoch": 0.97, + "learning_rate": 5.47373401744618e-06, + "logits/chosen": -1.2861748933792114, + "logits/rejected": -1.2957669496536255, + "logps/chosen": -58.72518539428711, + "logps/rejected": -99.53765869140625, + "loss": 3.2939, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.00298810005188, + "rewards/margins": -6.578962326049805, + "rewards/rejected": 9.581950187683105, + "step": 5985 + }, + { + "epoch": 0.97, + "learning_rate": 5.472425659440157e-06, + "logits/chosen": -1.4626673460006714, + "logits/rejected": -1.461284875869751, + "logps/chosen": -78.10134887695312, + "logps/rejected": -71.19607543945312, + "loss": 1.4797, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8638709783554077, + "rewards/margins": -0.9988366365432739, + "rewards/rejected": 2.8627076148986816, + "step": 5986 + }, + { + "epoch": 0.97, + "learning_rate": 5.4711172687939904e-06, + "logits/chosen": -0.9307325482368469, + "logits/rejected": -0.9018092751502991, + "logps/chosen": -53.995140075683594, + "logps/rejected": -2.114866256713867, + "loss": 0.899, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5050186514854431, + "rewards/margins": -0.05435103178024292, + "rewards/rejected": 0.559369683265686, + "step": 5987 + }, + { + "epoch": 0.97, + "learning_rate": 5.469808845598079e-06, + "logits/chosen": -1.5193697214126587, + "logits/rejected": -1.5672309398651123, + "logps/chosen": -146.8448486328125, + "logps/rejected": -109.4608154296875, + "loss": 1.589, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.117488384246826, + "rewards/margins": -1.4003047943115234, + "rewards/rejected": 6.51779317855835, + "step": 5988 + }, + { + "epoch": 0.97, + "learning_rate": 5.468500389942821e-06, + "logits/chosen": -0.4815688133239746, + "logits/rejected": -0.4815688133239746, + "logps/chosen": -8.250916481018066, + "logps/rejected": -8.250916481018066, + "loss": 0.7215, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4292352795600891, + "rewards/margins": 0.0, + "rewards/rejected": 0.4292352795600891, + "step": 5989 + }, + { + "epoch": 0.97, + "learning_rate": 5.46719190191862e-06, + "logits/chosen": -1.1799991130828857, + "logits/rejected": -1.1315927505493164, + "logps/chosen": -62.16987609863281, + "logps/rejected": -63.12938690185547, + "loss": 0.455, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6217682361602783, + "rewards/margins": -0.36313700675964355, + "rewards/rejected": 2.984905242919922, + "step": 5990 + }, + { + "epoch": 0.97, + "learning_rate": 5.465883381615877e-06, + "logits/chosen": -1.085959553718567, + "logits/rejected": -0.9890903830528259, + "logps/chosen": -37.554134368896484, + "logps/rejected": -10.226423263549805, + "loss": 0.1399, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7418549060821533, + "rewards/margins": 1.144045352935791, + "rewards/rejected": 0.5978096127510071, + "step": 5991 + }, + { + "epoch": 0.97, + "learning_rate": 5.464574829125002e-06, + "logits/chosen": -1.2664508819580078, + "logits/rejected": -1.139518141746521, + "logps/chosen": -108.97747802734375, + "logps/rejected": -57.64927673339844, + "loss": 0.4953, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.397953987121582, + "rewards/margins": 2.4242501258850098, + "rewards/rejected": 2.9737038612365723, + "step": 5992 + }, + { + "epoch": 0.97, + "learning_rate": 5.463266244536403e-06, + "logits/chosen": -1.1073901653289795, + "logits/rejected": -1.0950721502304077, + "logps/chosen": -96.16706848144531, + "logps/rejected": -95.20964813232422, + "loss": 1.4716, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6997498273849487, + "rewards/margins": -2.5531439781188965, + "rewards/rejected": 4.252893924713135, + "step": 5993 + }, + { + "epoch": 0.97, + "learning_rate": 5.461957627940489e-06, + "logits/chosen": -1.0035372972488403, + "logits/rejected": -0.9002406597137451, + "logps/chosen": -32.067115783691406, + "logps/rejected": -28.779277801513672, + "loss": 0.2744, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3370964527130127, + "rewards/margins": 0.32153284549713135, + "rewards/rejected": 1.0155636072158813, + "step": 5994 + }, + { + "epoch": 0.97, + "learning_rate": 5.460648979427674e-06, + "logits/chosen": -1.1882517337799072, + "logits/rejected": -1.1906251907348633, + "logps/chosen": -104.61805725097656, + "logps/rejected": -87.68882751464844, + "loss": 0.4018, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6968986988067627, + "rewards/margins": 0.10357367992401123, + "rewards/rejected": 1.5933250188827515, + "step": 5995 + }, + { + "epoch": 0.97, + "learning_rate": 5.459340299088374e-06, + "logits/chosen": -1.084890604019165, + "logits/rejected": -1.1193214654922485, + "logps/chosen": -70.99626159667969, + "logps/rejected": -63.71300506591797, + "loss": 0.3501, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.067584276199341, + "rewards/margins": 0.016843318939208984, + "rewards/rejected": 2.050740957260132, + "step": 5996 + }, + { + "epoch": 0.97, + "learning_rate": 5.458031587013005e-06, + "logits/chosen": -0.9346526265144348, + "logits/rejected": -0.9337185025215149, + "logps/chosen": -2.878744602203369, + "logps/rejected": -1.635778546333313, + "loss": 0.4873, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35140058398246765, + "rewards/margins": 0.10037770867347717, + "rewards/rejected": 0.2510228753089905, + "step": 5997 + }, + { + "epoch": 0.97, + "learning_rate": 5.456722843291987e-06, + "logits/chosen": -1.111636757850647, + "logits/rejected": -1.0425313711166382, + "logps/chosen": -111.79132080078125, + "logps/rejected": -57.005863189697266, + "loss": 0.352, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7353241443634033, + "rewards/margins": 0.7272571325302124, + "rewards/rejected": 1.008067011833191, + "step": 5998 + }, + { + "epoch": 0.97, + "learning_rate": 5.455414068015743e-06, + "logits/chosen": -1.5348376035690308, + "logits/rejected": -1.5352768898010254, + "logps/chosen": -71.4705810546875, + "logps/rejected": -253.88088989257812, + "loss": 1.9056, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4561432600021362, + "rewards/margins": -3.2525787353515625, + "rewards/rejected": 4.708722114562988, + "step": 5999 + }, + { + "epoch": 0.97, + "learning_rate": 5.454105261274696e-06, + "logits/chosen": -1.0890289545059204, + "logits/rejected": -1.1246230602264404, + "logps/chosen": -48.31333541870117, + "logps/rejected": -54.92894744873047, + "loss": 0.9131, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.603200912475586, + "rewards/margins": -0.7550473213195801, + "rewards/rejected": 2.358248233795166, + "step": 6000 + }, + { + "epoch": 0.97, + "learning_rate": 5.452796423159273e-06, + "logits/chosen": -1.1588795185089111, + "logits/rejected": -1.1500316858291626, + "logps/chosen": -48.328880310058594, + "logps/rejected": -68.88285827636719, + "loss": 1.7217, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4119980335235596, + "rewards/margins": 0.5111267566680908, + "rewards/rejected": 1.9008712768554688, + "step": 6001 + }, + { + "epoch": 0.97, + "learning_rate": 5.451487553759899e-06, + "logits/chosen": -1.084722876548767, + "logits/rejected": -1.0544283390045166, + "logps/chosen": -75.84197235107422, + "logps/rejected": -24.529258728027344, + "loss": 2.1316, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8563377857208252, + "rewards/margins": -2.0415191650390625, + "rewards/rejected": 3.8978569507598877, + "step": 6002 + }, + { + "epoch": 0.97, + "learning_rate": 5.450178653167009e-06, + "logits/chosen": -1.26856529712677, + "logits/rejected": -1.0762605667114258, + "logps/chosen": -169.704345703125, + "logps/rejected": -205.48248291015625, + "loss": 0.5293, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.59651517868042, + "rewards/margins": -0.5974974632263184, + "rewards/rejected": 6.194012641906738, + "step": 6003 + }, + { + "epoch": 0.97, + "learning_rate": 5.448869721471033e-06, + "logits/chosen": -1.3656712770462036, + "logits/rejected": -1.2244430780410767, + "logps/chosen": -139.59786987304688, + "logps/rejected": -34.001346588134766, + "loss": 0.0521, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.696359634399414, + "rewards/margins": 4.9177069664001465, + "rewards/rejected": 3.7786526679992676, + "step": 6004 + }, + { + "epoch": 0.97, + "learning_rate": 5.447560758762405e-06, + "logits/chosen": -1.0855050086975098, + "logits/rejected": -1.0738557577133179, + "logps/chosen": -49.96575164794922, + "logps/rejected": -42.98890686035156, + "loss": 1.0321, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0653282403945923, + "rewards/margins": 0.0862838625907898, + "rewards/rejected": 0.9790443778038025, + "step": 6005 + }, + { + "epoch": 0.97, + "learning_rate": 5.446251765131566e-06, + "logits/chosen": -1.4140961170196533, + "logits/rejected": -1.358017086982727, + "logps/chosen": -80.7949447631836, + "logps/rejected": -36.711326599121094, + "loss": 0.0777, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7542245388031006, + "rewards/margins": 3.1580371856689453, + "rewards/rejected": 0.5961872339248657, + "step": 6006 + }, + { + "epoch": 0.98, + "learning_rate": 5.444942740668952e-06, + "logits/chosen": -1.0801314115524292, + "logits/rejected": -1.0888484716415405, + "logps/chosen": -80.19588470458984, + "logps/rejected": -112.44944763183594, + "loss": 1.1939, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9891074895858765, + "rewards/margins": -1.4793449640274048, + "rewards/rejected": 3.4684524536132812, + "step": 6007 + }, + { + "epoch": 0.98, + "learning_rate": 5.443633685465004e-06, + "logits/chosen": -1.1406357288360596, + "logits/rejected": -1.090683937072754, + "logps/chosen": -93.38839721679688, + "logps/rejected": -86.21216583251953, + "loss": 1.2301, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0709335803985596, + "rewards/margins": -2.34904408454895, + "rewards/rejected": 3.4199776649475098, + "step": 6008 + }, + { + "epoch": 0.98, + "learning_rate": 5.442324599610166e-06, + "logits/chosen": -1.1323355436325073, + "logits/rejected": -1.1091506481170654, + "logps/chosen": -103.1422119140625, + "logps/rejected": -55.956825256347656, + "loss": 1.0052, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.350637823343277, + "rewards/margins": -0.2661670744419098, + "rewards/rejected": 0.6168048977851868, + "step": 6009 + }, + { + "epoch": 0.98, + "learning_rate": 5.441015483194883e-06, + "logits/chosen": -0.9550777077674866, + "logits/rejected": -0.9654729962348938, + "logps/chosen": -56.09346008300781, + "logps/rejected": -46.23542022705078, + "loss": 0.2995, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5076065063476562, + "rewards/margins": 0.20671772956848145, + "rewards/rejected": 2.300888776779175, + "step": 6010 + }, + { + "epoch": 0.98, + "learning_rate": 5.439706336309605e-06, + "logits/chosen": -0.9397514462471008, + "logits/rejected": -0.9397514462471008, + "logps/chosen": -68.21315002441406, + "logps/rejected": -68.21315002441406, + "loss": 0.6134, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.5305681228637695, + "rewards/margins": 0.0, + "rewards/rejected": 4.5305681228637695, + "step": 6011 + }, + { + "epoch": 0.98, + "learning_rate": 5.438397159044778e-06, + "logits/chosen": -1.2547880411148071, + "logits/rejected": -1.0961071252822876, + "logps/chosen": -62.16817092895508, + "logps/rejected": -70.11314392089844, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.775629758834839, + "rewards/margins": 5.073148727416992, + "rewards/rejected": -1.2975189685821533, + "step": 6012 + }, + { + "epoch": 0.98, + "learning_rate": 5.437087951490856e-06, + "logits/chosen": -0.9597587585449219, + "logits/rejected": -0.9379720687866211, + "logps/chosen": -40.928218841552734, + "logps/rejected": -63.11415481567383, + "loss": 0.9634, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8353115320205688, + "rewards/margins": -0.8893142938613892, + "rewards/rejected": 2.724625825881958, + "step": 6013 + }, + { + "epoch": 0.98, + "learning_rate": 5.435778713738292e-06, + "logits/chosen": -1.1782662868499756, + "logits/rejected": -1.0804027318954468, + "logps/chosen": -95.57015991210938, + "logps/rejected": -61.80662155151367, + "loss": 0.0199, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.069427490234375, + "rewards/margins": 4.089416027069092, + "rewards/rejected": 1.9800113439559937, + "step": 6014 + }, + { + "epoch": 0.98, + "learning_rate": 5.4344694458775425e-06, + "logits/chosen": -1.4125169515609741, + "logits/rejected": -1.2686879634857178, + "logps/chosen": -59.67581558227539, + "logps/rejected": -15.407018661499023, + "loss": 0.0402, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.296655654907227, + "rewards/margins": 3.550266742706299, + "rewards/rejected": 0.7463890314102173, + "step": 6015 + }, + { + "epoch": 0.98, + "learning_rate": 5.4331601479990655e-06, + "logits/chosen": -1.0828689336776733, + "logits/rejected": -1.0485631227493286, + "logps/chosen": -188.04086303710938, + "logps/rejected": -75.70738983154297, + "loss": 0.0263, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.320120334625244, + "rewards/margins": 4.205483436584473, + "rewards/rejected": 1.114637017250061, + "step": 6016 + }, + { + "epoch": 0.98, + "learning_rate": 5.43185082019332e-06, + "logits/chosen": -1.4995367527008057, + "logits/rejected": -1.4099647998809814, + "logps/chosen": -95.40194702148438, + "logps/rejected": -26.793123245239258, + "loss": 1.3839, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5072288513183594, + "rewards/margins": 1.8837130069732666, + "rewards/rejected": 0.6235159039497375, + "step": 6017 + }, + { + "epoch": 0.98, + "learning_rate": 5.43054146255077e-06, + "logits/chosen": -1.0059453248977661, + "logits/rejected": -0.9381141066551208, + "logps/chosen": -72.67593383789062, + "logps/rejected": -22.90402603149414, + "loss": 0.8751, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1330153942108154, + "rewards/margins": -0.19693565368652344, + "rewards/rejected": 2.329951047897339, + "step": 6018 + }, + { + "epoch": 0.98, + "learning_rate": 5.429232075161877e-06, + "logits/chosen": -1.1905919313430786, + "logits/rejected": -1.117850422859192, + "logps/chosen": -47.361488342285156, + "logps/rejected": -52.73964309692383, + "loss": 0.3263, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.119683027267456, + "rewards/margins": 0.7128733396530151, + "rewards/rejected": 1.406809687614441, + "step": 6019 + }, + { + "epoch": 0.98, + "learning_rate": 5.427922658117111e-06, + "logits/chosen": -1.073438048362732, + "logits/rejected": -1.136243224143982, + "logps/chosen": -54.74801254272461, + "logps/rejected": -77.42010498046875, + "loss": 1.2412, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2024158239364624, + "rewards/margins": -1.972082257270813, + "rewards/rejected": 3.1744980812072754, + "step": 6020 + }, + { + "epoch": 0.98, + "learning_rate": 5.426613211506938e-06, + "logits/chosen": -1.3337832689285278, + "logits/rejected": -1.0266400575637817, + "logps/chosen": -188.5526580810547, + "logps/rejected": -78.77823638916016, + "loss": 1.608, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.692149639129639, + "rewards/margins": 0.6102762222290039, + "rewards/rejected": 4.081873416900635, + "step": 6021 + }, + { + "epoch": 0.98, + "learning_rate": 5.425303735421828e-06, + "logits/chosen": -1.0490666627883911, + "logits/rejected": -1.0447261333465576, + "logps/chosen": -133.0172882080078, + "logps/rejected": -85.37959289550781, + "loss": 0.9662, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.622216820716858, + "rewards/margins": -1.633508324623108, + "rewards/rejected": 3.255725145339966, + "step": 6022 + }, + { + "epoch": 0.98, + "learning_rate": 5.423994229952255e-06, + "logits/chosen": -1.5182377099990845, + "logits/rejected": -1.5337541103363037, + "logps/chosen": -65.77679443359375, + "logps/rejected": -88.51216125488281, + "loss": 0.5405, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1144301891326904, + "rewards/margins": -0.5223374366760254, + "rewards/rejected": 3.636767625808716, + "step": 6023 + }, + { + "epoch": 0.98, + "learning_rate": 5.4226846951886925e-06, + "logits/chosen": -1.2551262378692627, + "logits/rejected": -1.1735118627548218, + "logps/chosen": -268.09283447265625, + "logps/rejected": -68.90751647949219, + "loss": 1.4741, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.558135986328125, + "rewards/margins": 0.672119140625, + "rewards/rejected": 1.886016845703125, + "step": 6024 + }, + { + "epoch": 0.98, + "learning_rate": 5.421375131221617e-06, + "logits/chosen": -1.0272364616394043, + "logits/rejected": -1.0964388847351074, + "logps/chosen": -95.08367156982422, + "logps/rejected": -124.26786804199219, + "loss": 1.3073, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.752191185951233, + "rewards/margins": -2.1093292236328125, + "rewards/rejected": 3.861520528793335, + "step": 6025 + }, + { + "epoch": 0.98, + "learning_rate": 5.420065538141507e-06, + "logits/chosen": -0.9906818866729736, + "logits/rejected": -0.9906818866729736, + "logps/chosen": -85.57769775390625, + "logps/rejected": -85.57769775390625, + "loss": 0.3575, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0633316040039062, + "rewards/margins": 0.0, + "rewards/rejected": 3.0633316040039062, + "step": 6026 + }, + { + "epoch": 0.98, + "learning_rate": 5.418755916038843e-06, + "logits/chosen": -0.92006516456604, + "logits/rejected": -0.9544532895088196, + "logps/chosen": -65.94612884521484, + "logps/rejected": -91.72816467285156, + "loss": 0.5721, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2259438037872314, + "rewards/margins": 0.7655105590820312, + "rewards/rejected": 1.4604332447052002, + "step": 6027 + }, + { + "epoch": 0.98, + "learning_rate": 5.417446265004107e-06, + "logits/chosen": -0.9212411642074585, + "logits/rejected": -0.9221657514572144, + "logps/chosen": -4.235523700714111, + "logps/rejected": -3.3075249195098877, + "loss": 0.3617, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.29688674211502075, + "rewards/margins": -0.0520971417427063, + "rewards/rejected": 0.34898388385772705, + "step": 6028 + }, + { + "epoch": 0.98, + "learning_rate": 5.416136585127785e-06, + "logits/chosen": -1.4243398904800415, + "logits/rejected": -1.428491473197937, + "logps/chosen": -75.08309173583984, + "logps/rejected": -48.04096603393555, + "loss": 0.1648, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.393531084060669, + "rewards/margins": 1.735740303993225, + "rewards/rejected": 0.6577907800674438, + "step": 6029 + }, + { + "epoch": 0.98, + "learning_rate": 5.414826876500361e-06, + "logits/chosen": -1.0863289833068848, + "logits/rejected": -1.030621886253357, + "logps/chosen": -59.2733268737793, + "logps/rejected": -66.95120239257812, + "loss": 0.5523, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.07891583442688, + "rewards/margins": -0.6113543510437012, + "rewards/rejected": 2.690270185470581, + "step": 6030 + }, + { + "epoch": 0.98, + "learning_rate": 5.413517139212326e-06, + "logits/chosen": -1.3671685457229614, + "logits/rejected": -1.1857268810272217, + "logps/chosen": -75.34398651123047, + "logps/rejected": -43.19386291503906, + "loss": 0.7774, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1307199001312256, + "rewards/margins": -0.07067489624023438, + "rewards/rejected": 2.20139479637146, + "step": 6031 + }, + { + "epoch": 0.98, + "learning_rate": 5.412207373354169e-06, + "logits/chosen": -1.2741116285324097, + "logits/rejected": -1.1151925325393677, + "logps/chosen": -69.61892700195312, + "logps/rejected": -20.667757034301758, + "loss": 1.9452, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.974183082580566, + "rewards/margins": 3.8680295944213867, + "rewards/rejected": 1.1061534881591797, + "step": 6032 + }, + { + "epoch": 0.98, + "learning_rate": 5.410897579016383e-06, + "logits/chosen": -1.140165090560913, + "logits/rejected": -1.1721988916397095, + "logps/chosen": -46.946502685546875, + "logps/rejected": -51.12645721435547, + "loss": 0.4276, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8208954334259033, + "rewards/margins": -0.29927587509155273, + "rewards/rejected": 3.120171308517456, + "step": 6033 + }, + { + "epoch": 0.98, + "learning_rate": 5.409587756289462e-06, + "logits/chosen": -0.8257375359535217, + "logits/rejected": -0.8688276410102844, + "logps/chosen": -95.19534301757812, + "logps/rejected": -48.03516387939453, + "loss": 1.2843, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4386856257915497, + "rewards/margins": -2.3486907482147217, + "rewards/rejected": 2.7873764038085938, + "step": 6034 + }, + { + "epoch": 0.98, + "learning_rate": 5.4082779052639034e-06, + "logits/chosen": -1.0248132944107056, + "logits/rejected": -0.9395061731338501, + "logps/chosen": -83.96945190429688, + "logps/rejected": -77.94017028808594, + "loss": 0.3674, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1146888732910156, + "rewards/margins": 0.09284806251525879, + "rewards/rejected": 3.021840810775757, + "step": 6035 + }, + { + "epoch": 0.98, + "learning_rate": 5.406968026030205e-06, + "logits/chosen": -0.9619723558425903, + "logits/rejected": -0.8814389705657959, + "logps/chosen": -86.10848999023438, + "logps/rejected": -59.306922912597656, + "loss": 0.1988, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6380455493927, + "rewards/margins": 1.103082299232483, + "rewards/rejected": 1.5349632501602173, + "step": 6036 + }, + { + "epoch": 0.98, + "learning_rate": 5.405658118678866e-06, + "logits/chosen": -1.0412864685058594, + "logits/rejected": -1.0709283351898193, + "logps/chosen": -48.50614547729492, + "logps/rejected": -117.06199645996094, + "loss": 0.8185, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7613773345947266, + "rewards/margins": 0.5005146265029907, + "rewards/rejected": 1.2608627080917358, + "step": 6037 + }, + { + "epoch": 0.98, + "learning_rate": 5.4043481833003884e-06, + "logits/chosen": -1.3122899532318115, + "logits/rejected": -1.4359760284423828, + "logps/chosen": -96.6413803100586, + "logps/rejected": -142.5267333984375, + "loss": 3.5741, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6696648597717285, + "rewards/margins": -7.068043231964111, + "rewards/rejected": 10.73770809173584, + "step": 6038 + }, + { + "epoch": 0.98, + "learning_rate": 5.4030382199852785e-06, + "logits/chosen": -1.0775307416915894, + "logits/rejected": -0.9877769947052002, + "logps/chosen": -62.21565628051758, + "logps/rejected": -69.57302856445312, + "loss": 0.1662, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.579223394393921, + "rewards/margins": 0.9541653394699097, + "rewards/rejected": 1.6250580549240112, + "step": 6039 + }, + { + "epoch": 0.98, + "learning_rate": 5.401728228824042e-06, + "logits/chosen": -1.0692474842071533, + "logits/rejected": -1.074729323387146, + "logps/chosen": -114.51161193847656, + "logps/rejected": -116.40107727050781, + "loss": 0.1849, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9378098249435425, + "rewards/margins": 1.1981430053710938, + "rewards/rejected": 0.739666759967804, + "step": 6040 + }, + { + "epoch": 0.98, + "learning_rate": 5.4004182099071844e-06, + "logits/chosen": -0.4851018488407135, + "logits/rejected": -0.4851018488407135, + "logps/chosen": -1.4290069341659546, + "logps/rejected": -1.4290069341659546, + "loss": 0.4238, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.33560267090797424, + "rewards/margins": 0.0, + "rewards/rejected": 0.33560267090797424, + "step": 6041 + }, + { + "epoch": 0.98, + "learning_rate": 5.399108163325217e-06, + "logits/chosen": -1.2056015729904175, + "logits/rejected": -1.1958128213882446, + "logps/chosen": -76.8797607421875, + "logps/rejected": -102.64482116699219, + "loss": 3.0925, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8188461065292358, + "rewards/margins": 0.40630412101745605, + "rewards/rejected": 1.4125419855117798, + "step": 6042 + }, + { + "epoch": 0.98, + "learning_rate": 5.397798089168653e-06, + "logits/chosen": -0.8937522768974304, + "logits/rejected": -1.0471948385238647, + "logps/chosen": -126.26528930664062, + "logps/rejected": -119.07838439941406, + "loss": 1.3484, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8615570068359375, + "rewards/margins": -2.044325351715088, + "rewards/rejected": 3.9058823585510254, + "step": 6043 + }, + { + "epoch": 0.98, + "learning_rate": 5.396487987528005e-06, + "logits/chosen": -1.007275104522705, + "logits/rejected": -1.0076788663864136, + "logps/chosen": -76.85874938964844, + "logps/rejected": -80.69769287109375, + "loss": 0.4523, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.880090355873108, + "rewards/margins": 0.635810136795044, + "rewards/rejected": 1.244280219078064, + "step": 6044 + }, + { + "epoch": 0.98, + "learning_rate": 5.395177858493788e-06, + "logits/chosen": -1.1102542877197266, + "logits/rejected": -1.0447850227355957, + "logps/chosen": -65.40492248535156, + "logps/rejected": -53.28064727783203, + "loss": 0.4172, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6540496349334717, + "rewards/margins": 0.27235865592956543, + "rewards/rejected": 2.3816909790039062, + "step": 6045 + }, + { + "epoch": 0.98, + "learning_rate": 5.3938677021565225e-06, + "logits/chosen": -1.2377020120620728, + "logits/rejected": -1.164973497390747, + "logps/chosen": -111.2275390625, + "logps/rejected": -32.42955780029297, + "loss": 0.25, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5298264026641846, + "rewards/margins": 2.3944168090820312, + "rewards/rejected": 0.13540954887866974, + "step": 6046 + }, + { + "epoch": 0.98, + "learning_rate": 5.392557518606724e-06, + "logits/chosen": -1.0926368236541748, + "logits/rejected": -1.1016769409179688, + "logps/chosen": -64.32148742675781, + "logps/rejected": -104.67622375488281, + "loss": 0.6094, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9481644034385681, + "rewards/margins": -0.512890636920929, + "rewards/rejected": 1.461055040359497, + "step": 6047 + }, + { + "epoch": 0.98, + "learning_rate": 5.391247307934915e-06, + "logits/chosen": -1.2290420532226562, + "logits/rejected": -1.0889407396316528, + "logps/chosen": -139.7881622314453, + "logps/rejected": -52.54301071166992, + "loss": 0.1687, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.522722005844116, + "rewards/margins": 0.9369657039642334, + "rewards/rejected": 2.585756301879883, + "step": 6048 + }, + { + "epoch": 0.98, + "learning_rate": 5.389937070231619e-06, + "logits/chosen": -1.2599128484725952, + "logits/rejected": -1.24248206615448, + "logps/chosen": -83.3927993774414, + "logps/rejected": -43.2738037109375, + "loss": 0.4302, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2839133739471436, + "rewards/margins": -0.014619588851928711, + "rewards/rejected": 3.2985329627990723, + "step": 6049 + }, + { + "epoch": 0.98, + "learning_rate": 5.388626805587361e-06, + "logits/chosen": -1.050415277481079, + "logits/rejected": -1.050415277481079, + "logps/chosen": -27.75857162475586, + "logps/rejected": -27.75857162475586, + "loss": 0.3841, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9976298809051514, + "rewards/margins": 0.0, + "rewards/rejected": 2.9976298809051514, + "step": 6050 + }, + { + "epoch": 0.98, + "learning_rate": 5.387316514092668e-06, + "logits/chosen": -0.9625276327133179, + "logits/rejected": -0.9435266852378845, + "logps/chosen": -42.53179168701172, + "logps/rejected": -42.003150939941406, + "loss": 0.1957, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6917359828948975, + "rewards/margins": 0.7931060791015625, + "rewards/rejected": 2.898629903793335, + "step": 6051 + }, + { + "epoch": 0.98, + "learning_rate": 5.386006195838069e-06, + "logits/chosen": -1.3106714487075806, + "logits/rejected": -1.2444555759429932, + "logps/chosen": -63.82251739501953, + "logps/rejected": -14.407388687133789, + "loss": 0.4354, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3921279907226562, + "rewards/margins": 2.054372549057007, + "rewards/rejected": 0.337755411863327, + "step": 6052 + }, + { + "epoch": 0.98, + "learning_rate": 5.3846958509140945e-06, + "logits/chosen": -1.3276869058609009, + "logits/rejected": -1.1402186155319214, + "logps/chosen": -109.56216430664062, + "logps/rejected": -77.97760009765625, + "loss": 0.8978, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.130725383758545, + "rewards/margins": 3.3555924892425537, + "rewards/rejected": 2.775132894515991, + "step": 6053 + }, + { + "epoch": 0.98, + "learning_rate": 5.383385479411276e-06, + "logits/chosen": -1.517569661140442, + "logits/rejected": -1.5032613277435303, + "logps/chosen": -111.82048797607422, + "logps/rejected": -65.37844848632812, + "loss": 0.3474, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.495209693908691, + "rewards/margins": 1.4732658863067627, + "rewards/rejected": 3.0219438076019287, + "step": 6054 + }, + { + "epoch": 0.98, + "learning_rate": 5.38207508142015e-06, + "logits/chosen": -0.8412697315216064, + "logits/rejected": -0.8267340064048767, + "logps/chosen": -64.16656494140625, + "logps/rejected": -57.99003601074219, + "loss": 0.5461, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2550185918807983, + "rewards/margins": 0.712734580039978, + "rewards/rejected": 0.5422840118408203, + "step": 6055 + }, + { + "epoch": 0.98, + "learning_rate": 5.38076465703125e-06, + "logits/chosen": -0.9170422554016113, + "logits/rejected": -0.9840239882469177, + "logps/chosen": -47.12181854248047, + "logps/rejected": -53.88922119140625, + "loss": 2.3904, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7242332696914673, + "rewards/margins": -1.2873574495315552, + "rewards/rejected": 3.0115907192230225, + "step": 6056 + }, + { + "epoch": 0.98, + "learning_rate": 5.379454206335114e-06, + "logits/chosen": -1.241906762123108, + "logits/rejected": -1.2062727212905884, + "logps/chosen": -104.47479248046875, + "logps/rejected": -55.99183654785156, + "loss": 0.5723, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4590301513671875, + "rewards/margins": -0.2931396961212158, + "rewards/rejected": 2.7521698474884033, + "step": 6057 + }, + { + "epoch": 0.98, + "learning_rate": 5.378143729422285e-06, + "logits/chosen": -0.8991517424583435, + "logits/rejected": -0.8989121317863464, + "logps/chosen": -3.137216567993164, + "logps/rejected": -0.6399341225624084, + "loss": 1.2224, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.23614411056041718, + "rewards/margins": -0.0283174067735672, + "rewards/rejected": 0.2644615173339844, + "step": 6058 + }, + { + "epoch": 0.98, + "learning_rate": 5.3768332263833e-06, + "logits/chosen": -0.9372721314430237, + "logits/rejected": -0.7892526984214783, + "logps/chosen": -51.591888427734375, + "logps/rejected": -17.971683502197266, + "loss": 1.0209, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.637590169906616, + "rewards/margins": 2.279802083969116, + "rewards/rejected": 0.3577880859375, + "step": 6059 + }, + { + "epoch": 0.98, + "learning_rate": 5.375522697308706e-06, + "logits/chosen": -1.195556879043579, + "logits/rejected": -1.195556879043579, + "logps/chosen": -49.54142761230469, + "logps/rejected": -49.54142761230469, + "loss": 2.4414, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.982755422592163, + "rewards/margins": 0.0, + "rewards/rejected": 2.982755422592163, + "step": 6060 + }, + { + "epoch": 0.98, + "learning_rate": 5.374212142289047e-06, + "logits/chosen": -1.2018331289291382, + "logits/rejected": -1.230196475982666, + "logps/chosen": -94.14976501464844, + "logps/rejected": -140.62355041503906, + "loss": 0.4131, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.3121018409729, + "rewards/margins": -0.20171833038330078, + "rewards/rejected": 5.513820171356201, + "step": 6061 + }, + { + "epoch": 0.98, + "learning_rate": 5.372901561414869e-06, + "logits/chosen": -0.5670936107635498, + "logits/rejected": -0.603877067565918, + "logps/chosen": -16.872812271118164, + "logps/rejected": -44.17324447631836, + "loss": 0.968, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2665647566318512, + "rewards/margins": 0.12971173226833344, + "rewards/rejected": 0.13685302436351776, + "step": 6062 + }, + { + "epoch": 0.98, + "learning_rate": 5.371590954776723e-06, + "logits/chosen": -1.2371162176132202, + "logits/rejected": -1.028792142868042, + "logps/chosen": -117.36155700683594, + "logps/rejected": -47.77581787109375, + "loss": 0.2606, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.571290493011475, + "rewards/margins": 1.6655402183532715, + "rewards/rejected": 3.905750274658203, + "step": 6063 + }, + { + "epoch": 0.98, + "learning_rate": 5.370280322465157e-06, + "logits/chosen": -1.2819751501083374, + "logits/rejected": -1.2019015550613403, + "logps/chosen": -101.00917053222656, + "logps/rejected": -74.6386489868164, + "loss": 0.0969, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.912358283996582, + "rewards/margins": 3.034771203994751, + "rewards/rejected": 3.877587080001831, + "step": 6064 + }, + { + "epoch": 0.98, + "learning_rate": 5.368969664570725e-06, + "logits/chosen": -1.1546142101287842, + "logits/rejected": -1.1442972421646118, + "logps/chosen": -64.78797912597656, + "logps/rejected": -36.71128845214844, + "loss": 0.5108, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8598273992538452, + "rewards/margins": -0.1452258825302124, + "rewards/rejected": 2.0050532817840576, + "step": 6065 + }, + { + "epoch": 0.98, + "learning_rate": 5.367658981183979e-06, + "logits/chosen": -1.2377880811691284, + "logits/rejected": -1.1499779224395752, + "logps/chosen": -78.87201690673828, + "logps/rejected": -59.265106201171875, + "loss": 0.1208, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.3686394691467285, + "rewards/margins": 1.6362130641937256, + "rewards/rejected": 2.732426404953003, + "step": 6066 + }, + { + "epoch": 0.98, + "learning_rate": 5.3663482723954775e-06, + "logits/chosen": -1.1481856107711792, + "logits/rejected": -1.1730048656463623, + "logps/chosen": -41.6826286315918, + "logps/rejected": -53.96144104003906, + "loss": 0.3299, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5046566724777222, + "rewards/margins": 0.09127235412597656, + "rewards/rejected": 1.4133843183517456, + "step": 6067 + }, + { + "epoch": 0.98, + "learning_rate": 5.365037538295776e-06, + "logits/chosen": -1.063504695892334, + "logits/rejected": -1.1318000555038452, + "logps/chosen": -50.27446746826172, + "logps/rejected": -107.41046142578125, + "loss": 0.8423, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5622398853302002, + "rewards/margins": -0.2544517517089844, + "rewards/rejected": 1.8166916370391846, + "step": 6068 + }, + { + "epoch": 0.99, + "learning_rate": 5.363726778975436e-06, + "logits/chosen": -1.1553338766098022, + "logits/rejected": -1.1987420320510864, + "logps/chosen": -84.02847290039062, + "logps/rejected": -77.57806396484375, + "loss": 1.0181, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.123955488204956, + "rewards/margins": -1.5025849342346191, + "rewards/rejected": 3.626540422439575, + "step": 6069 + }, + { + "epoch": 0.99, + "learning_rate": 5.3624159945250155e-06, + "logits/chosen": -1.1182719469070435, + "logits/rejected": -1.127596139907837, + "logps/chosen": -22.319595336914062, + "logps/rejected": -5.277781963348389, + "loss": 0.3371, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4866199493408203, + "rewards/margins": 0.04228845238685608, + "rewards/rejected": 0.44433149695396423, + "step": 6070 + }, + { + "epoch": 0.99, + "learning_rate": 5.361105185035079e-06, + "logits/chosen": -1.2569352388381958, + "logits/rejected": -1.2569352388381958, + "logps/chosen": -62.38813018798828, + "logps/rejected": -62.38813018798828, + "loss": 1.2336, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1738669872283936, + "rewards/margins": 0.0, + "rewards/rejected": 2.1738669872283936, + "step": 6071 + }, + { + "epoch": 0.99, + "learning_rate": 5.3597943505961926e-06, + "logits/chosen": -1.601088523864746, + "logits/rejected": -1.6357614994049072, + "logps/chosen": -50.4304084777832, + "logps/rejected": -88.05097198486328, + "loss": 0.6472, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8945813179016113, + "rewards/margins": 0.3536975383758545, + "rewards/rejected": 2.540883779525757, + "step": 6072 + }, + { + "epoch": 0.99, + "learning_rate": 5.3584834912989204e-06, + "logits/chosen": -1.1719813346862793, + "logits/rejected": -1.0959370136260986, + "logps/chosen": -39.65105438232422, + "logps/rejected": -14.025647163391113, + "loss": 0.1207, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4126343727111816, + "rewards/margins": 1.3928906917572021, + "rewards/rejected": 1.0197436809539795, + "step": 6073 + }, + { + "epoch": 0.99, + "learning_rate": 5.357172607233831e-06, + "logits/chosen": -0.9635646939277649, + "logits/rejected": -0.716277003288269, + "logps/chosen": -87.205810546875, + "logps/rejected": -20.310670852661133, + "loss": 0.1335, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.5012054443359375, + "rewards/margins": 4.062169075012207, + "rewards/rejected": 0.4390365779399872, + "step": 6074 + }, + { + "epoch": 0.99, + "learning_rate": 5.355861698491492e-06, + "logits/chosen": -1.1102817058563232, + "logits/rejected": -1.164060354232788, + "logps/chosen": -115.30979919433594, + "logps/rejected": -66.09931945800781, + "loss": 0.9899, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.10113525390625, + "rewards/margins": -0.7663452625274658, + "rewards/rejected": 3.867480516433716, + "step": 6075 + }, + { + "epoch": 0.99, + "learning_rate": 5.354550765162479e-06, + "logits/chosen": -0.9801575541496277, + "logits/rejected": -1.0373141765594482, + "logps/chosen": -100.73890686035156, + "logps/rejected": -83.42327117919922, + "loss": 1.7804, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3009979724884033, + "rewards/margins": -2.1255953311920166, + "rewards/rejected": 4.42659330368042, + "step": 6076 + }, + { + "epoch": 0.99, + "learning_rate": 5.353239807337363e-06, + "logits/chosen": -1.3731739521026611, + "logits/rejected": -1.4259346723556519, + "logps/chosen": -118.18423461914062, + "logps/rejected": -80.05706787109375, + "loss": 0.3266, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.212246894836426, + "rewards/margins": 0.16041898727416992, + "rewards/rejected": 5.051827907562256, + "step": 6077 + }, + { + "epoch": 0.99, + "learning_rate": 5.351928825106718e-06, + "logits/chosen": -1.1855427026748657, + "logits/rejected": -1.145666241645813, + "logps/chosen": -52.90794372558594, + "logps/rejected": -43.65559005737305, + "loss": 0.9245, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7567901611328125, + "rewards/margins": -0.4707324504852295, + "rewards/rejected": 3.227522611618042, + "step": 6078 + }, + { + "epoch": 0.99, + "learning_rate": 5.350617818561121e-06, + "logits/chosen": -0.959356427192688, + "logits/rejected": -0.9640257358551025, + "logps/chosen": -81.35197448730469, + "logps/rejected": -49.47303009033203, + "loss": 0.4992, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4369491636753082, + "rewards/margins": -0.5308895111083984, + "rewards/rejected": 0.967838704586029, + "step": 6079 + }, + { + "epoch": 0.99, + "learning_rate": 5.349306787791151e-06, + "logits/chosen": -0.8674501776695251, + "logits/rejected": -0.863292396068573, + "logps/chosen": -45.34714889526367, + "logps/rejected": -77.70077514648438, + "loss": 1.2837, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5752209424972534, + "rewards/margins": 0.4991466999053955, + "rewards/rejected": 1.076074242591858, + "step": 6080 + }, + { + "epoch": 0.99, + "learning_rate": 5.347995732887387e-06, + "logits/chosen": -0.7701177000999451, + "logits/rejected": -0.7641530632972717, + "logps/chosen": -3.921522617340088, + "logps/rejected": -25.18601417541504, + "loss": 0.5189, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.32775893807411194, + "rewards/margins": -0.4120441973209381, + "rewards/rejected": 0.73980313539505, + "step": 6081 + }, + { + "epoch": 0.99, + "learning_rate": 5.346684653940408e-06, + "logits/chosen": -1.8022732734680176, + "logits/rejected": -1.8006410598754883, + "logps/chosen": -60.528411865234375, + "logps/rejected": -72.7904281616211, + "loss": 0.8569, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5510849952697754, + "rewards/margins": -1.210228681564331, + "rewards/rejected": 3.7613136768341064, + "step": 6082 + }, + { + "epoch": 0.99, + "learning_rate": 5.345373551040802e-06, + "logits/chosen": -1.048183560371399, + "logits/rejected": -1.029286503791809, + "logps/chosen": -107.07545471191406, + "logps/rejected": -109.67444610595703, + "loss": 0.6296, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3687729835510254, + "rewards/margins": 1.4434869289398193, + "rewards/rejected": 0.9252861142158508, + "step": 6083 + }, + { + "epoch": 0.99, + "learning_rate": 5.344062424279153e-06, + "logits/chosen": -1.173822045326233, + "logits/rejected": -1.0408111810684204, + "logps/chosen": -103.29359436035156, + "logps/rejected": -56.21826934814453, + "loss": 0.0271, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.168818950653076, + "rewards/margins": 4.0177459716796875, + "rewards/rejected": 2.1510727405548096, + "step": 6084 + }, + { + "epoch": 0.99, + "learning_rate": 5.342751273746044e-06, + "logits/chosen": -1.1533607244491577, + "logits/rejected": -1.0492388010025024, + "logps/chosen": -54.232879638671875, + "logps/rejected": -37.89178466796875, + "loss": 0.7937, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.812030076980591, + "rewards/margins": -0.9256470203399658, + "rewards/rejected": 3.7376770973205566, + "step": 6085 + }, + { + "epoch": 0.99, + "learning_rate": 5.3414400995320655e-06, + "logits/chosen": -1.3336292505264282, + "logits/rejected": -1.3335096836090088, + "logps/chosen": -236.3189697265625, + "logps/rejected": -63.23521423339844, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.074639797210693, + "rewards/margins": 4.295109272003174, + "rewards/rejected": 0.7795303463935852, + "step": 6086 + }, + { + "epoch": 0.99, + "learning_rate": 5.340128901727808e-06, + "logits/chosen": -0.9105080366134644, + "logits/rejected": -0.9012544751167297, + "logps/chosen": -87.91094970703125, + "logps/rejected": -57.59667205810547, + "loss": 0.283, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.143454074859619, + "rewards/margins": 0.5963165760040283, + "rewards/rejected": 1.5471374988555908, + "step": 6087 + }, + { + "epoch": 0.99, + "learning_rate": 5.3388176804238614e-06, + "logits/chosen": -1.3803428411483765, + "logits/rejected": -1.2630412578582764, + "logps/chosen": -131.5630645751953, + "logps/rejected": -67.12052154541016, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.417732238769531, + "rewards/margins": 4.145331382751465, + "rewards/rejected": 4.272400856018066, + "step": 6088 + }, + { + "epoch": 0.99, + "learning_rate": 5.337506435710817e-06, + "logits/chosen": -1.19344961643219, + "logits/rejected": -1.1656744480133057, + "logps/chosen": -50.11953353881836, + "logps/rejected": -29.74103546142578, + "loss": 0.2548, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.727415919303894, + "rewards/margins": 0.4896721839904785, + "rewards/rejected": 1.2377437353134155, + "step": 6089 + }, + { + "epoch": 0.99, + "learning_rate": 5.3361951676792745e-06, + "logits/chosen": -1.321790337562561, + "logits/rejected": -1.3191221952438354, + "logps/chosen": -72.42958068847656, + "logps/rejected": -75.61372375488281, + "loss": 0.265, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6951340436935425, + "rewards/margins": 0.365969181060791, + "rewards/rejected": 1.3291648626327515, + "step": 6090 + }, + { + "epoch": 0.99, + "learning_rate": 5.334883876419825e-06, + "logits/chosen": -0.9634625315666199, + "logits/rejected": -0.8895869255065918, + "logps/chosen": -51.39061737060547, + "logps/rejected": -72.69123077392578, + "loss": 2.6518, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0387017726898193, + "rewards/margins": -0.15730881690979004, + "rewards/rejected": 2.1960105895996094, + "step": 6091 + }, + { + "epoch": 0.99, + "learning_rate": 5.333572562023068e-06, + "logits/chosen": -1.175881266593933, + "logits/rejected": -1.1349279880523682, + "logps/chosen": -116.04817199707031, + "logps/rejected": -57.3219108581543, + "loss": 0.081, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.374597072601318, + "rewards/margins": 1.9236795902252197, + "rewards/rejected": 2.4509174823760986, + "step": 6092 + }, + { + "epoch": 0.99, + "learning_rate": 5.332261224579605e-06, + "logits/chosen": -0.9825632572174072, + "logits/rejected": -0.9768884181976318, + "logps/chosen": -117.24019622802734, + "logps/rejected": -106.4565658569336, + "loss": 0.3237, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7432441711425781, + "rewards/margins": 0.1812751293182373, + "rewards/rejected": 1.5619690418243408, + "step": 6093 + }, + { + "epoch": 0.99, + "learning_rate": 5.330949864180034e-06, + "logits/chosen": -1.167452096939087, + "logits/rejected": -0.9578532576560974, + "logps/chosen": -147.3905029296875, + "logps/rejected": -36.897430419921875, + "loss": 0.3614, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.009461879730225, + "rewards/margins": 4.4094977378845215, + "rewards/rejected": 1.5999641418457031, + "step": 6094 + }, + { + "epoch": 0.99, + "learning_rate": 5.3296384809149595e-06, + "logits/chosen": -1.3623754978179932, + "logits/rejected": -1.1705933809280396, + "logps/chosen": -132.80349731445312, + "logps/rejected": -35.05664825439453, + "loss": 0.3758, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.694906711578369, + "rewards/margins": 3.0492210388183594, + "rewards/rejected": 2.6456856727600098, + "step": 6095 + }, + { + "epoch": 0.99, + "learning_rate": 5.328327074874985e-06, + "logits/chosen": -1.094690203666687, + "logits/rejected": -1.0530753135681152, + "logps/chosen": -63.75749969482422, + "logps/rejected": -60.35956954956055, + "loss": 0.5544, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.558675527572632, + "rewards/margins": 0.3233966827392578, + "rewards/rejected": 2.235278844833374, + "step": 6096 + }, + { + "epoch": 0.99, + "learning_rate": 5.327015646150716e-06, + "logits/chosen": -1.2897679805755615, + "logits/rejected": -1.3704088926315308, + "logps/chosen": -189.79183959960938, + "logps/rejected": -136.26458740234375, + "loss": 1.8841, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.838740587234497, + "rewards/margins": -2.287815809249878, + "rewards/rejected": 6.126556396484375, + "step": 6097 + }, + { + "epoch": 0.99, + "learning_rate": 5.325704194832759e-06, + "logits/chosen": -0.8974230289459229, + "logits/rejected": -0.8997181057929993, + "logps/chosen": -75.31465148925781, + "logps/rejected": -40.66606521606445, + "loss": 0.764, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0271103382110596, + "rewards/margins": -0.39242053031921387, + "rewards/rejected": 2.4195308685302734, + "step": 6098 + }, + { + "epoch": 0.99, + "learning_rate": 5.324392721011727e-06, + "logits/chosen": -0.7932671308517456, + "logits/rejected": -0.7695665955543518, + "logps/chosen": -13.993707656860352, + "logps/rejected": -16.924524307250977, + "loss": 1.2474, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5079679489135742, + "rewards/margins": -2.365978240966797, + "rewards/rejected": 2.873946189880371, + "step": 6099 + }, + { + "epoch": 0.99, + "learning_rate": 5.323081224778225e-06, + "logits/chosen": -1.0266461372375488, + "logits/rejected": -1.021807312965393, + "logps/chosen": -87.52597045898438, + "logps/rejected": -53.12982177734375, + "loss": 1.266, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1055748462677, + "rewards/margins": 1.1808781623840332, + "rewards/rejected": 0.9246967434883118, + "step": 6100 + }, + { + "epoch": 0.99, + "learning_rate": 5.3217697062228675e-06, + "logits/chosen": -0.8550703525543213, + "logits/rejected": -0.9897238612174988, + "logps/chosen": -24.02014923095703, + "logps/rejected": -85.00682067871094, + "loss": 3.1626, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0649940967559814, + "rewards/margins": -3.5339791774749756, + "rewards/rejected": 4.598973274230957, + "step": 6101 + }, + { + "epoch": 0.99, + "learning_rate": 5.3204581654362684e-06, + "logits/chosen": -0.8890793919563293, + "logits/rejected": -0.9729632139205933, + "logps/chosen": -84.50503540039062, + "logps/rejected": -143.2593994140625, + "loss": 2.6876, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9018921256065369, + "rewards/margins": -4.672845363616943, + "rewards/rejected": 5.574737548828125, + "step": 6102 + }, + { + "epoch": 0.99, + "learning_rate": 5.319146602509042e-06, + "logits/chosen": -1.3391484022140503, + "logits/rejected": -1.314807415008545, + "logps/chosen": -115.34429931640625, + "logps/rejected": -160.36715698242188, + "loss": 0.6939, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.071223735809326, + "rewards/margins": -1.0884737968444824, + "rewards/rejected": 8.159697532653809, + "step": 6103 + }, + { + "epoch": 0.99, + "learning_rate": 5.317835017531805e-06, + "logits/chosen": -0.6382517218589783, + "logits/rejected": -0.6340139508247375, + "logps/chosen": -3.770371198654175, + "logps/rejected": -1.7381030321121216, + "loss": 0.983, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3173478841781616, + "rewards/margins": -0.029294699430465698, + "rewards/rejected": 0.3466425836086273, + "step": 6104 + }, + { + "epoch": 0.99, + "learning_rate": 5.316523410595177e-06, + "logits/chosen": -0.9734231233596802, + "logits/rejected": -0.8381412625312805, + "logps/chosen": -43.58368682861328, + "logps/rejected": -7.632254600524902, + "loss": 0.132, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3828017711639404, + "rewards/margins": 1.4865870475769043, + "rewards/rejected": 0.8962147831916809, + "step": 6105 + }, + { + "epoch": 0.99, + "learning_rate": 5.315211781789775e-06, + "logits/chosen": -1.1049097776412964, + "logits/rejected": -1.1049097776412964, + "logps/chosen": -73.82650756835938, + "logps/rejected": -73.82650756835938, + "loss": 0.3504, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2004525661468506, + "rewards/margins": 0.0, + "rewards/rejected": 3.2004525661468506, + "step": 6106 + }, + { + "epoch": 0.99, + "learning_rate": 5.313900131206222e-06, + "logits/chosen": -1.3768856525421143, + "logits/rejected": -1.5226514339447021, + "logps/chosen": -62.966102600097656, + "logps/rejected": -37.03804016113281, + "loss": 0.027, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.234095811843872, + "rewards/margins": 2.9564616680145264, + "rewards/rejected": 0.27763405442237854, + "step": 6107 + }, + { + "epoch": 0.99, + "learning_rate": 5.3125884589351405e-06, + "logits/chosen": -1.2987711429595947, + "logits/rejected": -1.2863702774047852, + "logps/chosen": -91.97785186767578, + "logps/rejected": -70.6900634765625, + "loss": 1.4, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5339676141738892, + "rewards/margins": -1.4812957048416138, + "rewards/rejected": 3.015263319015503, + "step": 6108 + }, + { + "epoch": 0.99, + "learning_rate": 5.311276765067153e-06, + "logits/chosen": -1.2125858068466187, + "logits/rejected": -1.2174216508865356, + "logps/chosen": -55.91572189331055, + "logps/rejected": -164.30316162109375, + "loss": 3.7614, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6629703044891357, + "rewards/margins": -6.888720512390137, + "rewards/rejected": 10.551691055297852, + "step": 6109 + }, + { + "epoch": 0.99, + "learning_rate": 5.309965049692887e-06, + "logits/chosen": -0.9468461871147156, + "logits/rejected": -0.9468461871147156, + "logps/chosen": -21.703914642333984, + "logps/rejected": -21.703914642333984, + "loss": 0.3907, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2444615364074707, + "rewards/margins": 0.0, + "rewards/rejected": 2.2444615364074707, + "step": 6110 + }, + { + "epoch": 0.99, + "learning_rate": 5.308653312902968e-06, + "logits/chosen": -1.206481695175171, + "logits/rejected": -1.2355928421020508, + "logps/chosen": -109.38639831542969, + "logps/rejected": -99.41424560546875, + "loss": 0.1613, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.701446533203125, + "rewards/margins": 0.9662766456604004, + "rewards/rejected": 4.735169887542725, + "step": 6111 + }, + { + "epoch": 0.99, + "learning_rate": 5.307341554788027e-06, + "logits/chosen": -0.8811662197113037, + "logits/rejected": -0.8811662197113037, + "logps/chosen": -0.2740475535392761, + "logps/rejected": -0.2740475535392761, + "loss": 0.4972, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.07809299975633621, + "rewards/margins": 0.0, + "rewards/rejected": 0.07809299975633621, + "step": 6112 + }, + { + "epoch": 0.99, + "learning_rate": 5.3060297754386926e-06, + "logits/chosen": -1.197411060333252, + "logits/rejected": -1.1485449075698853, + "logps/chosen": -90.10687255859375, + "logps/rejected": -61.582008361816406, + "loss": 0.6201, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7438843250274658, + "rewards/margins": 0.7554298639297485, + "rewards/rejected": 0.9884544610977173, + "step": 6113 + }, + { + "epoch": 0.99, + "learning_rate": 5.304717974945596e-06, + "logits/chosen": -1.3956477642059326, + "logits/rejected": -1.390181541442871, + "logps/chosen": -55.851505279541016, + "logps/rejected": -45.21708679199219, + "loss": 0.6057, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6738338470458984, + "rewards/margins": 0.016791105270385742, + "rewards/rejected": 2.6570427417755127, + "step": 6114 + }, + { + "epoch": 0.99, + "learning_rate": 5.30340615339937e-06, + "logits/chosen": -1.056115984916687, + "logits/rejected": -1.056115984916687, + "logps/chosen": -39.724632263183594, + "logps/rejected": -39.724632263183594, + "loss": 0.6595, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.242434978485107, + "rewards/margins": 0.0, + "rewards/rejected": 4.242434978485107, + "step": 6115 + }, + { + "epoch": 0.99, + "learning_rate": 5.3020943108906496e-06, + "logits/chosen": -1.3317351341247559, + "logits/rejected": -1.3224422931671143, + "logps/chosen": -63.162078857421875, + "logps/rejected": -108.94070434570312, + "loss": 1.1292, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4143387079238892, + "rewards/margins": 0.12319719791412354, + "rewards/rejected": 1.2911415100097656, + "step": 6116 + }, + { + "epoch": 0.99, + "learning_rate": 5.300782447510072e-06, + "logits/chosen": -0.9595350623130798, + "logits/rejected": -1.0320273637771606, + "logps/chosen": -67.33541107177734, + "logps/rejected": -120.71241760253906, + "loss": 0.6229, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.370572805404663, + "rewards/margins": -0.8657660484313965, + "rewards/rejected": 3.2363388538360596, + "step": 6117 + }, + { + "epoch": 0.99, + "learning_rate": 5.299470563348273e-06, + "logits/chosen": -1.0110509395599365, + "logits/rejected": -1.11024808883667, + "logps/chosen": -58.964622497558594, + "logps/rejected": -105.80728149414062, + "loss": 2.6781, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6106430292129517, + "rewards/margins": -4.680166721343994, + "rewards/rejected": 6.290809631347656, + "step": 6118 + }, + { + "epoch": 0.99, + "learning_rate": 5.298158658495891e-06, + "logits/chosen": -1.2171149253845215, + "logits/rejected": -1.1528688669204712, + "logps/chosen": -130.86256408691406, + "logps/rejected": -58.86824417114258, + "loss": 0.2063, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0838685035705566, + "rewards/margins": 0.8857121467590332, + "rewards/rejected": 2.1981563568115234, + "step": 6119 + }, + { + "epoch": 0.99, + "learning_rate": 5.296846733043567e-06, + "logits/chosen": -1.2608448266983032, + "logits/rejected": -1.26577627658844, + "logps/chosen": -4.7379937171936035, + "logps/rejected": -2.880169153213501, + "loss": 0.8733, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.37883201241493225, + "rewards/margins": -0.03781619668006897, + "rewards/rejected": 0.4166482090950012, + "step": 6120 + }, + { + "epoch": 0.99, + "learning_rate": 5.295534787081943e-06, + "logits/chosen": -0.9640378355979919, + "logits/rejected": -0.9740679264068604, + "logps/chosen": -3.2295596599578857, + "logps/rejected": -1.9991884231567383, + "loss": 0.6586, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2954624891281128, + "rewards/margins": -0.036144763231277466, + "rewards/rejected": 0.33160725235939026, + "step": 6121 + }, + { + "epoch": 0.99, + "learning_rate": 5.294222820701661e-06, + "logits/chosen": -1.1803921461105347, + "logits/rejected": -1.1706541776657104, + "logps/chosen": -96.94880676269531, + "logps/rejected": -81.208740234375, + "loss": 0.3241, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7394638061523438, + "rewards/margins": 0.969648003578186, + "rewards/rejected": 1.7698158025741577, + "step": 6122 + }, + { + "epoch": 0.99, + "learning_rate": 5.292910833993367e-06, + "logits/chosen": -0.8595307469367981, + "logits/rejected": -0.8802430629730225, + "logps/chosen": -74.07151794433594, + "logps/rejected": -68.66455078125, + "loss": 0.7088, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7871322631835938, + "rewards/margins": -0.0615081787109375, + "rewards/rejected": 1.8486404418945312, + "step": 6123 + }, + { + "epoch": 0.99, + "learning_rate": 5.291598827047706e-06, + "logits/chosen": -1.4417327642440796, + "logits/rejected": -1.3912290334701538, + "logps/chosen": -239.21536254882812, + "logps/rejected": -120.62995147705078, + "loss": 2.0611, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1800262928009033, + "rewards/margins": -3.316673994064331, + "rewards/rejected": 6.496700286865234, + "step": 6124 + }, + { + "epoch": 0.99, + "learning_rate": 5.2902867999553245e-06, + "logits/chosen": -1.1944770812988281, + "logits/rejected": -1.2228769063949585, + "logps/chosen": -81.96890258789062, + "logps/rejected": -85.63980102539062, + "loss": 0.1878, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.932803630828857, + "rewards/margins": 1.1709916591644287, + "rewards/rejected": 3.7618119716644287, + "step": 6125 + }, + { + "epoch": 0.99, + "learning_rate": 5.288974752806871e-06, + "logits/chosen": -0.9919400215148926, + "logits/rejected": -0.9806984066963196, + "logps/chosen": -28.136648178100586, + "logps/rejected": -4.437272071838379, + "loss": 0.5232, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7247114181518555, + "rewards/margins": -0.03470182418823242, + "rewards/rejected": 0.7594132423400879, + "step": 6126 + }, + { + "epoch": 0.99, + "learning_rate": 5.287662685692998e-06, + "logits/chosen": -0.8461151719093323, + "logits/rejected": -0.9386401772499084, + "logps/chosen": -82.76591491699219, + "logps/rejected": -116.51995849609375, + "loss": 1.4369, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.842637777328491, + "rewards/margins": -1.9494659900665283, + "rewards/rejected": 4.7921037673950195, + "step": 6127 + }, + { + "epoch": 0.99, + "learning_rate": 5.286350598704354e-06, + "logits/chosen": -0.9658238887786865, + "logits/rejected": -0.9836233258247375, + "logps/chosen": -30.043853759765625, + "logps/rejected": -73.93738555908203, + "loss": 2.0429, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5385822057724, + "rewards/margins": -3.772038459777832, + "rewards/rejected": 5.3106207847595215, + "step": 6128 + }, + { + "epoch": 0.99, + "learning_rate": 5.285038491931593e-06, + "logits/chosen": -1.358953595161438, + "logits/rejected": -1.347702980041504, + "logps/chosen": -191.1892852783203, + "logps/rejected": -140.0795440673828, + "loss": 0.586, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.726649761199951, + "rewards/margins": -0.7914919853210449, + "rewards/rejected": 8.518141746520996, + "step": 6129 + }, + { + "epoch": 0.99, + "learning_rate": 5.2837263654653715e-06, + "logits/chosen": -1.1587473154067993, + "logits/rejected": -1.1772984266281128, + "logps/chosen": -62.74256134033203, + "logps/rejected": -57.38722229003906, + "loss": 0.9663, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1776299476623535, + "rewards/margins": -1.7584724426269531, + "rewards/rejected": 4.936102390289307, + "step": 6130 + }, + { + "epoch": 1.0, + "learning_rate": 5.28241421939634e-06, + "logits/chosen": -0.9890410900115967, + "logits/rejected": -0.8545078635215759, + "logps/chosen": -52.69088363647461, + "logps/rejected": -52.17920684814453, + "loss": 0.2467, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.712635517120361, + "rewards/margins": 3.656745195388794, + "rewards/rejected": 1.0558903217315674, + "step": 6131 + }, + { + "epoch": 1.0, + "learning_rate": 5.281102053815161e-06, + "logits/chosen": -1.1678341627120972, + "logits/rejected": -1.1779874563217163, + "logps/chosen": -47.71534729003906, + "logps/rejected": -35.88365936279297, + "loss": 0.9539, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1944077014923096, + "rewards/margins": -0.8211205005645752, + "rewards/rejected": 3.0155282020568848, + "step": 6132 + }, + { + "epoch": 1.0, + "learning_rate": 5.2797898688124875e-06, + "logits/chosen": -1.2688463926315308, + "logits/rejected": -1.3338958024978638, + "logps/chosen": -113.4674072265625, + "logps/rejected": -121.32852935791016, + "loss": 0.6333, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.924755871295929, + "rewards/margins": -0.8531089425086975, + "rewards/rejected": 1.7778648138046265, + "step": 6133 + }, + { + "epoch": 1.0, + "learning_rate": 5.2784776644789825e-06, + "logits/chosen": -1.1826814413070679, + "logits/rejected": -1.1863648891448975, + "logps/chosen": -1.6765248775482178, + "logps/rejected": -4.321219444274902, + "loss": 1.586, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.31431153416633606, + "rewards/margins": -0.11320045590400696, + "rewards/rejected": 0.427511990070343, + "step": 6134 + }, + { + "epoch": 1.0, + "learning_rate": 5.2771654409053055e-06, + "logits/chosen": -1.573095440864563, + "logits/rejected": -1.6447888612747192, + "logps/chosen": -73.20138549804688, + "logps/rejected": -147.31024169921875, + "loss": 4.0724, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.192124128341675, + "rewards/margins": -7.353984832763672, + "rewards/rejected": 9.546109199523926, + "step": 6135 + }, + { + "epoch": 1.0, + "learning_rate": 5.275853198182119e-06, + "logits/chosen": -1.1790176630020142, + "logits/rejected": -1.1786835193634033, + "logps/chosen": -38.427364349365234, + "logps/rejected": -23.0267333984375, + "loss": 0.3071, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9214771389961243, + "rewards/margins": 0.2642452120780945, + "rewards/rejected": 0.6572319269180298, + "step": 6136 + }, + { + "epoch": 1.0, + "learning_rate": 5.274540936400086e-06, + "logits/chosen": -1.236446738243103, + "logits/rejected": -1.236446738243103, + "logps/chosen": -48.752655029296875, + "logps/rejected": -48.752655029296875, + "loss": 0.7571, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2166755199432373, + "rewards/margins": 0.0, + "rewards/rejected": 3.2166755199432373, + "step": 6137 + }, + { + "epoch": 1.0, + "learning_rate": 5.273228655649873e-06, + "logits/chosen": -1.0016082525253296, + "logits/rejected": -1.0863069295883179, + "logps/chosen": -13.774225234985352, + "logps/rejected": -40.30958938598633, + "loss": 1.7472, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.142472267150879, + "rewards/margins": -3.0586323738098145, + "rewards/rejected": 4.201104640960693, + "step": 6138 + }, + { + "epoch": 1.0, + "learning_rate": 5.2719163560221464e-06, + "logits/chosen": -1.1912720203399658, + "logits/rejected": -0.9959267973899841, + "logps/chosen": -217.49842834472656, + "logps/rejected": -66.13518524169922, + "loss": 1.2018, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.027702331542969, + "rewards/margins": 0.8438496589660645, + "rewards/rejected": 4.183852672576904, + "step": 6139 + }, + { + "epoch": 1.0, + "learning_rate": 5.270604037607571e-06, + "logits/chosen": -1.052117109298706, + "logits/rejected": -1.057737946510315, + "logps/chosen": -27.69422149658203, + "logps/rejected": -24.270122528076172, + "loss": 0.3483, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8019390106201172, + "rewards/margins": 0.019543468952178955, + "rewards/rejected": 0.7823955416679382, + "step": 6140 + }, + { + "epoch": 1.0, + "learning_rate": 5.269291700496817e-06, + "logits/chosen": -1.0973190069198608, + "logits/rejected": -1.078260898590088, + "logps/chosen": -47.742919921875, + "logps/rejected": -43.71570587158203, + "loss": 0.6634, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.439321517944336, + "rewards/margins": -0.9438526630401611, + "rewards/rejected": 2.383174180984497, + "step": 6141 + }, + { + "epoch": 1.0, + "learning_rate": 5.267979344780555e-06, + "logits/chosen": -1.1095778942108154, + "logits/rejected": -1.0861424207687378, + "logps/chosen": -59.450748443603516, + "logps/rejected": -77.90534973144531, + "loss": 3.1931, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.957761764526367, + "rewards/margins": -0.8365483283996582, + "rewards/rejected": 3.7943100929260254, + "step": 6142 + }, + { + "epoch": 1.0, + "learning_rate": 5.266666970549455e-06, + "logits/chosen": -1.3986122608184814, + "logits/rejected": -1.3475370407104492, + "logps/chosen": -103.45604705810547, + "logps/rejected": -35.79928970336914, + "loss": 0.5145, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.5406928062438965, + "rewards/margins": 4.291191101074219, + "rewards/rejected": 1.2495018243789673, + "step": 6143 + }, + { + "epoch": 1.0, + "learning_rate": 5.265354577894192e-06, + "logits/chosen": -1.2862892150878906, + "logits/rejected": -1.2909893989562988, + "logps/chosen": -72.80795288085938, + "logps/rejected": -51.12876892089844, + "loss": 0.7648, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.55815589427948, + "rewards/margins": -0.26852869987487793, + "rewards/rejected": 1.826684594154358, + "step": 6144 + }, + { + "epoch": 1.0, + "learning_rate": 5.264042166905437e-06, + "logits/chosen": -1.442776083946228, + "logits/rejected": -1.3172152042388916, + "logps/chosen": -111.85038757324219, + "logps/rejected": -26.694795608520508, + "loss": 0.1388, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8528412580490112, + "rewards/margins": 1.3546431064605713, + "rewards/rejected": 0.49819812178611755, + "step": 6145 + }, + { + "epoch": 1.0, + "learning_rate": 5.2627297376738674e-06, + "logits/chosen": -1.2040624618530273, + "logits/rejected": -1.168954849243164, + "logps/chosen": -77.69282531738281, + "logps/rejected": -141.2606964111328, + "loss": 0.703, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.979516625404358, + "rewards/margins": -0.03898465633392334, + "rewards/rejected": 2.0185012817382812, + "step": 6146 + }, + { + "epoch": 1.0, + "learning_rate": 5.261417290290159e-06, + "logits/chosen": -1.3276026248931885, + "logits/rejected": -1.3009721040725708, + "logps/chosen": -141.96971130371094, + "logps/rejected": -107.436767578125, + "loss": 1.0239, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.960395812988281, + "rewards/margins": 0.29990386962890625, + "rewards/rejected": 6.660491943359375, + "step": 6147 + }, + { + "epoch": 1.0, + "learning_rate": 5.2601048248449894e-06, + "logits/chosen": -1.0721874237060547, + "logits/rejected": -1.08584725856781, + "logps/chosen": -99.06075286865234, + "logps/rejected": -130.46783447265625, + "loss": 0.1522, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9337317943573, + "rewards/margins": 1.6119894981384277, + "rewards/rejected": 1.321742296218872, + "step": 6148 + }, + { + "epoch": 1.0, + "learning_rate": 5.258792341429038e-06, + "logits/chosen": -0.9038986563682556, + "logits/rejected": -0.8635596036911011, + "logps/chosen": -30.882858276367188, + "logps/rejected": -2.0915422439575195, + "loss": 0.3758, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7307068109512329, + "rewards/margins": 0.5550270080566406, + "rewards/rejected": 0.1756797879934311, + "step": 6149 + }, + { + "epoch": 1.0, + "learning_rate": 5.257479840132983e-06, + "logits/chosen": -1.1881245374679565, + "logits/rejected": -1.0375638008117676, + "logps/chosen": -59.94023895263672, + "logps/rejected": -14.937193870544434, + "loss": 0.2711, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.579418182373047, + "rewards/margins": 3.484438896179199, + "rewards/rejected": 1.0949794054031372, + "step": 6150 + }, + { + "epoch": 1.0, + "learning_rate": 5.2561673210475085e-06, + "logits/chosen": -1.3510156869888306, + "logits/rejected": -1.3164386749267578, + "logps/chosen": -172.6058349609375, + "logps/rejected": -31.818933486938477, + "loss": 0.0214, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.314218044281006, + "rewards/margins": 3.7665297985076904, + "rewards/rejected": 3.5476882457733154, + "step": 6151 + }, + { + "epoch": 1.0, + "learning_rate": 5.254854784263296e-06, + "logits/chosen": -1.3502835035324097, + "logits/rejected": -1.3729315996170044, + "logps/chosen": -107.58732604980469, + "logps/rejected": -69.69728088378906, + "loss": 1.314, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.079948425292969, + "rewards/margins": -1.3278913497924805, + "rewards/rejected": 8.40783977508545, + "step": 6152 + }, + { + "epoch": 1.0, + "learning_rate": 5.253542229871029e-06, + "logits/chosen": -1.3730700016021729, + "logits/rejected": -1.2626546621322632, + "logps/chosen": -72.5374755859375, + "logps/rejected": -58.808467864990234, + "loss": 0.1479, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.759227275848389, + "rewards/margins": 1.551931381225586, + "rewards/rejected": 5.207295894622803, + "step": 6153 + }, + { + "epoch": 1.0, + "learning_rate": 5.252229657961394e-06, + "logits/chosen": -1.1916426420211792, + "logits/rejected": -1.0787886381149292, + "logps/chosen": -103.20932006835938, + "logps/rejected": -50.30946731567383, + "loss": 0.6534, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8171135187149048, + "rewards/margins": 0.7546970844268799, + "rewards/rejected": 1.062416434288025, + "step": 6154 + }, + { + "epoch": 1.0, + "learning_rate": 5.250917068625075e-06, + "logits/chosen": -1.448708176612854, + "logits/rejected": -1.4571855068206787, + "logps/chosen": -22.38863754272461, + "logps/rejected": -56.56757736206055, + "loss": 2.9035, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6376562118530273, + "rewards/margins": -2.2138381004333496, + "rewards/rejected": 2.851494312286377, + "step": 6155 + }, + { + "epoch": 1.0, + "learning_rate": 5.249604461952761e-06, + "logits/chosen": -1.256194829940796, + "logits/rejected": -1.360463261604309, + "logps/chosen": -53.628719329833984, + "logps/rejected": -94.82272338867188, + "loss": 1.0582, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.672276735305786, + "rewards/margins": -1.9234097003936768, + "rewards/rejected": 4.595686435699463, + "step": 6156 + }, + { + "epoch": 1.0, + "learning_rate": 5.248291838035141e-06, + "logits/chosen": -1.1749478578567505, + "logits/rejected": -1.2982012033462524, + "logps/chosen": -57.94402313232422, + "logps/rejected": -116.3337631225586, + "loss": 2.2438, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8611321449279785, + "rewards/margins": -3.5700912475585938, + "rewards/rejected": 6.431223392486572, + "step": 6157 + }, + { + "epoch": 1.0, + "learning_rate": 5.246979196962904e-06, + "logits/chosen": -1.3783186674118042, + "logits/rejected": -1.3563653230667114, + "logps/chosen": -109.10580444335938, + "logps/rejected": -141.2132568359375, + "loss": 1.7053, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8037078380584717, + "rewards/margins": -3.3719546794891357, + "rewards/rejected": 6.175662517547607, + "step": 6158 + }, + { + "epoch": 1.0, + "learning_rate": 5.245666538826741e-06, + "logits/chosen": -0.783972442150116, + "logits/rejected": -0.8322868347167969, + "logps/chosen": -28.74116325378418, + "logps/rejected": -69.0694580078125, + "loss": 2.0757, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.778270959854126, + "rewards/margins": -2.421846628189087, + "rewards/rejected": 4.200117588043213, + "step": 6159 + }, + { + "epoch": 1.0, + "learning_rate": 5.2443538637173464e-06, + "logits/chosen": -1.2598631381988525, + "logits/rejected": -1.2265563011169434, + "logps/chosen": -96.08576965332031, + "logps/rejected": -76.55418395996094, + "loss": 0.3465, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4847214221954346, + "rewards/margins": 0.009574174880981445, + "rewards/rejected": 3.475147247314453, + "step": 6160 + }, + { + "epoch": 1.0, + "learning_rate": 5.24304117172541e-06, + "logits/chosen": -1.3041629791259766, + "logits/rejected": -1.3460328578948975, + "logps/chosen": -114.45770263671875, + "logps/rejected": -110.40480041503906, + "loss": 2.9768, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.881628513336182, + "rewards/margins": -3.355032444000244, + "rewards/rejected": 10.236660957336426, + "step": 6161 + }, + { + "epoch": 1.0, + "learning_rate": 5.24172846294163e-06, + "logits/chosen": -0.9879769086837769, + "logits/rejected": -1.0165910720825195, + "logps/chosen": -53.75847625732422, + "logps/rejected": -125.76585388183594, + "loss": 0.3514, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.704627275466919, + "rewards/margins": 0.032196879386901855, + "rewards/rejected": 1.672430396080017, + "step": 6162 + }, + { + "epoch": 1.0, + "learning_rate": 5.240415737456699e-06, + "logits/chosen": -1.121023416519165, + "logits/rejected": -1.106185793876648, + "logps/chosen": -38.84571075439453, + "logps/rejected": -83.00482940673828, + "loss": 0.7334, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.603295087814331, + "rewards/margins": -0.5452332496643066, + "rewards/rejected": 3.1485283374786377, + "step": 6163 + }, + { + "epoch": 1.0, + "learning_rate": 5.239102995361316e-06, + "logits/chosen": -1.3869308233261108, + "logits/rejected": -1.3566861152648926, + "logps/chosen": -42.76771545410156, + "logps/rejected": -78.73645782470703, + "loss": 0.5799, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.80692982673645, + "rewards/margins": -0.21749043464660645, + "rewards/rejected": 3.0244202613830566, + "step": 6164 + }, + { + "epoch": 1.0, + "learning_rate": 5.237790236746178e-06, + "logits/chosen": -1.4433194398880005, + "logits/rejected": -1.3935613632202148, + "logps/chosen": -87.4412841796875, + "logps/rejected": -30.094091415405273, + "loss": 0.0718, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7846497297286987, + "rewards/margins": 2.007129669189453, + "rewards/rejected": -0.22248001396656036, + "step": 6165 + }, + { + "epoch": 1.0, + "learning_rate": 5.236477461701985e-06, + "logits/chosen": -0.8048373460769653, + "logits/rejected": -0.8984298706054688, + "logps/chosen": -84.682373046875, + "logps/rejected": -92.90106201171875, + "loss": 1.7313, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.34197998046875, + "rewards/margins": -2.400347948074341, + "rewards/rejected": 3.742327928543091, + "step": 6166 + }, + { + "epoch": 1.0, + "learning_rate": 5.235164670319437e-06, + "logits/chosen": -1.3913512229919434, + "logits/rejected": -1.3074226379394531, + "logps/chosen": -64.03736877441406, + "logps/rejected": -17.148019790649414, + "loss": 0.0442, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.969036817550659, + "rewards/margins": 2.833754777908325, + "rewards/rejected": 0.13528214395046234, + "step": 6167 + }, + { + "epoch": 1.0, + "learning_rate": 5.233851862689235e-06, + "logits/chosen": -1.1546292304992676, + "logits/rejected": -1.2316844463348389, + "logps/chosen": -47.166114807128906, + "logps/rejected": -113.93167114257812, + "loss": 1.3987, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.910959005355835, + "rewards/margins": -1.143584966659546, + "rewards/rejected": 4.054543972015381, + "step": 6168 + }, + { + "epoch": 1.0, + "learning_rate": 5.232539038902082e-06, + "logits/chosen": -0.9705045223236084, + "logits/rejected": -1.0066701173782349, + "logps/chosen": -59.889373779296875, + "logps/rejected": -141.70188903808594, + "loss": 0.2272, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7414169311523438, + "rewards/margins": 0.6109496355056763, + "rewards/rejected": 1.1304672956466675, + "step": 6169 + }, + { + "epoch": 1.0, + "learning_rate": 5.231226199048682e-06, + "logits/chosen": -0.8125847578048706, + "logits/rejected": -0.8977205157279968, + "logps/chosen": -37.95552444458008, + "logps/rejected": -121.99212646484375, + "loss": 0.2956, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.004852771759033, + "rewards/margins": 0.3947361707687378, + "rewards/rejected": 1.6101166009902954, + "step": 6170 + }, + { + "epoch": 1.0, + "learning_rate": 5.22991334321974e-06, + "logits/chosen": -1.177624225616455, + "logits/rejected": -1.1502439975738525, + "logps/chosen": -96.53263854980469, + "logps/rejected": -38.17417907714844, + "loss": 0.2616, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4075334072113037, + "rewards/margins": 0.5638642311096191, + "rewards/rejected": 1.8436691761016846, + "step": 6171 + }, + { + "epoch": 1.0, + "learning_rate": 5.2286004715059615e-06, + "logits/chosen": -1.4570425748825073, + "logits/rejected": -1.5347665548324585, + "logps/chosen": -240.8341064453125, + "logps/rejected": -92.55513000488281, + "loss": 0.0877, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.735260009765625, + "rewards/margins": 1.6591415405273438, + "rewards/rejected": 6.076118469238281, + "step": 6172 + }, + { + "epoch": 1.0, + "learning_rate": 5.227287583998052e-06, + "logits/chosen": -1.177183985710144, + "logits/rejected": -1.1326274871826172, + "logps/chosen": -60.45901107788086, + "logps/rejected": -94.3651123046875, + "loss": 0.1117, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.912601113319397, + "rewards/margins": 1.4504581689834595, + "rewards/rejected": 0.4621429443359375, + "step": 6173 + }, + { + "epoch": 1.0, + "learning_rate": 5.225974680786721e-06, + "logits/chosen": -1.3182317018508911, + "logits/rejected": -1.4007856845855713, + "logps/chosen": -178.0701904296875, + "logps/rejected": -171.74537658691406, + "loss": 0.5854, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.717810153961182, + "rewards/margins": 0.49066638946533203, + "rewards/rejected": 7.22714376449585, + "step": 6174 + }, + { + "epoch": 1.0, + "learning_rate": 5.224661761962679e-06, + "logits/chosen": -1.0473763942718506, + "logits/rejected": -1.0101796388626099, + "logps/chosen": -38.431884765625, + "logps/rejected": -28.700115203857422, + "loss": 0.9053, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9345200061798096, + "rewards/margins": -1.3083629608154297, + "rewards/rejected": 3.2428829669952393, + "step": 6175 + }, + { + "epoch": 1.0, + "learning_rate": 5.223348827616635e-06, + "logits/chosen": -1.5034129619598389, + "logits/rejected": -1.3270639181137085, + "logps/chosen": -119.51512145996094, + "logps/rejected": -51.83504104614258, + "loss": 0.3829, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1939194202423096, + "rewards/margins": -0.13175082206726074, + "rewards/rejected": 3.3256702423095703, + "step": 6176 + }, + { + "epoch": 1.0, + "learning_rate": 5.2220358778393e-06, + "logits/chosen": -0.9424555897712708, + "logits/rejected": -0.8739567995071411, + "logps/chosen": -87.56702423095703, + "logps/rejected": -88.57184600830078, + "loss": 0.3153, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.181569576263428, + "rewards/margins": 0.44501185417175293, + "rewards/rejected": 3.736557722091675, + "step": 6177 + }, + { + "epoch": 1.0, + "learning_rate": 5.2207229127213866e-06, + "logits/chosen": -0.8990026712417603, + "logits/rejected": -0.92148756980896, + "logps/chosen": -1.2517576217651367, + "logps/rejected": -32.45283889770508, + "loss": 0.5177, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3861680030822754, + "rewards/margins": -0.3794218897819519, + "rewards/rejected": 0.7655898928642273, + "step": 6178 + }, + { + "epoch": 1.0, + "learning_rate": 5.219409932353609e-06, + "logits/chosen": -1.1874576807022095, + "logits/rejected": -1.1846981048583984, + "logps/chosen": -49.64056396484375, + "logps/rejected": -41.3634033203125, + "loss": 1.7132, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9758026599884033, + "rewards/margins": -0.46795654296875, + "rewards/rejected": 2.4437592029571533, + "step": 6179 + }, + { + "epoch": 1.0, + "learning_rate": 5.218096936826681e-06, + "logits/chosen": -1.099386215209961, + "logits/rejected": -1.0652822256088257, + "logps/chosen": -235.39930725097656, + "logps/rejected": -58.89704132080078, + "loss": 0.27, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.392924785614014, + "rewards/margins": 4.475614547729492, + "rewards/rejected": 1.917310357093811, + "step": 6180 + }, + { + "epoch": 1.0, + "learning_rate": 5.216783926231318e-06, + "logits/chosen": -1.1479802131652832, + "logits/rejected": -1.1938576698303223, + "logps/chosen": -55.561283111572266, + "logps/rejected": -68.12980651855469, + "loss": 0.8722, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2231175899505615, + "rewards/margins": -1.4967341423034668, + "rewards/rejected": 3.7198517322540283, + "step": 6181 + }, + { + "epoch": 1.0, + "learning_rate": 5.215470900658237e-06, + "logits/chosen": -1.37234628200531, + "logits/rejected": -1.3615813255310059, + "logps/chosen": -182.10079956054688, + "logps/rejected": -139.39991760253906, + "loss": 0.3375, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.635498046875, + "rewards/margins": 0.049393653869628906, + "rewards/rejected": 8.586104393005371, + "step": 6182 + }, + { + "epoch": 1.0, + "learning_rate": 5.214157860198156e-06, + "logits/chosen": -0.9671057462692261, + "logits/rejected": -0.9656313061714172, + "logps/chosen": -1.952044129371643, + "logps/rejected": -1.5715134143829346, + "loss": 0.7065, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.22034870088100433, + "rewards/margins": -0.09289725124835968, + "rewards/rejected": 0.313245952129364, + "step": 6183 + }, + { + "epoch": 1.0, + "learning_rate": 5.212844804941792e-06, + "logits/chosen": -1.2427552938461304, + "logits/rejected": -1.3502765893936157, + "logps/chosen": -128.12185668945312, + "logps/rejected": -110.81452941894531, + "loss": 0.571, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.108668565750122, + "rewards/margins": -0.45710301399230957, + "rewards/rejected": 3.5657715797424316, + "step": 6184 + }, + { + "epoch": 1.0, + "learning_rate": 5.2115317349798665e-06, + "logits/chosen": -1.6573961973190308, + "logits/rejected": -1.5720456838607788, + "logps/chosen": -54.38692092895508, + "logps/rejected": -14.002790451049805, + "loss": 0.0737, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.484985828399658, + "rewards/margins": 2.2577247619628906, + "rewards/rejected": 0.22726097702980042, + "step": 6185 + }, + { + "epoch": 1.0, + "learning_rate": 5.210218650403101e-06, + "logits/chosen": -1.1356581449508667, + "logits/rejected": -1.1226710081100464, + "logps/chosen": -13.437668800354004, + "logps/rejected": -4.845431327819824, + "loss": 0.3758, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1001811027526855, + "rewards/margins": 0.3316269516944885, + "rewards/rejected": 0.768554151058197, + "step": 6186 + }, + { + "epoch": 1.0, + "learning_rate": 5.208905551302214e-06, + "logits/chosen": -1.1391806602478027, + "logits/rejected": -1.1233569383621216, + "logps/chosen": -177.4840850830078, + "logps/rejected": -162.45545959472656, + "loss": 0.0632, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.488429546356201, + "rewards/margins": 2.7572054862976074, + "rewards/rejected": 1.7312240600585938, + "step": 6187 + }, + { + "epoch": 1.0, + "learning_rate": 5.207592437767931e-06, + "logits/chosen": -1.2977977991104126, + "logits/rejected": -1.2677792310714722, + "logps/chosen": -84.18927001953125, + "logps/rejected": -96.57537078857422, + "loss": 0.2565, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2024903297424316, + "rewards/margins": 1.0857139825820923, + "rewards/rejected": 1.1167763471603394, + "step": 6188 + }, + { + "epoch": 1.0, + "learning_rate": 5.206279309890975e-06, + "logits/chosen": -1.1340248584747314, + "logits/rejected": -1.2767443656921387, + "logps/chosen": -84.5176773071289, + "logps/rejected": -121.85150146484375, + "loss": 0.9369, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.890773057937622, + "rewards/margins": -1.248408555984497, + "rewards/rejected": 5.139181613922119, + "step": 6189 + }, + { + "epoch": 1.0, + "learning_rate": 5.20496616776207e-06, + "logits/chosen": -1.2670071125030518, + "logits/rejected": -1.2639648914337158, + "logps/chosen": -54.131107330322266, + "logps/rejected": -51.10669708251953, + "loss": 1.8596, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5899555683135986, + "rewards/margins": -0.238966703414917, + "rewards/rejected": 1.8289222717285156, + "step": 6190 + }, + { + "epoch": 1.0, + "learning_rate": 5.203653011471943e-06, + "logits/chosen": -1.040555715560913, + "logits/rejected": -0.9875328540802002, + "logps/chosen": -167.2618865966797, + "logps/rejected": -48.05321502685547, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.749871730804443, + "rewards/margins": 4.471887588500977, + "rewards/rejected": 2.2779839038848877, + "step": 6191 + }, + { + "epoch": 1.01, + "learning_rate": 5.202339841111319e-06, + "logits/chosen": -1.1325583457946777, + "logits/rejected": -1.2315728664398193, + "logps/chosen": -183.18389892578125, + "logps/rejected": -115.62306213378906, + "loss": 0.5147, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.808178901672363, + "rewards/margins": 0.09273386001586914, + "rewards/rejected": 7.715445041656494, + "step": 6192 + }, + { + "epoch": 1.01, + "learning_rate": 5.201026656770927e-06, + "logits/chosen": -1.3610926866531372, + "logits/rejected": -1.26500403881073, + "logps/chosen": -69.06825256347656, + "logps/rejected": -54.20149612426758, + "loss": 1.1785, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0535950660705566, + "rewards/margins": -0.7717678546905518, + "rewards/rejected": 3.8253629207611084, + "step": 6193 + }, + { + "epoch": 1.01, + "learning_rate": 5.199713458541495e-06, + "logits/chosen": -1.3690239191055298, + "logits/rejected": -1.185294270515442, + "logps/chosen": -131.7483673095703, + "logps/rejected": -34.887359619140625, + "loss": 0.0353, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.800419807434082, + "rewards/margins": 3.137805223464966, + "rewards/rejected": 2.662614583969116, + "step": 6194 + }, + { + "epoch": 1.01, + "learning_rate": 5.198400246513753e-06, + "logits/chosen": -1.1782044172286987, + "logits/rejected": -1.1893869638442993, + "logps/chosen": -92.51223754882812, + "logps/rejected": -122.29139709472656, + "loss": 0.619, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6774215698242188, + "rewards/margins": 0.3222564458847046, + "rewards/rejected": 1.3551651239395142, + "step": 6195 + }, + { + "epoch": 1.01, + "learning_rate": 5.197087020778431e-06, + "logits/chosen": -1.211401343345642, + "logits/rejected": -1.0283135175704956, + "logps/chosen": -90.61522674560547, + "logps/rejected": -65.86774444580078, + "loss": 0.0937, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.343857765197754, + "rewards/margins": 1.585120439529419, + "rewards/rejected": 3.758737325668335, + "step": 6196 + }, + { + "epoch": 1.01, + "learning_rate": 5.195773781426261e-06, + "logits/chosen": -1.2133463621139526, + "logits/rejected": -1.207724690437317, + "logps/chosen": -0.8473482728004456, + "logps/rejected": -4.648342132568359, + "loss": 0.3814, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.22605495154857635, + "rewards/margins": -0.057238683104515076, + "rewards/rejected": 0.28329363465309143, + "step": 6197 + }, + { + "epoch": 1.01, + "learning_rate": 5.194460528547974e-06, + "logits/chosen": -1.1849712133407593, + "logits/rejected": -0.9688615202903748, + "logps/chosen": -130.45529174804688, + "logps/rejected": -41.18895721435547, + "loss": 0.1862, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.923285007476807, + "rewards/margins": 3.3609635829925537, + "rewards/rejected": 2.562321424484253, + "step": 6198 + }, + { + "epoch": 1.01, + "learning_rate": 5.193147262234306e-06, + "logits/chosen": -1.1575371026992798, + "logits/rejected": -1.1575371026992798, + "logps/chosen": -91.9933853149414, + "logps/rejected": -91.9933853149414, + "loss": 0.4428, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.271846771240234, + "rewards/margins": 0.0, + "rewards/rejected": 4.271846771240234, + "step": 6199 + }, + { + "epoch": 1.01, + "learning_rate": 5.191833982575989e-06, + "logits/chosen": -1.2454719543457031, + "logits/rejected": -1.2454719543457031, + "logps/chosen": -51.58396911621094, + "logps/rejected": -51.58396911621094, + "loss": 0.3474, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.133650302886963, + "rewards/margins": 0.0, + "rewards/rejected": 3.133650302886963, + "step": 6200 + }, + { + "epoch": 1.01, + "learning_rate": 5.190520689663759e-06, + "logits/chosen": -1.0042169094085693, + "logits/rejected": -0.976853609085083, + "logps/chosen": -41.00446319580078, + "logps/rejected": -42.322166442871094, + "loss": 0.4905, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.844468832015991, + "rewards/margins": 0.9777405261993408, + "rewards/rejected": 2.8667283058166504, + "step": 6201 + }, + { + "epoch": 1.01, + "learning_rate": 5.189207383588353e-06, + "logits/chosen": -1.2409180402755737, + "logits/rejected": -1.4611546993255615, + "logps/chosen": -44.86436462402344, + "logps/rejected": -33.20460891723633, + "loss": 2.3994, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3943710327148438, + "rewards/margins": 1.4737807512283325, + "rewards/rejected": 1.9205902814865112, + "step": 6202 + }, + { + "epoch": 1.01, + "learning_rate": 5.187894064440505e-06, + "logits/chosen": -1.3940290212631226, + "logits/rejected": -1.3161287307739258, + "logps/chosen": -112.41954803466797, + "logps/rejected": -39.88517761230469, + "loss": 0.8148, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.692974090576172, + "rewards/margins": 2.4680404663085938, + "rewards/rejected": 3.224933624267578, + "step": 6203 + }, + { + "epoch": 1.01, + "learning_rate": 5.186580732310956e-06, + "logits/chosen": -1.427384853363037, + "logits/rejected": -1.2903343439102173, + "logps/chosen": -138.57254028320312, + "logps/rejected": -42.527076721191406, + "loss": 0.3146, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.401971340179443, + "rewards/margins": 2.8739829063415527, + "rewards/rejected": 2.5279884338378906, + "step": 6204 + }, + { + "epoch": 1.01, + "learning_rate": 5.185267387290445e-06, + "logits/chosen": -1.0952048301696777, + "logits/rejected": -1.1425410509109497, + "logps/chosen": -55.35649108886719, + "logps/rejected": -38.902191162109375, + "loss": 1.5294, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4540908336639404, + "rewards/margins": -2.6088106632232666, + "rewards/rejected": 6.062901496887207, + "step": 6205 + }, + { + "epoch": 1.01, + "learning_rate": 5.18395402946971e-06, + "logits/chosen": -1.1388822793960571, + "logits/rejected": -0.5892918109893799, + "logps/chosen": -118.24996948242188, + "logps/rejected": -35.90296936035156, + "loss": 0.1057, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.141314506530762, + "rewards/margins": 7.0636749267578125, + "rewards/rejected": 1.0776394605636597, + "step": 6206 + }, + { + "epoch": 1.01, + "learning_rate": 5.182640658939491e-06, + "logits/chosen": -1.2841405868530273, + "logits/rejected": -1.2719736099243164, + "logps/chosen": -152.99588012695312, + "logps/rejected": -51.9739875793457, + "loss": 0.4168, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.715191841125488, + "rewards/margins": 3.6513266563415527, + "rewards/rejected": 4.0638651847839355, + "step": 6207 + }, + { + "epoch": 1.01, + "learning_rate": 5.181327275790532e-06, + "logits/chosen": -1.2864258289337158, + "logits/rejected": -1.4046411514282227, + "logps/chosen": -37.03706359863281, + "logps/rejected": -66.8046646118164, + "loss": 2.5454, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2393798828125, + "rewards/margins": -2.615170478820801, + "rewards/rejected": 4.854550361633301, + "step": 6208 + }, + { + "epoch": 1.01, + "learning_rate": 5.180013880113573e-06, + "logits/chosen": -1.4246759414672852, + "logits/rejected": -1.1860467195510864, + "logps/chosen": -119.77824401855469, + "logps/rejected": -77.70297241210938, + "loss": 0.0303, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.923230171203613, + "rewards/margins": 3.363065481185913, + "rewards/rejected": 2.5601646900177, + "step": 6209 + }, + { + "epoch": 1.01, + "learning_rate": 5.178700471999357e-06, + "logits/chosen": -1.4858112335205078, + "logits/rejected": -1.3205242156982422, + "logps/chosen": -97.47288513183594, + "logps/rejected": -164.79156494140625, + "loss": 0.3254, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.9047088623046875, + "rewards/margins": 0.09740447998046875, + "rewards/rejected": 6.807304382324219, + "step": 6210 + }, + { + "epoch": 1.01, + "learning_rate": 5.177387051538631e-06, + "logits/chosen": -1.4077495336532593, + "logits/rejected": -1.4031040668487549, + "logps/chosen": -103.52740478515625, + "logps/rejected": -140.37271118164062, + "loss": 1.5858, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3615479469299316, + "rewards/margins": -2.8981690406799316, + "rewards/rejected": 6.259716987609863, + "step": 6211 + }, + { + "epoch": 1.01, + "learning_rate": 5.176073618822138e-06, + "logits/chosen": -1.5667587518692017, + "logits/rejected": -1.1995967626571655, + "logps/chosen": -132.17840576171875, + "logps/rejected": -125.33370971679688, + "loss": 0.6709, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.619809150695801, + "rewards/margins": -0.5738310813903809, + "rewards/rejected": 5.193640232086182, + "step": 6212 + }, + { + "epoch": 1.01, + "learning_rate": 5.174760173940625e-06, + "logits/chosen": -1.4043424129486084, + "logits/rejected": -1.3209792375564575, + "logps/chosen": -68.85494995117188, + "logps/rejected": -51.31898498535156, + "loss": 0.6556, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5653610229492188, + "rewards/margins": -0.8016610145568848, + "rewards/rejected": 3.3670220375061035, + "step": 6213 + }, + { + "epoch": 1.01, + "learning_rate": 5.173446716984837e-06, + "logits/chosen": -1.1182951927185059, + "logits/rejected": -1.1023794412612915, + "logps/chosen": -100.52533721923828, + "logps/rejected": -44.162864685058594, + "loss": 0.1746, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.1214470863342285, + "rewards/margins": 1.0757813453674316, + "rewards/rejected": 3.045665740966797, + "step": 6214 + }, + { + "epoch": 1.01, + "learning_rate": 5.172133248045521e-06, + "logits/chosen": -1.0275875329971313, + "logits/rejected": -1.0388686656951904, + "logps/chosen": -57.11314392089844, + "logps/rejected": -108.96163177490234, + "loss": 0.4378, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.814518690109253, + "rewards/margins": 0.7380392551422119, + "rewards/rejected": 2.076479434967041, + "step": 6215 + }, + { + "epoch": 1.01, + "learning_rate": 5.170819767213428e-06, + "logits/chosen": -1.5698308944702148, + "logits/rejected": -1.5482053756713867, + "logps/chosen": -85.53721618652344, + "logps/rejected": -52.99778366088867, + "loss": 1.499, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6163994073867798, + "rewards/margins": 0.39250218868255615, + "rewards/rejected": 1.2238972187042236, + "step": 6216 + }, + { + "epoch": 1.01, + "learning_rate": 5.1695062745793035e-06, + "logits/chosen": -1.064892053604126, + "logits/rejected": -1.0764347314834595, + "logps/chosen": -19.46582794189453, + "logps/rejected": -54.3177490234375, + "loss": 0.9288, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.23712825775146484, + "rewards/margins": -0.5064279437065125, + "rewards/rejected": 0.7435562014579773, + "step": 6217 + }, + { + "epoch": 1.01, + "learning_rate": 5.168192770233901e-06, + "logits/chosen": -1.1648752689361572, + "logits/rejected": -1.1932483911514282, + "logps/chosen": -69.41291809082031, + "logps/rejected": -195.22833251953125, + "loss": 1.0617, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8246140480041504, + "rewards/margins": 1.9473313093185425, + "rewards/rejected": 0.8772827386856079, + "step": 6218 + }, + { + "epoch": 1.01, + "learning_rate": 5.166879254267968e-06, + "logits/chosen": -1.2443856000900269, + "logits/rejected": -1.135425090789795, + "logps/chosen": -48.35264587402344, + "logps/rejected": -80.29576110839844, + "loss": 1.6336, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5289764404296875, + "rewards/margins": -3.1454882621765137, + "rewards/rejected": 5.674464702606201, + "step": 6219 + }, + { + "epoch": 1.01, + "learning_rate": 5.165565726772258e-06, + "logits/chosen": -1.243987798690796, + "logits/rejected": -1.2051082849502563, + "logps/chosen": -153.9181365966797, + "logps/rejected": -92.83548736572266, + "loss": 0.4562, + "rewards/accuracies": 0.0, + "rewards/chosen": 8.147978782653809, + "rewards/margins": -0.2869710922241211, + "rewards/rejected": 8.43494987487793, + "step": 6220 + }, + { + "epoch": 1.01, + "learning_rate": 5.164252187837523e-06, + "logits/chosen": -1.4658279418945312, + "logits/rejected": -1.4658279418945312, + "logps/chosen": -89.79119873046875, + "logps/rejected": -89.79119873046875, + "loss": 1.0889, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.737701416015625, + "rewards/margins": 0.0, + "rewards/rejected": 4.737701416015625, + "step": 6221 + }, + { + "epoch": 1.01, + "learning_rate": 5.1629386375545165e-06, + "logits/chosen": -1.260155200958252, + "logits/rejected": -1.260155200958252, + "logps/chosen": -55.626708984375, + "logps/rejected": -55.626708984375, + "loss": 0.5444, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.48872447013855, + "rewards/margins": 0.0, + "rewards/rejected": 3.48872447013855, + "step": 6222 + }, + { + "epoch": 1.01, + "learning_rate": 5.161625076013992e-06, + "logits/chosen": -1.179578185081482, + "logits/rejected": -1.1820517778396606, + "logps/chosen": -44.35765075683594, + "logps/rejected": -87.4334945678711, + "loss": 0.601, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7328296899795532, + "rewards/margins": -0.8348277807235718, + "rewards/rejected": 2.567657470703125, + "step": 6223 + }, + { + "epoch": 1.01, + "learning_rate": 5.160311503306703e-06, + "logits/chosen": -0.9142277240753174, + "logits/rejected": -0.9142277240753174, + "logps/chosen": -24.972885131835938, + "logps/rejected": -24.972885131835938, + "loss": 0.6461, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.41959723830223083, + "rewards/margins": 0.0, + "rewards/rejected": 0.41959723830223083, + "step": 6224 + }, + { + "epoch": 1.01, + "learning_rate": 5.158997919523406e-06, + "logits/chosen": -1.3527684211730957, + "logits/rejected": -1.4311507940292358, + "logps/chosen": -127.05512237548828, + "logps/rejected": -96.2007827758789, + "loss": 0.2603, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.178768157958984, + "rewards/margins": 0.3865370750427246, + "rewards/rejected": 6.79223108291626, + "step": 6225 + }, + { + "epoch": 1.01, + "learning_rate": 5.157684324754858e-06, + "logits/chosen": -1.1713910102844238, + "logits/rejected": -1.1723381280899048, + "logps/chosen": -8.392714500427246, + "logps/rejected": -2.378023862838745, + "loss": 0.3382, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.673524022102356, + "rewards/margins": 0.13097810745239258, + "rewards/rejected": 0.5425459146499634, + "step": 6226 + }, + { + "epoch": 1.01, + "learning_rate": 5.1563707190918155e-06, + "logits/chosen": -1.1087868213653564, + "logits/rejected": -1.1141525506973267, + "logps/chosen": -78.58846282958984, + "logps/rejected": -120.1862564086914, + "loss": 0.5125, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2732110023498535, + "rewards/margins": 1.2911972999572754, + "rewards/rejected": 0.9820137023925781, + "step": 6227 + }, + { + "epoch": 1.01, + "learning_rate": 5.155057102625035e-06, + "logits/chosen": -1.4072589874267578, + "logits/rejected": -1.4072589874267578, + "logps/chosen": -87.01701354980469, + "logps/rejected": -87.01701354980469, + "loss": 0.3757, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.112724304199219, + "rewards/margins": 0.0, + "rewards/rejected": 6.112724304199219, + "step": 6228 + }, + { + "epoch": 1.01, + "learning_rate": 5.1537434754452765e-06, + "logits/chosen": -1.1781258583068848, + "logits/rejected": -1.1945544481277466, + "logps/chosen": -67.53821563720703, + "logps/rejected": -82.46389770507812, + "loss": 0.7522, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5683350563049316, + "rewards/margins": -0.2238166332244873, + "rewards/rejected": 2.792151689529419, + "step": 6229 + }, + { + "epoch": 1.01, + "learning_rate": 5.152429837643298e-06, + "logits/chosen": -1.2086330652236938, + "logits/rejected": -1.0747076272964478, + "logps/chosen": -88.98129272460938, + "logps/rejected": -68.8731460571289, + "loss": 0.1575, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.760612487792969, + "rewards/margins": 1.0111184120178223, + "rewards/rejected": 4.7494940757751465, + "step": 6230 + }, + { + "epoch": 1.01, + "learning_rate": 5.151116189309861e-06, + "logits/chosen": -1.1181391477584839, + "logits/rejected": -1.0584248304367065, + "logps/chosen": -171.82675170898438, + "logps/rejected": -123.94061279296875, + "loss": 0.0721, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.030804395675659, + "rewards/margins": 1.9747130870819092, + "rewards/rejected": 1.05609130859375, + "step": 6231 + }, + { + "epoch": 1.01, + "learning_rate": 5.149802530535724e-06, + "logits/chosen": -1.130760908126831, + "logits/rejected": -1.1298936605453491, + "logps/chosen": -35.807334899902344, + "logps/rejected": -59.252777099609375, + "loss": 0.5214, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6611945629119873, + "rewards/margins": 0.6693755388259888, + "rewards/rejected": 1.9918190240859985, + "step": 6232 + }, + { + "epoch": 1.01, + "learning_rate": 5.148488861411649e-06, + "logits/chosen": -1.3018755912780762, + "logits/rejected": -1.228428602218628, + "logps/chosen": -101.20974731445312, + "logps/rejected": -54.999019622802734, + "loss": 0.089, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.485116720199585, + "rewards/margins": 1.65725839138031, + "rewards/rejected": 1.827858328819275, + "step": 6233 + }, + { + "epoch": 1.01, + "learning_rate": 5.1471751820284e-06, + "logits/chosen": -1.605820655822754, + "logits/rejected": -1.4459640979766846, + "logps/chosen": -94.03043365478516, + "logps/rejected": -49.6725959777832, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.68428897857666, + "rewards/margins": 3.976799488067627, + "rewards/rejected": 2.707489490509033, + "step": 6234 + }, + { + "epoch": 1.01, + "learning_rate": 5.14586149247674e-06, + "logits/chosen": -1.111081838607788, + "logits/rejected": -1.2262967824935913, + "logps/chosen": -35.16047286987305, + "logps/rejected": -84.07730102539062, + "loss": 1.5703, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5634663105010986, + "rewards/margins": -1.6001980304718018, + "rewards/rejected": 5.1636643409729, + "step": 6235 + }, + { + "epoch": 1.01, + "learning_rate": 5.144547792847428e-06, + "logits/chosen": -1.1438484191894531, + "logits/rejected": -1.1438484191894531, + "logps/chosen": -82.18989562988281, + "logps/rejected": -82.18989562988281, + "loss": 0.646, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.455180644989014, + "rewards/margins": 0.0, + "rewards/rejected": 4.455180644989014, + "step": 6236 + }, + { + "epoch": 1.01, + "learning_rate": 5.143234083231231e-06, + "logits/chosen": -1.585749626159668, + "logits/rejected": -1.5002776384353638, + "logps/chosen": -91.54106140136719, + "logps/rejected": -200.93423461914062, + "loss": 1.0717, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5430870056152344, + "rewards/margins": -2.015186309814453, + "rewards/rejected": 4.5582733154296875, + "step": 6237 + }, + { + "epoch": 1.01, + "learning_rate": 5.141920363718916e-06, + "logits/chosen": -0.8183650374412537, + "logits/rejected": -0.7192603945732117, + "logps/chosen": -36.21390151977539, + "logps/rejected": -43.24542236328125, + "loss": 0.3608, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0099918842315674, + "rewards/margins": 0.3018512725830078, + "rewards/rejected": 2.7081406116485596, + "step": 6238 + }, + { + "epoch": 1.01, + "learning_rate": 5.140606634401246e-06, + "logits/chosen": -1.1441885232925415, + "logits/rejected": -0.9875446557998657, + "logps/chosen": -85.35186767578125, + "logps/rejected": -51.98705291748047, + "loss": 0.4638, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.541259765625, + "rewards/margins": 1.835806131362915, + "rewards/rejected": 2.705453634262085, + "step": 6239 + }, + { + "epoch": 1.01, + "learning_rate": 5.139292895368989e-06, + "logits/chosen": -1.695733904838562, + "logits/rejected": -1.542232871055603, + "logps/chosen": -143.71713256835938, + "logps/rejected": -91.76917266845703, + "loss": 0.0728, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.052112102508545, + "rewards/margins": 2.6969621181488037, + "rewards/rejected": 3.355149984359741, + "step": 6240 + }, + { + "epoch": 1.01, + "learning_rate": 5.1379791467129105e-06, + "logits/chosen": -1.1309360265731812, + "logits/rejected": -1.2499934434890747, + "logps/chosen": -87.58732604980469, + "logps/rejected": -121.01927185058594, + "loss": 1.6416, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.756850481033325, + "rewards/margins": -3.2101004123687744, + "rewards/rejected": 6.9669508934021, + "step": 6241 + }, + { + "epoch": 1.01, + "learning_rate": 5.136665388523779e-06, + "logits/chosen": -0.86370849609375, + "logits/rejected": -0.8552645444869995, + "logps/chosen": -52.530799865722656, + "logps/rejected": -17.726316452026367, + "loss": 0.4294, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5697624087333679, + "rewards/margins": 0.09618127346038818, + "rewards/rejected": 0.47358113527297974, + "step": 6242 + }, + { + "epoch": 1.01, + "learning_rate": 5.1353516208923605e-06, + "logits/chosen": -1.1925387382507324, + "logits/rejected": -1.175767421722412, + "logps/chosen": -175.32388305664062, + "logps/rejected": -119.34861755371094, + "loss": 0.1808, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.212432861328125, + "rewards/margins": 3.2176527976989746, + "rewards/rejected": 1.9947799444198608, + "step": 6243 + }, + { + "epoch": 1.01, + "learning_rate": 5.1340378439094276e-06, + "logits/chosen": -1.5694149732589722, + "logits/rejected": -1.6695584058761597, + "logps/chosen": -167.9858856201172, + "logps/rejected": -63.71612548828125, + "loss": 0.0488, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.015419006347656, + "rewards/margins": 2.303788661956787, + "rewards/rejected": 5.711630344390869, + "step": 6244 + }, + { + "epoch": 1.01, + "learning_rate": 5.132724057665747e-06, + "logits/chosen": -1.1694117784500122, + "logits/rejected": -1.1675671339035034, + "logps/chosen": -1.567795753479004, + "logps/rejected": -9.386164665222168, + "loss": 0.3116, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27359458804130554, + "rewards/margins": 0.20319177210330963, + "rewards/rejected": 0.07040281593799591, + "step": 6245 + }, + { + "epoch": 1.01, + "learning_rate": 5.13141026225209e-06, + "logits/chosen": -1.563698410987854, + "logits/rejected": -1.2316417694091797, + "logps/chosen": -99.5575180053711, + "logps/rejected": -64.57771301269531, + "loss": 0.6674, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4480812549591064, + "rewards/margins": 1.6308097839355469, + "rewards/rejected": 1.8172714710235596, + "step": 6246 + }, + { + "epoch": 1.01, + "learning_rate": 5.130096457759227e-06, + "logits/chosen": -1.1947729587554932, + "logits/rejected": -1.1543523073196411, + "logps/chosen": -60.04020690917969, + "logps/rejected": -79.03598022460938, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.489588260650635, + "rewards/margins": 3.35807204246521, + "rewards/rejected": 2.131516218185425, + "step": 6247 + }, + { + "epoch": 1.01, + "learning_rate": 5.12878264427793e-06, + "logits/chosen": -1.3752950429916382, + "logits/rejected": -1.2806110382080078, + "logps/chosen": -88.15652465820312, + "logps/rejected": -57.38369369506836, + "loss": 0.3515, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9342117309570312, + "rewards/margins": 0.6460490226745605, + "rewards/rejected": 2.2881627082824707, + "step": 6248 + }, + { + "epoch": 1.01, + "learning_rate": 5.127468821898971e-06, + "logits/chosen": -1.620047688484192, + "logits/rejected": -1.5569403171539307, + "logps/chosen": -110.42710876464844, + "logps/rejected": -95.07502746582031, + "loss": 0.0799, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.667799472808838, + "rewards/margins": 2.2004714012145996, + "rewards/rejected": 5.467328071594238, + "step": 6249 + }, + { + "epoch": 1.01, + "learning_rate": 5.126154990713123e-06, + "logits/chosen": -1.222856879234314, + "logits/rejected": -1.2881195545196533, + "logps/chosen": -109.6432876586914, + "logps/rejected": -44.43004608154297, + "loss": 0.7341, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8302223682403564, + "rewards/margins": -0.268796443939209, + "rewards/rejected": 2.0990188121795654, + "step": 6250 + }, + { + "epoch": 1.01, + "learning_rate": 5.124841150811159e-06, + "logits/chosen": -1.0161134004592896, + "logits/rejected": -0.9557058215141296, + "logps/chosen": -39.82207107543945, + "logps/rejected": -30.74930191040039, + "loss": 0.6135, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6271274089813232, + "rewards/margins": 0.5877978801727295, + "rewards/rejected": 3.0393295288085938, + "step": 6251 + }, + { + "epoch": 1.01, + "learning_rate": 5.12352730228385e-06, + "logits/chosen": -1.037415862083435, + "logits/rejected": -1.037415862083435, + "logps/chosen": -41.65996170043945, + "logps/rejected": -41.65996170043945, + "loss": 0.5763, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1668827533721924, + "rewards/margins": 0.0, + "rewards/rejected": 3.1668827533721924, + "step": 6252 + }, + { + "epoch": 1.01, + "learning_rate": 5.122213445221976e-06, + "logits/chosen": -1.168701171875, + "logits/rejected": -1.171539545059204, + "logps/chosen": -28.914649963378906, + "logps/rejected": -37.080047607421875, + "loss": 1.7349, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.430864691734314, + "rewards/margins": -1.9969955682754517, + "rewards/rejected": 3.4278602600097656, + "step": 6253 + }, + { + "epoch": 1.02, + "learning_rate": 5.1208995797163085e-06, + "logits/chosen": -1.2786099910736084, + "logits/rejected": -1.3144009113311768, + "logps/chosen": -58.10652160644531, + "logps/rejected": -98.4148941040039, + "loss": 0.9935, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0940215587615967, + "rewards/margins": -1.580791711807251, + "rewards/rejected": 4.674813270568848, + "step": 6254 + }, + { + "epoch": 1.02, + "learning_rate": 5.1195857058576245e-06, + "logits/chosen": -1.4283978939056396, + "logits/rejected": -1.2001490592956543, + "logps/chosen": -119.94891357421875, + "logps/rejected": -74.81432342529297, + "loss": 0.5655, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.066709995269775, + "rewards/margins": 1.797091007232666, + "rewards/rejected": 3.2696189880371094, + "step": 6255 + }, + { + "epoch": 1.02, + "learning_rate": 5.118271823736699e-06, + "logits/chosen": -1.0191160440444946, + "logits/rejected": -0.9924176931381226, + "logps/chosen": -85.61958312988281, + "logps/rejected": -49.876121520996094, + "loss": 0.2191, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3882644176483154, + "rewards/margins": 0.6247290372848511, + "rewards/rejected": 1.7635353803634644, + "step": 6256 + }, + { + "epoch": 1.02, + "learning_rate": 5.116957933444311e-06, + "logits/chosen": -1.1490532159805298, + "logits/rejected": -0.9176468253135681, + "logps/chosen": -138.7372283935547, + "logps/rejected": -15.072884559631348, + "loss": 0.0762, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.137568712234497, + "rewards/margins": 1.8181331157684326, + "rewards/rejected": 0.31943559646606445, + "step": 6257 + }, + { + "epoch": 1.02, + "learning_rate": 5.115644035071234e-06, + "logits/chosen": -1.3372293710708618, + "logits/rejected": -1.2012341022491455, + "logps/chosen": -93.7021255493164, + "logps/rejected": -81.83450317382812, + "loss": 0.0904, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.213924407958984, + "rewards/margins": 3.946176052093506, + "rewards/rejected": 2.2677483558654785, + "step": 6258 + }, + { + "epoch": 1.02, + "learning_rate": 5.11433012870825e-06, + "logits/chosen": -1.2510950565338135, + "logits/rejected": -1.1779451370239258, + "logps/chosen": -53.05950927734375, + "logps/rejected": -31.1218318939209, + "loss": 1.0521, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4551796913146973, + "rewards/margins": 0.7946093082427979, + "rewards/rejected": 2.6605703830718994, + "step": 6259 + }, + { + "epoch": 1.02, + "learning_rate": 5.113016214446136e-06, + "logits/chosen": -1.3947625160217285, + "logits/rejected": -1.251348853111267, + "logps/chosen": -58.10425567626953, + "logps/rejected": -21.124208450317383, + "loss": 0.0586, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9467453956604004, + "rewards/margins": 2.7239444255828857, + "rewards/rejected": 0.22280101478099823, + "step": 6260 + }, + { + "epoch": 1.02, + "learning_rate": 5.111702292375671e-06, + "logits/chosen": -1.0574705600738525, + "logits/rejected": -0.9950194954872131, + "logps/chosen": -64.45343017578125, + "logps/rejected": -6.860042572021484, + "loss": 0.7579, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0158066749572754, + "rewards/margins": 0.3093940019607544, + "rewards/rejected": 1.706412672996521, + "step": 6261 + }, + { + "epoch": 1.02, + "learning_rate": 5.1103883625876335e-06, + "logits/chosen": -0.8910362124443054, + "logits/rejected": -0.8546386361122131, + "logps/chosen": -39.830474853515625, + "logps/rejected": -44.371376037597656, + "loss": 0.6697, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1234855651855469, + "rewards/margins": 0.15219956636428833, + "rewards/rejected": 0.9712859988212585, + "step": 6262 + }, + { + "epoch": 1.02, + "learning_rate": 5.109074425172806e-06, + "logits/chosen": -1.1069214344024658, + "logits/rejected": -1.055551290512085, + "logps/chosen": -57.99220657348633, + "logps/rejected": -76.81942749023438, + "loss": 2.3806, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8746745586395264, + "rewards/margins": -0.629664421081543, + "rewards/rejected": 3.5043389797210693, + "step": 6263 + }, + { + "epoch": 1.02, + "learning_rate": 5.107760480221967e-06, + "logits/chosen": -1.357677698135376, + "logits/rejected": -0.8776344656944275, + "logps/chosen": -137.00509643554688, + "logps/rejected": -22.418190002441406, + "loss": 0.2396, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.2237868309021, + "rewards/margins": 5.468387603759766, + "rewards/rejected": 0.7553993463516235, + "step": 6264 + }, + { + "epoch": 1.02, + "learning_rate": 5.106446527825899e-06, + "logits/chosen": -1.2343965768814087, + "logits/rejected": -1.1482679843902588, + "logps/chosen": -74.52628326416016, + "logps/rejected": -43.75285339355469, + "loss": 0.7094, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4614944458007812, + "rewards/margins": -0.7222123146057129, + "rewards/rejected": 4.183706760406494, + "step": 6265 + }, + { + "epoch": 1.02, + "learning_rate": 5.1051325680753826e-06, + "logits/chosen": -1.1895548105239868, + "logits/rejected": -1.1941889524459839, + "logps/chosen": -43.915714263916016, + "logps/rejected": -52.00251007080078, + "loss": 0.4108, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7015079259872437, + "rewards/margins": 0.4246845245361328, + "rewards/rejected": 1.2768234014511108, + "step": 6266 + }, + { + "epoch": 1.02, + "learning_rate": 5.103818601061201e-06, + "logits/chosen": -1.2219675779342651, + "logits/rejected": -1.1257754564285278, + "logps/chosen": -36.55071258544922, + "logps/rejected": -26.867788314819336, + "loss": 1.3138, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2529454231262207, + "rewards/margins": -0.9013988971710205, + "rewards/rejected": 3.154344320297241, + "step": 6267 + }, + { + "epoch": 1.02, + "learning_rate": 5.102504626874137e-06, + "logits/chosen": -1.62226140499115, + "logits/rejected": -1.628488540649414, + "logps/chosen": -83.21045684814453, + "logps/rejected": -55.38419723510742, + "loss": 1.1994, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3660943508148193, + "rewards/margins": -1.5885629653930664, + "rewards/rejected": 3.9546573162078857, + "step": 6268 + }, + { + "epoch": 1.02, + "learning_rate": 5.101190645604971e-06, + "logits/chosen": -0.8006094694137573, + "logits/rejected": -0.7869964838027954, + "logps/chosen": -42.826812744140625, + "logps/rejected": -42.71834182739258, + "loss": 1.1884, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9786590337753296, + "rewards/margins": -2.138359546661377, + "rewards/rejected": 4.117018699645996, + "step": 6269 + }, + { + "epoch": 1.02, + "learning_rate": 5.09987665734449e-06, + "logits/chosen": -1.4624346494674683, + "logits/rejected": -1.492693305015564, + "logps/chosen": -37.37548065185547, + "logps/rejected": -103.75880432128906, + "loss": 0.7102, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0575928688049316, + "rewards/margins": -0.06719207763671875, + "rewards/rejected": 3.1247849464416504, + "step": 6270 + }, + { + "epoch": 1.02, + "learning_rate": 5.0985626621834776e-06, + "logits/chosen": -0.9645650386810303, + "logits/rejected": -0.9349489212036133, + "logps/chosen": -72.52973175048828, + "logps/rejected": -103.62638854980469, + "loss": 1.1375, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3101776838302612, + "rewards/margins": 1.1799713373184204, + "rewards/rejected": 0.13020630180835724, + "step": 6271 + }, + { + "epoch": 1.02, + "learning_rate": 5.097248660212717e-06, + "logits/chosen": -1.1277340650558472, + "logits/rejected": -0.9509021639823914, + "logps/chosen": -76.42222595214844, + "logps/rejected": -71.18028259277344, + "loss": 0.4633, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6862258911132812, + "rewards/margins": -0.41533589363098145, + "rewards/rejected": 3.1015617847442627, + "step": 6272 + }, + { + "epoch": 1.02, + "learning_rate": 5.095934651522995e-06, + "logits/chosen": -1.1428111791610718, + "logits/rejected": -1.1428111791610718, + "logps/chosen": -16.40719223022461, + "logps/rejected": -16.40719223022461, + "loss": 1.3911, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7328609228134155, + "rewards/margins": 0.0, + "rewards/rejected": 1.7328609228134155, + "step": 6273 + }, + { + "epoch": 1.02, + "learning_rate": 5.094620636205096e-06, + "logits/chosen": -1.2878527641296387, + "logits/rejected": -1.223941683769226, + "logps/chosen": -72.33586120605469, + "logps/rejected": -107.76969146728516, + "loss": 0.258, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.951129913330078, + "rewards/margins": 0.5306899547576904, + "rewards/rejected": 3.4204399585723877, + "step": 6274 + }, + { + "epoch": 1.02, + "learning_rate": 5.093306614349806e-06, + "logits/chosen": -1.293829321861267, + "logits/rejected": -1.3222723007202148, + "logps/chosen": -57.69440460205078, + "logps/rejected": -39.64105224609375, + "loss": 1.7386, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1478111743927, + "rewards/margins": -3.2212679386138916, + "rewards/rejected": 5.369079113006592, + "step": 6275 + }, + { + "epoch": 1.02, + "learning_rate": 5.091992586047912e-06, + "logits/chosen": -1.198737621307373, + "logits/rejected": -1.115355372428894, + "logps/chosen": -44.273162841796875, + "logps/rejected": -60.196128845214844, + "loss": 0.175, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.245215654373169, + "rewards/margins": 1.369140625, + "rewards/rejected": 0.8760749697685242, + "step": 6276 + }, + { + "epoch": 1.02, + "learning_rate": 5.090678551390201e-06, + "logits/chosen": -1.1773086786270142, + "logits/rejected": -1.1773086786270142, + "logps/chosen": -38.192203521728516, + "logps/rejected": -38.192203521728516, + "loss": 0.5669, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8411846160888672, + "rewards/margins": 0.0, + "rewards/rejected": 0.8411846160888672, + "step": 6277 + }, + { + "epoch": 1.02, + "learning_rate": 5.089364510467459e-06, + "logits/chosen": -1.0563112497329712, + "logits/rejected": -1.019126296043396, + "logps/chosen": -21.209758758544922, + "logps/rejected": -7.502971172332764, + "loss": 0.8392, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6562666296958923, + "rewards/margins": 0.2497791051864624, + "rewards/rejected": 0.40648752450942993, + "step": 6278 + }, + { + "epoch": 1.02, + "learning_rate": 5.088050463370476e-06, + "logits/chosen": -1.3692066669464111, + "logits/rejected": -1.2575126886367798, + "logps/chosen": -225.89678955078125, + "logps/rejected": -147.71685791015625, + "loss": 0.0657, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.241763591766357, + "rewards/margins": 2.5530762672424316, + "rewards/rejected": 4.688687324523926, + "step": 6279 + }, + { + "epoch": 1.02, + "learning_rate": 5.08673641019004e-06, + "logits/chosen": -1.3280670642852783, + "logits/rejected": -1.4575319290161133, + "logps/chosen": -95.56169891357422, + "logps/rejected": -35.19256591796875, + "loss": 0.0659, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.347785234451294, + "rewards/margins": 1.9842915534973145, + "rewards/rejected": 0.36349374055862427, + "step": 6280 + }, + { + "epoch": 1.02, + "learning_rate": 5.085422351016937e-06, + "logits/chosen": -1.4863073825836182, + "logits/rejected": -1.1889649629592896, + "logps/chosen": -94.38325500488281, + "logps/rejected": -74.63631439208984, + "loss": 0.4193, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.249840497970581, + "rewards/margins": -0.0644845962524414, + "rewards/rejected": 2.3143250942230225, + "step": 6281 + }, + { + "epoch": 1.02, + "learning_rate": 5.084108285941959e-06, + "logits/chosen": -1.3770982027053833, + "logits/rejected": -1.3355810642242432, + "logps/chosen": -91.16279602050781, + "logps/rejected": -58.899208068847656, + "loss": 0.2877, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.506532192230225, + "rewards/margins": 2.7607579231262207, + "rewards/rejected": 1.7457741498947144, + "step": 6282 + }, + { + "epoch": 1.02, + "learning_rate": 5.082794215055894e-06, + "logits/chosen": -1.1904069185256958, + "logits/rejected": -1.3122844696044922, + "logps/chosen": -81.61116027832031, + "logps/rejected": -78.43245697021484, + "loss": 3.2122, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1966354846954346, + "rewards/margins": -6.421563148498535, + "rewards/rejected": 8.61819839477539, + "step": 6283 + }, + { + "epoch": 1.02, + "learning_rate": 5.0814801384495315e-06, + "logits/chosen": -1.2164921760559082, + "logits/rejected": -1.2609758377075195, + "logps/chosen": -32.631526947021484, + "logps/rejected": -33.06510925292969, + "loss": 1.3206, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8002086877822876, + "rewards/margins": -1.8615108728408813, + "rewards/rejected": 3.661719560623169, + "step": 6284 + }, + { + "epoch": 1.02, + "learning_rate": 5.080166056213664e-06, + "logits/chosen": -1.3011937141418457, + "logits/rejected": -1.2947256565093994, + "logps/chosen": -60.504608154296875, + "logps/rejected": -91.66349029541016, + "loss": 0.6925, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1998450756073, + "rewards/margins": 0.42699718475341797, + "rewards/rejected": 2.772847890853882, + "step": 6285 + }, + { + "epoch": 1.02, + "learning_rate": 5.078851968439078e-06, + "logits/chosen": -0.9804961085319519, + "logits/rejected": -0.933091938495636, + "logps/chosen": -79.14933776855469, + "logps/rejected": -50.25019073486328, + "loss": 0.7942, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.032139539718628, + "rewards/margins": 0.7742667198181152, + "rewards/rejected": 2.2578728199005127, + "step": 6286 + }, + { + "epoch": 1.02, + "learning_rate": 5.077537875216568e-06, + "logits/chosen": -1.407660722732544, + "logits/rejected": -1.1288150548934937, + "logps/chosen": -67.99148559570312, + "logps/rejected": -61.50440979003906, + "loss": 0.1426, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0881316661834717, + "rewards/margins": 2.443952798843384, + "rewards/rejected": 0.6441788077354431, + "step": 6287 + }, + { + "epoch": 1.02, + "learning_rate": 5.076223776636926e-06, + "logits/chosen": -1.0469990968704224, + "logits/rejected": -0.9681446552276611, + "logps/chosen": -41.912818908691406, + "logps/rejected": -42.54513931274414, + "loss": 0.2527, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8850929737091064, + "rewards/margins": 1.0567829608917236, + "rewards/rejected": 1.8283100128173828, + "step": 6288 + }, + { + "epoch": 1.02, + "learning_rate": 5.0749096727909406e-06, + "logits/chosen": -1.175723671913147, + "logits/rejected": -1.175723671913147, + "logps/chosen": -42.21745300292969, + "logps/rejected": -42.21745300292969, + "loss": 0.7423, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2958076000213623, + "rewards/margins": 0.0, + "rewards/rejected": 2.2958076000213623, + "step": 6289 + }, + { + "epoch": 1.02, + "learning_rate": 5.073595563769407e-06, + "logits/chosen": -1.5571798086166382, + "logits/rejected": -1.4468806982040405, + "logps/chosen": -111.46446990966797, + "logps/rejected": -49.58727264404297, + "loss": 0.756, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6480050086975098, + "rewards/margins": 1.102231740951538, + "rewards/rejected": 2.5457732677459717, + "step": 6290 + }, + { + "epoch": 1.02, + "learning_rate": 5.0722814496631155e-06, + "logits/chosen": -1.144241213798523, + "logits/rejected": -1.085968255996704, + "logps/chosen": -72.29057312011719, + "logps/rejected": -36.99059295654297, + "loss": 1.1886, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.411276251077652, + "rewards/margins": -1.3737664222717285, + "rewards/rejected": 1.785042643547058, + "step": 6291 + }, + { + "epoch": 1.02, + "learning_rate": 5.070967330562859e-06, + "logits/chosen": -1.0667389631271362, + "logits/rejected": -1.1021413803100586, + "logps/chosen": -102.18238830566406, + "logps/rejected": -52.46849060058594, + "loss": 0.3285, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.256463527679443, + "rewards/margins": 3.70135498046875, + "rewards/rejected": 1.555108666419983, + "step": 6292 + }, + { + "epoch": 1.02, + "learning_rate": 5.069653206559433e-06, + "logits/chosen": -1.0050010681152344, + "logits/rejected": -1.0094197988510132, + "logps/chosen": -52.42292022705078, + "logps/rejected": -66.86140441894531, + "loss": 0.7683, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4610633850097656, + "rewards/margins": -1.108778476715088, + "rewards/rejected": 3.5698418617248535, + "step": 6293 + }, + { + "epoch": 1.02, + "learning_rate": 5.068339077743629e-06, + "logits/chosen": -1.1167372465133667, + "logits/rejected": -0.8131564855575562, + "logps/chosen": -79.9830093383789, + "logps/rejected": -29.183780670166016, + "loss": 1.4065, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.744410991668701, + "rewards/margins": 3.576643466949463, + "rewards/rejected": 1.1677674055099487, + "step": 6294 + }, + { + "epoch": 1.02, + "learning_rate": 5.067024944206241e-06, + "logits/chosen": -1.1594473123550415, + "logits/rejected": -1.2022440433502197, + "logps/chosen": -48.629051208496094, + "logps/rejected": -50.17479705810547, + "loss": 0.2228, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.154985189437866, + "rewards/margins": 0.5820939540863037, + "rewards/rejected": 1.5728912353515625, + "step": 6295 + }, + { + "epoch": 1.02, + "learning_rate": 5.065710806038063e-06, + "logits/chosen": -1.1614619493484497, + "logits/rejected": -1.106365442276001, + "logps/chosen": -67.75935363769531, + "logps/rejected": -36.274017333984375, + "loss": 0.8711, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0470192432403564, + "rewards/margins": -1.4391791820526123, + "rewards/rejected": 3.4861984252929688, + "step": 6296 + }, + { + "epoch": 1.02, + "learning_rate": 5.064396663329891e-06, + "logits/chosen": -1.3323322534561157, + "logits/rejected": -1.3386801481246948, + "logps/chosen": -100.67984008789062, + "logps/rejected": -102.19529724121094, + "loss": 0.5178, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.845141649246216, + "rewards/margins": -0.4628875255584717, + "rewards/rejected": 3.3080291748046875, + "step": 6297 + }, + { + "epoch": 1.02, + "learning_rate": 5.06308251617252e-06, + "logits/chosen": -0.8260546326637268, + "logits/rejected": -0.8077254295349121, + "logps/chosen": -40.63931655883789, + "logps/rejected": -115.07173919677734, + "loss": 0.0807, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1931869983673096, + "rewards/margins": 1.9719948768615723, + "rewards/rejected": 0.2211921662092209, + "step": 6298 + }, + { + "epoch": 1.02, + "learning_rate": 5.0617683646567415e-06, + "logits/chosen": -1.0995662212371826, + "logits/rejected": -1.1800224781036377, + "logps/chosen": -52.33111572265625, + "logps/rejected": -76.22687530517578, + "loss": 1.0797, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4441055059432983, + "rewards/margins": -1.8497155904769897, + "rewards/rejected": 3.293821096420288, + "step": 6299 + }, + { + "epoch": 1.02, + "learning_rate": 5.0604542088733545e-06, + "logits/chosen": -1.126976728439331, + "logits/rejected": -1.1477136611938477, + "logps/chosen": -69.11475372314453, + "logps/rejected": -104.13343048095703, + "loss": 1.929, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9975426197052, + "rewards/margins": -3.5073273181915283, + "rewards/rejected": 6.5048699378967285, + "step": 6300 + }, + { + "epoch": 1.02, + "learning_rate": 5.059140048913152e-06, + "logits/chosen": -1.288878321647644, + "logits/rejected": -1.2556718587875366, + "logps/chosen": -215.91842651367188, + "logps/rejected": -126.9601058959961, + "loss": 0.3859, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.516735792160034, + "rewards/margins": -0.12875747680664062, + "rewards/rejected": 3.645493268966675, + "step": 6301 + }, + { + "epoch": 1.02, + "learning_rate": 5.057825884866935e-06, + "logits/chosen": -1.507132649421692, + "logits/rejected": -1.4428908824920654, + "logps/chosen": -45.55229949951172, + "logps/rejected": -46.56352233886719, + "loss": 1.0752, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5966484546661377, + "rewards/margins": 0.08100128173828125, + "rewards/rejected": 2.5156471729278564, + "step": 6302 + }, + { + "epoch": 1.02, + "learning_rate": 5.056511716825495e-06, + "logits/chosen": -1.083858847618103, + "logits/rejected": -1.1092621088027954, + "logps/chosen": -89.05716705322266, + "logps/rejected": -90.91473388671875, + "loss": 1.5967, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.661597490310669, + "rewards/margins": -3.1207635402679443, + "rewards/rejected": 4.782361030578613, + "step": 6303 + }, + { + "epoch": 1.02, + "learning_rate": 5.05519754487963e-06, + "logits/chosen": -1.2168256044387817, + "logits/rejected": -1.2048406600952148, + "logps/chosen": -266.7530517578125, + "logps/rejected": -52.50149154663086, + "loss": 0.0895, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.7857513427734375, + "rewards/margins": 2.4574198722839355, + "rewards/rejected": 2.328331470489502, + "step": 6304 + }, + { + "epoch": 1.02, + "learning_rate": 5.053883369120137e-06, + "logits/chosen": -1.3084357976913452, + "logits/rejected": -1.3128671646118164, + "logps/chosen": -35.12329864501953, + "logps/rejected": -30.86345672607422, + "loss": 0.2255, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1456565856933594, + "rewards/margins": 0.6372581124305725, + "rewards/rejected": 0.5083984732627869, + "step": 6305 + }, + { + "epoch": 1.02, + "learning_rate": 5.052569189637813e-06, + "logits/chosen": -1.4268265962600708, + "logits/rejected": -1.4542138576507568, + "logps/chosen": -39.23133850097656, + "logps/rejected": -60.95248031616211, + "loss": 0.5413, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.130430221557617, + "rewards/margins": -0.010929107666015625, + "rewards/rejected": 2.141359329223633, + "step": 6306 + }, + { + "epoch": 1.02, + "learning_rate": 5.051255006523455e-06, + "logits/chosen": -1.3817917108535767, + "logits/rejected": -1.186008095741272, + "logps/chosen": -130.7747344970703, + "logps/rejected": -38.671504974365234, + "loss": 0.0318, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.18567943572998, + "rewards/margins": 5.547907829284668, + "rewards/rejected": 2.6377713680267334, + "step": 6307 + }, + { + "epoch": 1.02, + "learning_rate": 5.049940819867862e-06, + "logits/chosen": -1.0515570640563965, + "logits/rejected": -1.0933550596237183, + "logps/chosen": -89.75349426269531, + "logps/rejected": -108.23326110839844, + "loss": 0.8789, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.827505588531494, + "rewards/margins": -1.447035312652588, + "rewards/rejected": 5.274540901184082, + "step": 6308 + }, + { + "epoch": 1.02, + "learning_rate": 5.04862662976183e-06, + "logits/chosen": -0.501968502998352, + "logits/rejected": -0.48347803950309753, + "logps/chosen": -4.455653190612793, + "logps/rejected": -3.9301953315734863, + "loss": 1.6272, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.041742898523807526, + "rewards/margins": -0.43930765986442566, + "rewards/rejected": 0.4810505509376526, + "step": 6309 + }, + { + "epoch": 1.02, + "learning_rate": 5.047312436296159e-06, + "logits/chosen": -0.956214427947998, + "logits/rejected": -0.956214427947998, + "logps/chosen": -1.676053762435913, + "logps/rejected": -1.676053762435913, + "loss": 0.5612, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2477937489748001, + "rewards/margins": 0.0, + "rewards/rejected": 0.2477937489748001, + "step": 6310 + }, + { + "epoch": 1.02, + "learning_rate": 5.045998239561646e-06, + "logits/chosen": -1.363676905632019, + "logits/rejected": -1.363676905632019, + "logps/chosen": -58.525821685791016, + "logps/rejected": -58.525821685791016, + "loss": 0.8439, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7113125324249268, + "rewards/margins": 0.0, + "rewards/rejected": 2.7113125324249268, + "step": 6311 + }, + { + "epoch": 1.02, + "learning_rate": 5.044684039649089e-06, + "logits/chosen": -1.2861295938491821, + "logits/rejected": -1.133000373840332, + "logps/chosen": -141.30340576171875, + "logps/rejected": -44.41448211669922, + "loss": 0.0185, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.371926784515381, + "rewards/margins": 3.7714972496032715, + "rewards/rejected": 2.6004295349121094, + "step": 6312 + }, + { + "epoch": 1.02, + "learning_rate": 5.043369836649289e-06, + "logits/chosen": -1.406844973564148, + "logits/rejected": -1.3801565170288086, + "logps/chosen": -79.74825286865234, + "logps/rejected": -103.97608947753906, + "loss": 0.5576, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.362146854400635, + "rewards/margins": -0.6486425399780273, + "rewards/rejected": 6.010789394378662, + "step": 6313 + }, + { + "epoch": 1.02, + "learning_rate": 5.042055630653042e-06, + "logits/chosen": -0.7754054069519043, + "logits/rejected": -0.8183736205101013, + "logps/chosen": -89.29359436035156, + "logps/rejected": -66.68359375, + "loss": 0.2608, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5051751136779785, + "rewards/margins": 0.8139488697052002, + "rewards/rejected": 1.6912262439727783, + "step": 6314 + }, + { + "epoch": 1.02, + "learning_rate": 5.040741421751151e-06, + "logits/chosen": -1.2008295059204102, + "logits/rejected": -1.3386294841766357, + "logps/chosen": -87.57011413574219, + "logps/rejected": -48.929481506347656, + "loss": 1.4209, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3365280628204346, + "rewards/margins": 0.744799017906189, + "rewards/rejected": 1.5917290449142456, + "step": 6315 + }, + { + "epoch": 1.03, + "learning_rate": 5.039427210034411e-06, + "logits/chosen": -1.552838683128357, + "logits/rejected": -1.6284763813018799, + "logps/chosen": -79.0789794921875, + "logps/rejected": -92.80416107177734, + "loss": 0.5382, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.933659315109253, + "rewards/margins": -0.2579994201660156, + "rewards/rejected": 3.1916587352752686, + "step": 6316 + }, + { + "epoch": 1.03, + "learning_rate": 5.038112995593626e-06, + "logits/chosen": -1.4236297607421875, + "logits/rejected": -1.1295958757400513, + "logps/chosen": -106.04348754882812, + "logps/rejected": -46.87678146362305, + "loss": 0.0257, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.985436916351318, + "rewards/margins": 6.184333801269531, + "rewards/rejected": -0.19889679551124573, + "step": 6317 + }, + { + "epoch": 1.03, + "learning_rate": 5.036798778519591e-06, + "logits/chosen": -1.2374327182769775, + "logits/rejected": -1.1780149936676025, + "logps/chosen": -68.76727294921875, + "logps/rejected": -63.30698776245117, + "loss": 0.1744, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.399761915206909, + "rewards/margins": 0.9242099523544312, + "rewards/rejected": 1.475551962852478, + "step": 6318 + }, + { + "epoch": 1.03, + "learning_rate": 5.035484558903111e-06, + "logits/chosen": -1.2885196208953857, + "logits/rejected": -1.2885196208953857, + "logps/chosen": -52.552345275878906, + "logps/rejected": -52.552345275878906, + "loss": 0.3644, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4446098804473877, + "rewards/margins": 0.0, + "rewards/rejected": 2.4446098804473877, + "step": 6319 + }, + { + "epoch": 1.03, + "learning_rate": 5.034170336834984e-06, + "logits/chosen": -1.5557276010513306, + "logits/rejected": -1.525246262550354, + "logps/chosen": -174.22171020507812, + "logps/rejected": -98.37307739257812, + "loss": 0.0282, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.480008125305176, + "rewards/margins": 3.367758274078369, + "rewards/rejected": 1.112249732017517, + "step": 6320 + }, + { + "epoch": 1.03, + "learning_rate": 5.03285611240601e-06, + "logits/chosen": -1.4259943962097168, + "logits/rejected": -1.2582478523254395, + "logps/chosen": -120.438720703125, + "logps/rejected": -70.78903198242188, + "loss": 0.478, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.905600070953369, + "rewards/margins": 0.6917970180511475, + "rewards/rejected": 2.2138030529022217, + "step": 6321 + }, + { + "epoch": 1.03, + "learning_rate": 5.031541885706987e-06, + "logits/chosen": -1.0397577285766602, + "logits/rejected": -1.0611164569854736, + "logps/chosen": -47.89768981933594, + "logps/rejected": -70.02664947509766, + "loss": 0.6491, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.384082078933716, + "rewards/margins": 0.46183550357818604, + "rewards/rejected": 1.9222465753555298, + "step": 6322 + }, + { + "epoch": 1.03, + "learning_rate": 5.03022765682872e-06, + "logits/chosen": -1.2805136442184448, + "logits/rejected": -1.3875670433044434, + "logps/chosen": -73.98211669921875, + "logps/rejected": -86.67662811279297, + "loss": 0.8354, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4222588539123535, + "rewards/margins": -0.5307807922363281, + "rewards/rejected": 3.9530396461486816, + "step": 6323 + }, + { + "epoch": 1.03, + "learning_rate": 5.028913425862008e-06, + "logits/chosen": -1.0288876295089722, + "logits/rejected": -1.0359266996383667, + "logps/chosen": -63.133697509765625, + "logps/rejected": -70.02462005615234, + "loss": 0.8565, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4625656604766846, + "rewards/margins": -0.3939208984375, + "rewards/rejected": 2.8564865589141846, + "step": 6324 + }, + { + "epoch": 1.03, + "learning_rate": 5.027599192897651e-06, + "logits/chosen": -1.4155253171920776, + "logits/rejected": -1.3441308736801147, + "logps/chosen": -42.18305206298828, + "logps/rejected": -30.926345825195312, + "loss": 0.3647, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.807326555252075, + "rewards/margins": 0.6836910247802734, + "rewards/rejected": 2.1236355304718018, + "step": 6325 + }, + { + "epoch": 1.03, + "learning_rate": 5.0262849580264515e-06, + "logits/chosen": -1.0337049961090088, + "logits/rejected": -1.0345237255096436, + "logps/chosen": -54.15137481689453, + "logps/rejected": -59.938987731933594, + "loss": 1.1688, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7370033264160156, + "rewards/margins": -1.3923346996307373, + "rewards/rejected": 3.129338026046753, + "step": 6326 + }, + { + "epoch": 1.03, + "learning_rate": 5.02497072133921e-06, + "logits/chosen": -1.2523548603057861, + "logits/rejected": -1.1242568492889404, + "logps/chosen": -63.71484375, + "logps/rejected": -30.287477493286133, + "loss": 0.8889, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6829795837402344, + "rewards/margins": 2.157785177230835, + "rewards/rejected": 0.5251943469047546, + "step": 6327 + }, + { + "epoch": 1.03, + "learning_rate": 5.023656482926728e-06, + "logits/chosen": -1.1869471073150635, + "logits/rejected": -1.2167484760284424, + "logps/chosen": -48.83526611328125, + "logps/rejected": -45.2512092590332, + "loss": 0.308, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2224349975585938, + "rewards/margins": 0.17860150337219238, + "rewards/rejected": 2.0438334941864014, + "step": 6328 + }, + { + "epoch": 1.03, + "learning_rate": 5.022342242879805e-06, + "logits/chosen": -1.113037109375, + "logits/rejected": -1.1615686416625977, + "logps/chosen": -55.33004379272461, + "logps/rejected": -91.51228332519531, + "loss": 0.6199, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5625674724578857, + "rewards/margins": -0.8935468196868896, + "rewards/rejected": 4.456114292144775, + "step": 6329 + }, + { + "epoch": 1.03, + "learning_rate": 5.021028001289247e-06, + "logits/chosen": -1.4190449714660645, + "logits/rejected": -1.4899247884750366, + "logps/chosen": -59.33890914916992, + "logps/rejected": -106.9004898071289, + "loss": 0.8958, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.82433819770813, + "rewards/margins": -1.605928659439087, + "rewards/rejected": 4.430266857147217, + "step": 6330 + }, + { + "epoch": 1.03, + "learning_rate": 5.019713758245852e-06, + "logits/chosen": -1.2710981369018555, + "logits/rejected": -1.3782944679260254, + "logps/chosen": -61.05109405517578, + "logps/rejected": -92.13440704345703, + "loss": 1.2108, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0475013256073, + "rewards/margins": -2.248255968093872, + "rewards/rejected": 5.295757293701172, + "step": 6331 + }, + { + "epoch": 1.03, + "learning_rate": 5.018399513840423e-06, + "logits/chosen": -0.6126735210418701, + "logits/rejected": -0.6666079759597778, + "logps/chosen": -3.650047779083252, + "logps/rejected": -64.62876892089844, + "loss": 1.2879, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.13425317406654358, + "rewards/margins": -1.569213628768921, + "rewards/rejected": 1.703466773033142, + "step": 6332 + }, + { + "epoch": 1.03, + "learning_rate": 5.017085268163761e-06, + "logits/chosen": -1.2519265413284302, + "logits/rejected": -1.2529077529907227, + "logps/chosen": -8.708573341369629, + "logps/rejected": -2.5224390029907227, + "loss": 0.3939, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2764132618904114, + "rewards/margins": -0.18101859092712402, + "rewards/rejected": 0.4574318528175354, + "step": 6333 + }, + { + "epoch": 1.03, + "learning_rate": 5.01577102130667e-06, + "logits/chosen": -0.9629882574081421, + "logits/rejected": -0.9743552207946777, + "logps/chosen": -77.3492431640625, + "logps/rejected": -91.3148422241211, + "loss": 0.1762, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1832261085510254, + "rewards/margins": 2.1161394119262695, + "rewards/rejected": 0.06708679348230362, + "step": 6334 + }, + { + "epoch": 1.03, + "learning_rate": 5.014456773359951e-06, + "logits/chosen": -1.1627967357635498, + "logits/rejected": -1.1641329526901245, + "logps/chosen": -59.632102966308594, + "logps/rejected": -52.84262466430664, + "loss": 2.6709, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.881933569908142, + "rewards/margins": -1.1526516675949097, + "rewards/rejected": 3.0345852375030518, + "step": 6335 + }, + { + "epoch": 1.03, + "learning_rate": 5.013142524414404e-06, + "logits/chosen": -0.9995459914207458, + "logits/rejected": -1.0209840536117554, + "logps/chosen": -74.29412078857422, + "logps/rejected": -92.06419372558594, + "loss": 0.3589, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.79730224609375, + "rewards/margins": -0.04597628116607666, + "rewards/rejected": 1.8432785272598267, + "step": 6336 + }, + { + "epoch": 1.03, + "learning_rate": 5.011828274560834e-06, + "logits/chosen": -1.1652674674987793, + "logits/rejected": -1.1626482009887695, + "logps/chosen": -54.467926025390625, + "logps/rejected": -88.285888671875, + "loss": 0.2733, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9892319440841675, + "rewards/margins": 0.3211357593536377, + "rewards/rejected": 1.6680961847305298, + "step": 6337 + }, + { + "epoch": 1.03, + "learning_rate": 5.0105140238900405e-06, + "logits/chosen": -1.3040778636932373, + "logits/rejected": -1.4738550186157227, + "logps/chosen": -97.3409194946289, + "logps/rejected": -33.67166519165039, + "loss": 0.5245, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.484076738357544, + "rewards/margins": 1.1566574573516846, + "rewards/rejected": 0.3274192810058594, + "step": 6338 + }, + { + "epoch": 1.03, + "learning_rate": 5.00919977249283e-06, + "logits/chosen": -1.5108885765075684, + "logits/rejected": -1.1013540029525757, + "logps/chosen": -111.50318908691406, + "logps/rejected": -23.97657012939453, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.455503940582275, + "rewards/margins": 5.789430618286133, + "rewards/rejected": 0.6660732626914978, + "step": 6339 + }, + { + "epoch": 1.03, + "learning_rate": 5.007885520460001e-06, + "logits/chosen": -1.2005155086517334, + "logits/rejected": -1.200697422027588, + "logps/chosen": -89.25859069824219, + "logps/rejected": -42.096832275390625, + "loss": 0.2246, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2896745204925537, + "rewards/margins": 0.5849015712738037, + "rewards/rejected": 1.70477294921875, + "step": 6340 + }, + { + "epoch": 1.03, + "learning_rate": 5.0065712678823565e-06, + "logits/chosen": -1.2488099336624146, + "logits/rejected": -1.2365635633468628, + "logps/chosen": -91.25477600097656, + "logps/rejected": -74.21206665039062, + "loss": 0.5906, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.714571475982666, + "rewards/margins": 1.8386573791503906, + "rewards/rejected": 0.8759140372276306, + "step": 6341 + }, + { + "epoch": 1.03, + "learning_rate": 5.005257014850701e-06, + "logits/chosen": -1.3873580694198608, + "logits/rejected": -1.3292789459228516, + "logps/chosen": -71.68843078613281, + "logps/rejected": -86.43321990966797, + "loss": 0.4012, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.498675584793091, + "rewards/margins": -0.1696326732635498, + "rewards/rejected": 3.6683082580566406, + "step": 6342 + }, + { + "epoch": 1.03, + "learning_rate": 5.003942761455834e-06, + "logits/chosen": -1.009577751159668, + "logits/rejected": -0.9921644926071167, + "logps/chosen": -65.952392578125, + "logps/rejected": -77.13969421386719, + "loss": 3.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8021697998046875, + "rewards/margins": 0.3639051914215088, + "rewards/rejected": 3.4382646083831787, + "step": 6343 + }, + { + "epoch": 1.03, + "learning_rate": 5.002628507788561e-06, + "logits/chosen": -1.373152494430542, + "logits/rejected": -1.3259669542312622, + "logps/chosen": -96.8844223022461, + "logps/rejected": -170.92709350585938, + "loss": 0.7616, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.500640392303467, + "rewards/margins": -1.2534732818603516, + "rewards/rejected": 7.754113674163818, + "step": 6344 + }, + { + "epoch": 1.03, + "learning_rate": 5.001314253939682e-06, + "logits/chosen": -1.0386030673980713, + "logits/rejected": -0.9663675427436829, + "logps/chosen": -57.62395477294922, + "logps/rejected": -63.399147033691406, + "loss": 0.4317, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4077552556991577, + "rewards/margins": -0.1288902759552002, + "rewards/rejected": 1.536645531654358, + "step": 6345 + }, + { + "epoch": 1.03, + "learning_rate": 5e-06, + "logits/chosen": -1.4563251733779907, + "logits/rejected": -1.3314800262451172, + "logps/chosen": -95.07258605957031, + "logps/rejected": -34.35120391845703, + "loss": 0.201, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2869179248809814, + "rewards/margins": 1.0965003967285156, + "rewards/rejected": 0.19041748344898224, + "step": 6346 + }, + { + "epoch": 1.03, + "learning_rate": 4.99868574606032e-06, + "logits/chosen": -0.9354365468025208, + "logits/rejected": -0.9868143796920776, + "logps/chosen": -42.7667236328125, + "logps/rejected": -112.50326538085938, + "loss": 0.1485, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.923577070236206, + "rewards/margins": 2.1329360008239746, + "rewards/rejected": 0.7906410098075867, + "step": 6347 + }, + { + "epoch": 1.03, + "learning_rate": 4.997371492211441e-06, + "logits/chosen": -1.4400289058685303, + "logits/rejected": -1.4556992053985596, + "logps/chosen": -32.05693435668945, + "logps/rejected": -12.142678260803223, + "loss": 0.8669, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2997806668281555, + "rewards/margins": -0.3657439351081848, + "rewards/rejected": 0.6655246019363403, + "step": 6348 + }, + { + "epoch": 1.03, + "learning_rate": 4.996057238544167e-06, + "logits/chosen": -1.4196337461471558, + "logits/rejected": -1.4501842260360718, + "logps/chosen": -63.40534210205078, + "logps/rejected": -119.64642333984375, + "loss": 0.2678, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.29656982421875, + "rewards/margins": 0.4073059558868408, + "rewards/rejected": 2.889263868331909, + "step": 6349 + }, + { + "epoch": 1.03, + "learning_rate": 4.994742985149301e-06, + "logits/chosen": -1.2042187452316284, + "logits/rejected": -1.3182073831558228, + "logps/chosen": -57.360191345214844, + "logps/rejected": -148.87570190429688, + "loss": 0.983, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.422628164291382, + "rewards/margins": -1.8002402782440186, + "rewards/rejected": 5.2228684425354, + "step": 6350 + }, + { + "epoch": 1.03, + "learning_rate": 4.993428732117644e-06, + "logits/chosen": -1.348912000656128, + "logits/rejected": -1.1302425861358643, + "logps/chosen": -78.40300750732422, + "logps/rejected": -53.813865661621094, + "loss": 0.0629, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.50970983505249, + "rewards/margins": 2.76615571975708, + "rewards/rejected": 4.74355411529541, + "step": 6351 + }, + { + "epoch": 1.03, + "learning_rate": 4.992114479540001e-06, + "logits/chosen": -1.6066449880599976, + "logits/rejected": -1.575798749923706, + "logps/chosen": -61.58522033691406, + "logps/rejected": -93.25361633300781, + "loss": 0.5496, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7730462551116943, + "rewards/margins": -0.4139411449432373, + "rewards/rejected": 3.1869874000549316, + "step": 6352 + }, + { + "epoch": 1.03, + "learning_rate": 4.990800227507172e-06, + "logits/chosen": -1.0568146705627441, + "logits/rejected": -1.0615463256835938, + "logps/chosen": -42.36473846435547, + "logps/rejected": -95.06605529785156, + "loss": 0.296, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2204551696777344, + "rewards/margins": 0.5976310968399048, + "rewards/rejected": 1.6228240728378296, + "step": 6353 + }, + { + "epoch": 1.03, + "learning_rate": 4.98948597610996e-06, + "logits/chosen": -1.513921856880188, + "logits/rejected": -1.5242213010787964, + "logps/chosen": -83.49081420898438, + "logps/rejected": -92.65443420410156, + "loss": 0.522, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.248126268386841, + "rewards/margins": -0.5092666149139404, + "rewards/rejected": 3.7573928833007812, + "step": 6354 + }, + { + "epoch": 1.03, + "learning_rate": 4.9881717254391686e-06, + "logits/chosen": -0.9838132858276367, + "logits/rejected": -0.9472903609275818, + "logps/chosen": -37.286502838134766, + "logps/rejected": -59.35047912597656, + "loss": 0.1727, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.877946138381958, + "rewards/margins": 0.9996150732040405, + "rewards/rejected": 1.8783310651779175, + "step": 6355 + }, + { + "epoch": 1.03, + "learning_rate": 4.9868574755855976e-06, + "logits/chosen": -0.9986622929573059, + "logits/rejected": -1.001342535018921, + "logps/chosen": -29.458274841308594, + "logps/rejected": -105.08660888671875, + "loss": 0.4323, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.722730278968811, + "rewards/margins": -0.30205535888671875, + "rewards/rejected": 1.0247856378555298, + "step": 6356 + }, + { + "epoch": 1.03, + "learning_rate": 4.985543226640052e-06, + "logits/chosen": -1.251750111579895, + "logits/rejected": -1.2295883893966675, + "logps/chosen": -75.06289672851562, + "logps/rejected": -53.28390884399414, + "loss": 0.3294, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.112082004547119, + "rewards/margins": 0.6006290912628174, + "rewards/rejected": 2.5114529132843018, + "step": 6357 + }, + { + "epoch": 1.03, + "learning_rate": 4.9842289786933316e-06, + "logits/chosen": -1.272129774093628, + "logits/rejected": -1.280219316482544, + "logps/chosen": -78.07318878173828, + "logps/rejected": -59.93962860107422, + "loss": 1.3702, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.412837266921997, + "rewards/margins": -0.631289005279541, + "rewards/rejected": 3.044126272201538, + "step": 6358 + }, + { + "epoch": 1.03, + "learning_rate": 4.9829147318362396e-06, + "logits/chosen": -1.407793641090393, + "logits/rejected": -1.440316915512085, + "logps/chosen": -51.77906036376953, + "logps/rejected": -76.09687805175781, + "loss": 0.1632, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3826851844787598, + "rewards/margins": 1.0176734924316406, + "rewards/rejected": 2.365011692047119, + "step": 6359 + }, + { + "epoch": 1.03, + "learning_rate": 4.981600486159579e-06, + "logits/chosen": -1.429051160812378, + "logits/rejected": -1.4940392971038818, + "logps/chosen": -73.0869140625, + "logps/rejected": -65.06278991699219, + "loss": 0.401, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.9660019874572754, + "rewards/margins": -0.13364553451538086, + "rewards/rejected": 4.099647521972656, + "step": 6360 + }, + { + "epoch": 1.03, + "learning_rate": 4.98028624175415e-06, + "logits/chosen": -1.4384112358093262, + "logits/rejected": -1.3258990049362183, + "logps/chosen": -102.22154235839844, + "logps/rejected": -140.20343017578125, + "loss": 0.2304, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.776097297668457, + "rewards/margins": 2.2156448364257812, + "rewards/rejected": 5.560452461242676, + "step": 6361 + }, + { + "epoch": 1.03, + "learning_rate": 4.978971998710755e-06, + "logits/chosen": -1.4290786981582642, + "logits/rejected": -1.1853402853012085, + "logps/chosen": -82.85641479492188, + "logps/rejected": -33.03300094604492, + "loss": 0.0677, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.0120849609375, + "rewards/margins": 4.453438758850098, + "rewards/rejected": 0.558646023273468, + "step": 6362 + }, + { + "epoch": 1.03, + "learning_rate": 4.977657757120196e-06, + "logits/chosen": -0.8759573101997375, + "logits/rejected": -0.8360834717750549, + "logps/chosen": -91.93170166015625, + "logps/rejected": -63.03668975830078, + "loss": 0.4043, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8822983503341675, + "rewards/margins": 0.5402138233184814, + "rewards/rejected": 1.342084527015686, + "step": 6363 + }, + { + "epoch": 1.03, + "learning_rate": 4.976343517073274e-06, + "logits/chosen": -1.7393131256103516, + "logits/rejected": -1.691481590270996, + "logps/chosen": -65.40629577636719, + "logps/rejected": -79.14096069335938, + "loss": 0.367, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6654136180877686, + "rewards/margins": 1.7295714616775513, + "rewards/rejected": 0.9358421564102173, + "step": 6364 + }, + { + "epoch": 1.03, + "learning_rate": 4.975029278660792e-06, + "logits/chosen": -1.6219267845153809, + "logits/rejected": -1.6622949838638306, + "logps/chosen": -50.04636764526367, + "logps/rejected": -72.84672546386719, + "loss": 2.3051, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6024022102355957, + "rewards/margins": -2.0171117782592773, + "rewards/rejected": 4.619513988494873, + "step": 6365 + }, + { + "epoch": 1.03, + "learning_rate": 4.97371504197355e-06, + "logits/chosen": -1.0415701866149902, + "logits/rejected": -0.8657609820365906, + "logps/chosen": -67.85855102539062, + "logps/rejected": -30.33416175842285, + "loss": 0.1078, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.36521315574646, + "rewards/margins": 3.102808713912964, + "rewards/rejected": -0.7375955581665039, + "step": 6366 + }, + { + "epoch": 1.03, + "learning_rate": 4.9724008071023505e-06, + "logits/chosen": -1.3070273399353027, + "logits/rejected": -1.3089576959609985, + "logps/chosen": -64.03889465332031, + "logps/rejected": -115.255615234375, + "loss": 0.3485, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9250199794769287, + "rewards/margins": 2.0416741371154785, + "rewards/rejected": 0.8833457827568054, + "step": 6367 + }, + { + "epoch": 1.03, + "learning_rate": 4.971086574137994e-06, + "logits/chosen": -1.1813193559646606, + "logits/rejected": -1.0169223546981812, + "logps/chosen": -45.50050354003906, + "logps/rejected": -15.624504089355469, + "loss": 0.392, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1557540893554688, + "rewards/margins": 2.3905351161956787, + "rewards/rejected": 0.7652189135551453, + "step": 6368 + }, + { + "epoch": 1.03, + "learning_rate": 4.9697723431712815e-06, + "logits/chosen": -1.3107881546020508, + "logits/rejected": -1.346708059310913, + "logps/chosen": -38.36480712890625, + "logps/rejected": -54.54571533203125, + "loss": 0.777, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.544084310531616, + "rewards/margins": -0.07502126693725586, + "rewards/rejected": 2.619105577468872, + "step": 6369 + }, + { + "epoch": 1.03, + "learning_rate": 4.9684581142930135e-06, + "logits/chosen": -1.2292660474777222, + "logits/rejected": -1.0140141248703003, + "logps/chosen": -84.04463195800781, + "logps/rejected": -11.093196868896484, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.370744228363037, + "rewards/margins": 3.4481935501098633, + "rewards/rejected": 0.922550618648529, + "step": 6370 + }, + { + "epoch": 1.03, + "learning_rate": 4.967143887593993e-06, + "logits/chosen": -1.1535472869873047, + "logits/rejected": -1.0907344818115234, + "logps/chosen": -53.68348693847656, + "logps/rejected": -92.71258544921875, + "loss": 0.1776, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8299224376678467, + "rewards/margins": 1.0880576372146606, + "rewards/rejected": 1.741864800453186, + "step": 6371 + }, + { + "epoch": 1.03, + "learning_rate": 4.965829663165017e-06, + "logits/chosen": -1.2480891942977905, + "logits/rejected": -1.067339301109314, + "logps/chosen": -94.28654479980469, + "logps/rejected": -13.917854309082031, + "loss": 1.2754, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3363068103790283, + "rewards/margins": 1.552353858947754, + "rewards/rejected": 0.7839528918266296, + "step": 6372 + }, + { + "epoch": 1.03, + "learning_rate": 4.96451544109689e-06, + "logits/chosen": -1.1341959238052368, + "logits/rejected": -1.1341959238052368, + "logps/chosen": -79.61389923095703, + "logps/rejected": -79.61389923095703, + "loss": 0.3688, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5335838794708252, + "rewards/margins": 0.0, + "rewards/rejected": 1.5335838794708252, + "step": 6373 + }, + { + "epoch": 1.03, + "learning_rate": 4.9632012214804086e-06, + "logits/chosen": -1.0085806846618652, + "logits/rejected": -0.9948838353157043, + "logps/chosen": -74.99287414550781, + "logps/rejected": -43.6194953918457, + "loss": 0.4859, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.195443868637085, + "rewards/margins": -0.12827110290527344, + "rewards/rejected": 2.3237149715423584, + "step": 6374 + }, + { + "epoch": 1.03, + "learning_rate": 4.961887004406375e-06, + "logits/chosen": -0.8395252823829651, + "logits/rejected": -0.8862668871879578, + "logps/chosen": -42.311946868896484, + "logps/rejected": -103.1646728515625, + "loss": 0.4359, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8859822750091553, + "rewards/margins": 1.588666319847107, + "rewards/rejected": 1.2973159551620483, + "step": 6375 + }, + { + "epoch": 1.03, + "learning_rate": 4.960572789965589e-06, + "logits/chosen": -1.167694330215454, + "logits/rejected": -1.2001358270645142, + "logps/chosen": -110.09053039550781, + "logps/rejected": -122.8451919555664, + "loss": 0.2314, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.516406297683716, + "rewards/margins": 0.5328986644744873, + "rewards/rejected": 2.9835076332092285, + "step": 6376 + }, + { + "epoch": 1.04, + "learning_rate": 4.95925857824885e-06, + "logits/chosen": -1.266443133354187, + "logits/rejected": -1.152579665184021, + "logps/chosen": -94.37538146972656, + "logps/rejected": -67.80410766601562, + "loss": 0.424, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.588641405105591, + "rewards/margins": -0.10285425186157227, + "rewards/rejected": 3.691495656967163, + "step": 6377 + }, + { + "epoch": 1.04, + "learning_rate": 4.957944369346957e-06, + "logits/chosen": -0.9390235543251038, + "logits/rejected": -0.9384509921073914, + "logps/chosen": -4.5723371505737305, + "logps/rejected": -12.251847267150879, + "loss": 0.6242, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2223213165998459, + "rewards/margins": -0.10972337424755096, + "rewards/rejected": 0.33204469084739685, + "step": 6378 + }, + { + "epoch": 1.04, + "learning_rate": 4.956630163350712e-06, + "logits/chosen": -1.3547394275665283, + "logits/rejected": -1.3642306327819824, + "logps/chosen": -61.771854400634766, + "logps/rejected": -80.82084655761719, + "loss": 0.4744, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6565029621124268, + "rewards/margins": -0.020967483520507812, + "rewards/rejected": 2.6774704456329346, + "step": 6379 + }, + { + "epoch": 1.04, + "learning_rate": 4.955315960350911e-06, + "logits/chosen": -1.4555200338363647, + "logits/rejected": -1.4153164625167847, + "logps/chosen": -57.88528060913086, + "logps/rejected": -205.569091796875, + "loss": 4.4569, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.564334392547607, + "rewards/margins": -6.214697360992432, + "rewards/rejected": 10.779031753540039, + "step": 6380 + }, + { + "epoch": 1.04, + "learning_rate": 4.954001760438355e-06, + "logits/chosen": -1.164387583732605, + "logits/rejected": -1.1504474878311157, + "logps/chosen": -45.286216735839844, + "logps/rejected": -39.49913024902344, + "loss": 0.7262, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8122150897979736, + "rewards/margins": -0.5878016948699951, + "rewards/rejected": 2.4000167846679688, + "step": 6381 + }, + { + "epoch": 1.04, + "learning_rate": 4.952687563703841e-06, + "logits/chosen": -0.9836465716362, + "logits/rejected": -0.9915199279785156, + "logps/chosen": -4.373262405395508, + "logps/rejected": -23.799835205078125, + "loss": 1.6288, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38422393798828125, + "rewards/margins": 0.36815279722213745, + "rewards/rejected": 0.016071129590272903, + "step": 6382 + }, + { + "epoch": 1.04, + "learning_rate": 4.95137337023817e-06, + "logits/chosen": -1.273857831954956, + "logits/rejected": -1.2710853815078735, + "logps/chosen": -42.017974853515625, + "logps/rejected": -56.97435760498047, + "loss": 0.9565, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.044358015060425, + "rewards/margins": -0.09001398086547852, + "rewards/rejected": 2.1343719959259033, + "step": 6383 + }, + { + "epoch": 1.04, + "learning_rate": 4.950059180132139e-06, + "logits/chosen": -0.9049392342567444, + "logits/rejected": -0.8584625720977783, + "logps/chosen": -67.05035400390625, + "logps/rejected": -95.66480255126953, + "loss": 0.7798, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9268646240234375, + "rewards/margins": 0.33772802352905273, + "rewards/rejected": 2.5891366004943848, + "step": 6384 + }, + { + "epoch": 1.04, + "learning_rate": 4.948744993476545e-06, + "logits/chosen": -1.312499761581421, + "logits/rejected": -1.339273452758789, + "logps/chosen": -109.36688995361328, + "logps/rejected": -158.30731201171875, + "loss": 0.5709, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6747688055038452, + "rewards/margins": -0.6908837556838989, + "rewards/rejected": 2.365652561187744, + "step": 6385 + }, + { + "epoch": 1.04, + "learning_rate": 4.947430810362188e-06, + "logits/chosen": -1.194841742515564, + "logits/rejected": -1.194841742515564, + "logps/chosen": -44.449363708496094, + "logps/rejected": -44.449363708496094, + "loss": 0.5373, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6805999279022217, + "rewards/margins": 0.0, + "rewards/rejected": 3.6805999279022217, + "step": 6386 + }, + { + "epoch": 1.04, + "learning_rate": 4.9461166308798635e-06, + "logits/chosen": -1.1159552335739136, + "logits/rejected": -1.2067029476165771, + "logps/chosen": -37.97612762451172, + "logps/rejected": -104.88829040527344, + "loss": 0.9678, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7227470874786377, + "rewards/margins": -0.39005351066589355, + "rewards/rejected": 3.1128005981445312, + "step": 6387 + }, + { + "epoch": 1.04, + "learning_rate": 4.944802455120371e-06, + "logits/chosen": -1.1287320852279663, + "logits/rejected": -1.1200735569000244, + "logps/chosen": -71.8330078125, + "logps/rejected": -78.77710723876953, + "loss": 0.3689, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.490460157394409, + "rewards/margins": 0.897685170173645, + "rewards/rejected": 1.5927749872207642, + "step": 6388 + }, + { + "epoch": 1.04, + "learning_rate": 4.943488283174506e-06, + "logits/chosen": -1.0047118663787842, + "logits/rejected": -1.0063221454620361, + "logps/chosen": -1.7632499933242798, + "logps/rejected": -1.0945253372192383, + "loss": 0.4218, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.20136995613574982, + "rewards/margins": -0.0179901123046875, + "rewards/rejected": 0.21936006844043732, + "step": 6389 + }, + { + "epoch": 1.04, + "learning_rate": 4.942174115133066e-06, + "logits/chosen": -0.6093366742134094, + "logits/rejected": -0.6093366742134094, + "logps/chosen": -1.4504618644714355, + "logps/rejected": -1.4504618644714355, + "loss": 0.575, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.20810504257678986, + "rewards/margins": 0.0, + "rewards/rejected": 0.20810504257678986, + "step": 6390 + }, + { + "epoch": 1.04, + "learning_rate": 4.940859951086847e-06, + "logits/chosen": -1.2833665609359741, + "logits/rejected": -1.2193431854248047, + "logps/chosen": -96.37840270996094, + "logps/rejected": -161.5882568359375, + "loss": 0.5712, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9408546686172485, + "rewards/margins": -0.6308127641677856, + "rewards/rejected": 2.571667432785034, + "step": 6391 + }, + { + "epoch": 1.04, + "learning_rate": 4.939545791126646e-06, + "logits/chosen": -0.9270504117012024, + "logits/rejected": -0.8452844023704529, + "logps/chosen": -74.68537902832031, + "logps/rejected": -73.58484649658203, + "loss": 0.5536, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8390350341796875, + "rewards/margins": -0.699676513671875, + "rewards/rejected": 3.5387115478515625, + "step": 6392 + }, + { + "epoch": 1.04, + "learning_rate": 4.938231635343259e-06, + "logits/chosen": -1.4936448335647583, + "logits/rejected": -1.2821446657180786, + "logps/chosen": -103.63034057617188, + "logps/rejected": -53.05830001831055, + "loss": 0.4234, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.310229778289795, + "rewards/margins": 3.7625646591186523, + "rewards/rejected": 3.5476651191711426, + "step": 6393 + }, + { + "epoch": 1.04, + "learning_rate": 4.936917483827483e-06, + "logits/chosen": -1.4043771028518677, + "logits/rejected": -1.4035497903823853, + "logps/chosen": -93.03233337402344, + "logps/rejected": -87.4693603515625, + "loss": 0.4589, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.876302480697632, + "rewards/margins": 0.3833038806915283, + "rewards/rejected": 2.4929986000061035, + "step": 6394 + }, + { + "epoch": 1.04, + "learning_rate": 4.93560333667011e-06, + "logits/chosen": -1.3436858654022217, + "logits/rejected": -1.3223060369491577, + "logps/chosen": -35.344200134277344, + "logps/rejected": -48.14878845214844, + "loss": 0.6946, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7592872381210327, + "rewards/margins": -0.9430238008499146, + "rewards/rejected": 2.7023110389709473, + "step": 6395 + }, + { + "epoch": 1.04, + "learning_rate": 4.9342891939619385e-06, + "logits/chosen": -1.398354411125183, + "logits/rejected": -1.4427614212036133, + "logps/chosen": -88.90804290771484, + "logps/rejected": -84.27552795410156, + "loss": 0.247, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.398270606994629, + "rewards/margins": 0.5585794448852539, + "rewards/rejected": 5.839691162109375, + "step": 6396 + }, + { + "epoch": 1.04, + "learning_rate": 4.93297505579376e-06, + "logits/chosen": -1.5357649326324463, + "logits/rejected": -1.427978754043579, + "logps/chosen": -83.0880126953125, + "logps/rejected": -20.61776351928711, + "loss": 1.0475, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8520538806915283, + "rewards/margins": 0.7890675067901611, + "rewards/rejected": 1.0629863739013672, + "step": 6397 + }, + { + "epoch": 1.04, + "learning_rate": 4.931660922256372e-06, + "logits/chosen": -1.411819338798523, + "logits/rejected": -1.3438390493392944, + "logps/chosen": -67.50157165527344, + "logps/rejected": -72.17344665527344, + "loss": 0.2021, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3326187133789062, + "rewards/margins": 0.7605040073394775, + "rewards/rejected": 2.5721147060394287, + "step": 6398 + }, + { + "epoch": 1.04, + "learning_rate": 4.930346793440569e-06, + "logits/chosen": -1.209195852279663, + "logits/rejected": -1.2357609272003174, + "logps/chosen": -43.977046966552734, + "logps/rejected": -42.978572845458984, + "loss": 0.6811, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7095203399658203, + "rewards/margins": -0.11852431297302246, + "rewards/rejected": 2.8280446529388428, + "step": 6399 + }, + { + "epoch": 1.04, + "learning_rate": 4.929032669437142e-06, + "logits/chosen": -0.728525698184967, + "logits/rejected": -0.7316199541091919, + "logps/chosen": -6.748132705688477, + "logps/rejected": -2.569739818572998, + "loss": 1.0916, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.25327712297439575, + "rewards/margins": -0.18467211723327637, + "rewards/rejected": 0.4379492402076721, + "step": 6400 + }, + { + "epoch": 1.04, + "learning_rate": 4.927718550336887e-06, + "logits/chosen": -1.281011939048767, + "logits/rejected": -1.3259105682373047, + "logps/chosen": -82.61763000488281, + "logps/rejected": -113.68953704833984, + "loss": 0.5638, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.448988437652588, + "rewards/margins": -0.7351951599121094, + "rewards/rejected": 3.1841835975646973, + "step": 6401 + }, + { + "epoch": 1.04, + "learning_rate": 4.926404436230596e-06, + "logits/chosen": -1.5051697492599487, + "logits/rejected": -1.3424804210662842, + "logps/chosen": -90.57449340820312, + "logps/rejected": -35.930335998535156, + "loss": 0.3046, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8950966596603394, + "rewards/margins": 0.3336963653564453, + "rewards/rejected": 1.561400294303894, + "step": 6402 + }, + { + "epoch": 1.04, + "learning_rate": 4.925090327209062e-06, + "logits/chosen": -1.1696531772613525, + "logits/rejected": -1.0361522436141968, + "logps/chosen": -133.04135131835938, + "logps/rejected": -66.00437927246094, + "loss": 0.2618, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.989245891571045, + "rewards/margins": 0.5802555084228516, + "rewards/rejected": 4.408990383148193, + "step": 6403 + }, + { + "epoch": 1.04, + "learning_rate": 4.9237762233630765e-06, + "logits/chosen": -1.3112752437591553, + "logits/rejected": -1.2899656295776367, + "logps/chosen": -47.69268035888672, + "logps/rejected": -67.85830688476562, + "loss": 0.4454, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5433387756347656, + "rewards/margins": 0.3979828357696533, + "rewards/rejected": 2.1453559398651123, + "step": 6404 + }, + { + "epoch": 1.04, + "learning_rate": 4.922462124783434e-06, + "logits/chosen": -1.1045955419540405, + "logits/rejected": -1.1401681900024414, + "logps/chosen": -51.444114685058594, + "logps/rejected": -85.27864837646484, + "loss": 0.2966, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3027763366699219, + "rewards/margins": 0.4057067632675171, + "rewards/rejected": 0.8970695734024048, + "step": 6405 + }, + { + "epoch": 1.04, + "learning_rate": 4.921148031560924e-06, + "logits/chosen": -1.2460991144180298, + "logits/rejected": -1.1686429977416992, + "logps/chosen": -98.34400939941406, + "logps/rejected": -47.98957061767578, + "loss": 0.1165, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.242500305175781, + "rewards/margins": 1.6603446006774902, + "rewards/rejected": 3.582155704498291, + "step": 6406 + }, + { + "epoch": 1.04, + "learning_rate": 4.9198339437863395e-06, + "logits/chosen": -0.9016326665878296, + "logits/rejected": -0.8882541060447693, + "logps/chosen": -13.170502662658691, + "logps/rejected": -13.375904083251953, + "loss": 1.2139, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7914614677429199, + "rewards/margins": -0.0058089494705200195, + "rewards/rejected": 0.7972704172134399, + "step": 6407 + }, + { + "epoch": 1.04, + "learning_rate": 4.918519861550471e-06, + "logits/chosen": -1.1876798868179321, + "logits/rejected": -1.1934003829956055, + "logps/chosen": -137.61163330078125, + "logps/rejected": -93.21601867675781, + "loss": 0.0189, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.439538478851318, + "rewards/margins": 3.2834320068359375, + "rewards/rejected": 1.1561065912246704, + "step": 6408 + }, + { + "epoch": 1.04, + "learning_rate": 4.917205784944109e-06, + "logits/chosen": -1.2832087278366089, + "logits/rejected": -1.0990798473358154, + "logps/chosen": -61.32502365112305, + "logps/rejected": -15.366750717163086, + "loss": 0.1261, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.758805513381958, + "rewards/margins": 1.8480520248413086, + "rewards/rejected": 0.9107534289360046, + "step": 6409 + }, + { + "epoch": 1.04, + "learning_rate": 4.915891714058044e-06, + "logits/chosen": -0.8743658661842346, + "logits/rejected": -0.7513829469680786, + "logps/chosen": -64.09636688232422, + "logps/rejected": -53.57032775878906, + "loss": 0.3388, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.30008864402771, + "rewards/margins": 1.7075109481811523, + "rewards/rejected": 0.5925777554512024, + "step": 6410 + }, + { + "epoch": 1.04, + "learning_rate": 4.914577648983065e-06, + "logits/chosen": -1.2875829935073853, + "logits/rejected": -1.2717103958129883, + "logps/chosen": -70.74818420410156, + "logps/rejected": -57.37371826171875, + "loss": 1.2977, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.52679443359375, + "rewards/margins": -2.3535523414611816, + "rewards/rejected": 4.880346775054932, + "step": 6411 + }, + { + "epoch": 1.04, + "learning_rate": 4.9132635898099625e-06, + "logits/chosen": -1.698145866394043, + "logits/rejected": -1.6658124923706055, + "logps/chosen": -54.72465515136719, + "logps/rejected": -19.931034088134766, + "loss": 0.2588, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6189597845077515, + "rewards/margins": 0.44796276092529297, + "rewards/rejected": 1.1709970235824585, + "step": 6412 + }, + { + "epoch": 1.04, + "learning_rate": 4.911949536629526e-06, + "logits/chosen": -1.0324327945709229, + "logits/rejected": -0.8532871007919312, + "logps/chosen": -100.23682403564453, + "logps/rejected": -63.26426696777344, + "loss": 0.1175, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.108745574951172, + "rewards/margins": 3.0646650791168213, + "rewards/rejected": 3.0440804958343506, + "step": 6413 + }, + { + "epoch": 1.04, + "learning_rate": 4.910635489532543e-06, + "logits/chosen": -1.4417306184768677, + "logits/rejected": -1.4522095918655396, + "logps/chosen": -96.22669219970703, + "logps/rejected": -127.13723754882812, + "loss": 0.1988, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.3168511390686035, + "rewards/margins": 1.8411993980407715, + "rewards/rejected": 5.475651741027832, + "step": 6414 + }, + { + "epoch": 1.04, + "learning_rate": 4.9093214486098015e-06, + "logits/chosen": -1.421788215637207, + "logits/rejected": -1.4743815660476685, + "logps/chosen": -153.04620361328125, + "logps/rejected": -191.60540771484375, + "loss": 0.0469, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.055737495422363, + "rewards/margins": 3.343982219696045, + "rewards/rejected": 7.711755275726318, + "step": 6415 + }, + { + "epoch": 1.04, + "learning_rate": 4.90800741395209e-06, + "logits/chosen": -1.2021594047546387, + "logits/rejected": -1.212254524230957, + "logps/chosen": -182.284912109375, + "logps/rejected": -45.62554168701172, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.149164199829102, + "rewards/margins": 6.316401958465576, + "rewards/rejected": 1.8327621221542358, + "step": 6416 + }, + { + "epoch": 1.04, + "learning_rate": 4.906693385650196e-06, + "logits/chosen": -1.2937147617340088, + "logits/rejected": -1.1973282098770142, + "logps/chosen": -51.71874237060547, + "logps/rejected": -63.038490295410156, + "loss": 0.5257, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3620498180389404, + "rewards/margins": 2.094305992126465, + "rewards/rejected": 1.267743706703186, + "step": 6417 + }, + { + "epoch": 1.04, + "learning_rate": 4.905379363794907e-06, + "logits/chosen": -1.2662038803100586, + "logits/rejected": -0.7339069843292236, + "logps/chosen": -174.194580078125, + "logps/rejected": -130.53640747070312, + "loss": 0.1027, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.68313455581665, + "rewards/margins": 1.562086582183838, + "rewards/rejected": 5.1210479736328125, + "step": 6418 + }, + { + "epoch": 1.04, + "learning_rate": 4.904065348477008e-06, + "logits/chosen": -1.4134310483932495, + "logits/rejected": -1.3686628341674805, + "logps/chosen": -87.28538513183594, + "logps/rejected": -81.59735107421875, + "loss": 0.1421, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9566986560821533, + "rewards/margins": 1.8290618658065796, + "rewards/rejected": 1.1276367902755737, + "step": 6419 + }, + { + "epoch": 1.04, + "learning_rate": 4.902751339787284e-06, + "logits/chosen": -1.0425872802734375, + "logits/rejected": -1.0575027465820312, + "logps/chosen": -54.50847625732422, + "logps/rejected": -59.86210632324219, + "loss": 0.4019, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1456964015960693, + "rewards/margins": -0.13916683197021484, + "rewards/rejected": 2.284863233566284, + "step": 6420 + }, + { + "epoch": 1.04, + "learning_rate": 4.901437337816523e-06, + "logits/chosen": -1.2562687397003174, + "logits/rejected": -1.228994607925415, + "logps/chosen": -61.09925079345703, + "logps/rejected": -73.44676208496094, + "loss": 0.4615, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6790642738342285, + "rewards/margins": 0.7484017610549927, + "rewards/rejected": 1.9306625127792358, + "step": 6421 + }, + { + "epoch": 1.04, + "learning_rate": 4.900123342655511e-06, + "logits/chosen": -1.2772618532180786, + "logits/rejected": -1.245374083518982, + "logps/chosen": -47.769500732421875, + "logps/rejected": -15.163991928100586, + "loss": 0.2331, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8213752508163452, + "rewards/margins": 1.1182832717895508, + "rewards/rejected": 0.7030920386314392, + "step": 6422 + }, + { + "epoch": 1.04, + "learning_rate": 4.89880935439503e-06, + "logits/chosen": -1.3025833368301392, + "logits/rejected": -1.3025833368301392, + "logps/chosen": -46.938392639160156, + "logps/rejected": -46.938392639160156, + "loss": 0.3507, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.017598628997803, + "rewards/margins": 0.0, + "rewards/rejected": 6.017598628997803, + "step": 6423 + }, + { + "epoch": 1.04, + "learning_rate": 4.897495373125866e-06, + "logits/chosen": -1.550832986831665, + "logits/rejected": -1.4635393619537354, + "logps/chosen": -112.62576293945312, + "logps/rejected": -46.78352355957031, + "loss": 0.3075, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0457520484924316, + "rewards/margins": 0.25425493717193604, + "rewards/rejected": 1.7914971113204956, + "step": 6424 + }, + { + "epoch": 1.04, + "learning_rate": 4.896181398938801e-06, + "logits/chosen": -1.6078720092773438, + "logits/rejected": -1.587927222251892, + "logps/chosen": -51.40663146972656, + "logps/rejected": -57.35289001464844, + "loss": 0.302, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9698944091796875, + "rewards/margins": 1.0898146629333496, + "rewards/rejected": 2.880079746246338, + "step": 6425 + }, + { + "epoch": 1.04, + "learning_rate": 4.894867431924618e-06, + "logits/chosen": -0.9224948883056641, + "logits/rejected": -0.9358383417129517, + "logps/chosen": -53.9130859375, + "logps/rejected": -78.94862365722656, + "loss": 0.9115, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6056114435195923, + "rewards/margins": -0.8851479291915894, + "rewards/rejected": 2.4907593727111816, + "step": 6426 + }, + { + "epoch": 1.04, + "learning_rate": 4.893553472174102e-06, + "logits/chosen": -1.276248574256897, + "logits/rejected": -1.232359528541565, + "logps/chosen": -134.81146240234375, + "logps/rejected": -196.16607666015625, + "loss": 0.0345, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.3841705322265625, + "rewards/margins": 3.0683822631835938, + "rewards/rejected": 2.3157882690429688, + "step": 6427 + }, + { + "epoch": 1.04, + "learning_rate": 4.892239519778034e-06, + "logits/chosen": -0.9670231342315674, + "logits/rejected": -0.9854248762130737, + "logps/chosen": -157.27377319335938, + "logps/rejected": -104.11688995361328, + "loss": 0.1772, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.666174411773682, + "rewards/margins": 0.8645989894866943, + "rewards/rejected": 3.8015754222869873, + "step": 6428 + }, + { + "epoch": 1.04, + "learning_rate": 4.890925574827195e-06, + "logits/chosen": -1.081240177154541, + "logits/rejected": -1.0477992296218872, + "logps/chosen": -91.68041229248047, + "logps/rejected": -62.68201446533203, + "loss": 0.579, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0228958129882812, + "rewards/margins": -0.32421112060546875, + "rewards/rejected": 2.34710693359375, + "step": 6429 + }, + { + "epoch": 1.04, + "learning_rate": 4.889611637412367e-06, + "logits/chosen": -1.3335493803024292, + "logits/rejected": -1.3258956670761108, + "logps/chosen": -139.6250762939453, + "logps/rejected": -140.11532592773438, + "loss": 0.239, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.313237190246582, + "rewards/margins": 1.195420742034912, + "rewards/rejected": 6.11781644821167, + "step": 6430 + }, + { + "epoch": 1.04, + "learning_rate": 4.888297707624331e-06, + "logits/chosen": -0.8767714500427246, + "logits/rejected": -1.0734163522720337, + "logps/chosen": -52.30256652832031, + "logps/rejected": -87.0621566772461, + "loss": 1.4277, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.417961835861206, + "rewards/margins": -2.7209079265594482, + "rewards/rejected": 5.138869762420654, + "step": 6431 + }, + { + "epoch": 1.04, + "learning_rate": 4.886983785553865e-06, + "logits/chosen": -0.7812153697013855, + "logits/rejected": -0.8826970458030701, + "logps/chosen": -68.54161071777344, + "logps/rejected": -99.98814392089844, + "loss": 1.5352, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.610593557357788, + "rewards/margins": -3.019428014755249, + "rewards/rejected": 5.630021572113037, + "step": 6432 + }, + { + "epoch": 1.04, + "learning_rate": 4.8856698712917505e-06, + "logits/chosen": -1.2305587530136108, + "logits/rejected": -1.226427435874939, + "logps/chosen": -38.271240234375, + "logps/rejected": -23.094558715820312, + "loss": 0.2945, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9370895624160767, + "rewards/margins": 0.2866401672363281, + "rewards/rejected": 0.6504493951797485, + "step": 6433 + }, + { + "epoch": 1.04, + "learning_rate": 4.884355964928767e-06, + "logits/chosen": -1.4078270196914673, + "logits/rejected": -1.5100831985473633, + "logps/chosen": -64.91143035888672, + "logps/rejected": -114.20849609375, + "loss": 1.6489, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.713712453842163, + "rewards/margins": -2.753053903579712, + "rewards/rejected": 5.466766357421875, + "step": 6434 + }, + { + "epoch": 1.04, + "learning_rate": 4.883042066555691e-06, + "logits/chosen": -1.4006985425949097, + "logits/rejected": -1.293118953704834, + "logps/chosen": -70.64881896972656, + "logps/rejected": -41.41240310668945, + "loss": 0.1781, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.742356061935425, + "rewards/margins": 0.9596282243728638, + "rewards/rejected": 1.782727837562561, + "step": 6435 + }, + { + "epoch": 1.04, + "learning_rate": 4.881728176263302e-06, + "logits/chosen": -1.4121049642562866, + "logits/rejected": -1.5108040571212769, + "logps/chosen": -199.0248260498047, + "logps/rejected": -84.21737670898438, + "loss": 0.0577, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.391719341278076, + "rewards/margins": 4.7316179275512695, + "rewards/rejected": 1.660101294517517, + "step": 6436 + }, + { + "epoch": 1.04, + "learning_rate": 4.880414294142377e-06, + "logits/chosen": -1.7143267393112183, + "logits/rejected": -1.7594540119171143, + "logps/chosen": -97.7962646484375, + "logps/rejected": -66.51869201660156, + "loss": 0.8656, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.942995548248291, + "rewards/margins": -1.2187514305114746, + "rewards/rejected": 4.161746978759766, + "step": 6437 + }, + { + "epoch": 1.04, + "learning_rate": 4.879100420283692e-06, + "logits/chosen": -0.8962343335151672, + "logits/rejected": -0.8969717621803284, + "logps/chosen": -3.375671863555908, + "logps/rejected": -2.9412145614624023, + "loss": 0.4927, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.24687643349170685, + "rewards/margins": -0.03602747619152069, + "rewards/rejected": 0.28290390968322754, + "step": 6438 + }, + { + "epoch": 1.05, + "learning_rate": 4.8777865547780254e-06, + "logits/chosen": -1.5671446323394775, + "logits/rejected": -1.3761693239212036, + "logps/chosen": -120.97346496582031, + "logps/rejected": -48.24610137939453, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.802818298339844, + "rewards/margins": 5.966322898864746, + "rewards/rejected": 0.836495578289032, + "step": 6439 + }, + { + "epoch": 1.05, + "learning_rate": 4.8764726977161505e-06, + "logits/chosen": -1.5358680486679077, + "logits/rejected": -1.5358680486679077, + "logps/chosen": -53.804351806640625, + "logps/rejected": -53.804351806640625, + "loss": 0.395, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.988356113433838, + "rewards/margins": 0.0, + "rewards/rejected": 2.988356113433838, + "step": 6440 + }, + { + "epoch": 1.05, + "learning_rate": 4.875158849188844e-06, + "logits/chosen": -1.368170976638794, + "logits/rejected": -1.2194395065307617, + "logps/chosen": -110.42153930664062, + "logps/rejected": -110.6435546875, + "loss": 0.028, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.185345649719238, + "rewards/margins": 5.77455997467041, + "rewards/rejected": 1.4107856750488281, + "step": 6441 + }, + { + "epoch": 1.05, + "learning_rate": 4.873845009286879e-06, + "logits/chosen": -1.112868070602417, + "logits/rejected": -1.0275650024414062, + "logps/chosen": -156.69644165039062, + "logps/rejected": -69.71865844726562, + "loss": 0.108, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.803662300109863, + "rewards/margins": 3.513676643371582, + "rewards/rejected": 2.2899856567382812, + "step": 6442 + }, + { + "epoch": 1.05, + "learning_rate": 4.8725311781010304e-06, + "logits/chosen": -1.5490925312042236, + "logits/rejected": -1.5266886949539185, + "logps/chosen": -28.82126235961914, + "logps/rejected": -49.964820861816406, + "loss": 0.3251, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.162569999694824, + "rewards/margins": 0.9099752902984619, + "rewards/rejected": 3.2525947093963623, + "step": 6443 + }, + { + "epoch": 1.05, + "learning_rate": 4.871217355722071e-06, + "logits/chosen": -1.03508722782135, + "logits/rejected": -1.03508722782135, + "logps/chosen": -44.963470458984375, + "logps/rejected": -44.963470458984375, + "loss": 0.4045, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.915245056152344, + "rewards/margins": 0.0, + "rewards/rejected": 4.915245056152344, + "step": 6444 + }, + { + "epoch": 1.05, + "learning_rate": 4.869903542240774e-06, + "logits/chosen": -1.30707585811615, + "logits/rejected": -1.2746658325195312, + "logps/chosen": -49.73593521118164, + "logps/rejected": -12.349148750305176, + "loss": 0.4156, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.417208433151245, + "rewards/margins": 1.9102274179458618, + "rewards/rejected": 0.5069810152053833, + "step": 6445 + }, + { + "epoch": 1.05, + "learning_rate": 4.868589737747912e-06, + "logits/chosen": -1.144217610359192, + "logits/rejected": -1.191300392150879, + "logps/chosen": -39.72441864013672, + "logps/rejected": -61.814327239990234, + "loss": 0.4444, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.242926836013794, + "rewards/margins": 0.8437838554382324, + "rewards/rejected": 2.3991429805755615, + "step": 6446 + }, + { + "epoch": 1.05, + "learning_rate": 4.867275942334255e-06, + "logits/chosen": -1.4910731315612793, + "logits/rejected": -1.4136135578155518, + "logps/chosen": -134.47549438476562, + "logps/rejected": -293.1838684082031, + "loss": 1.012, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7383697032928467, + "rewards/margins": 0.01545405387878418, + "rewards/rejected": 2.7229156494140625, + "step": 6447 + }, + { + "epoch": 1.05, + "learning_rate": 4.865962156090575e-06, + "logits/chosen": -1.5132702589035034, + "logits/rejected": -1.5525637865066528, + "logps/chosen": -105.8868408203125, + "logps/rejected": -115.22837829589844, + "loss": 2.0464, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.415419101715088, + "rewards/margins": -4.024399280548096, + "rewards/rejected": 8.439818382263184, + "step": 6448 + }, + { + "epoch": 1.05, + "learning_rate": 4.864648379107641e-06, + "logits/chosen": -1.3581353425979614, + "logits/rejected": -1.2608448266983032, + "logps/chosen": -60.40808868408203, + "logps/rejected": -26.095813751220703, + "loss": 0.4707, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6878738403320312, + "rewards/margins": 0.7244921922683716, + "rewards/rejected": 1.9633816480636597, + "step": 6449 + }, + { + "epoch": 1.05, + "learning_rate": 4.863334611476224e-06, + "logits/chosen": -1.0007659196853638, + "logits/rejected": -1.0137571096420288, + "logps/chosen": -68.57005310058594, + "logps/rejected": -76.57032775878906, + "loss": 0.3595, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5316102504730225, + "rewards/margins": 0.3540222644805908, + "rewards/rejected": 2.1775879859924316, + "step": 6450 + }, + { + "epoch": 1.05, + "learning_rate": 4.862020853287091e-06, + "logits/chosen": -1.1876684427261353, + "logits/rejected": -1.1876684427261353, + "logps/chosen": -70.3271484375, + "logps/rejected": -70.3271484375, + "loss": 0.3569, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.576590061187744, + "rewards/margins": 0.0, + "rewards/rejected": 3.576590061187744, + "step": 6451 + }, + { + "epoch": 1.05, + "learning_rate": 4.860707104631013e-06, + "logits/chosen": -1.543894648551941, + "logits/rejected": -1.5290874242782593, + "logps/chosen": -63.96369552612305, + "logps/rejected": -111.58197784423828, + "loss": 0.1806, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.867669582366943, + "rewards/margins": 1.1408085823059082, + "rewards/rejected": 5.726861000061035, + "step": 6452 + }, + { + "epoch": 1.05, + "learning_rate": 4.859393365598755e-06, + "logits/chosen": -1.4018478393554688, + "logits/rejected": -1.1860759258270264, + "logps/chosen": -115.49536895751953, + "logps/rejected": -51.45922088623047, + "loss": 0.0621, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.377639293670654, + "rewards/margins": 3.181454658508301, + "rewards/rejected": 2.1961846351623535, + "step": 6453 + }, + { + "epoch": 1.05, + "learning_rate": 4.858079636281086e-06, + "logits/chosen": -1.3227300643920898, + "logits/rejected": -1.2430505752563477, + "logps/chosen": -86.83572387695312, + "logps/rejected": -81.44993591308594, + "loss": 0.3893, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.515008449554443, + "rewards/margins": -0.13019132614135742, + "rewards/rejected": 5.645199775695801, + "step": 6454 + }, + { + "epoch": 1.05, + "learning_rate": 4.85676591676877e-06, + "logits/chosen": -1.299342155456543, + "logits/rejected": -1.0906922817230225, + "logps/chosen": -127.8403091430664, + "logps/rejected": -65.1557846069336, + "loss": 0.1357, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.302619457244873, + "rewards/margins": 1.992415189743042, + "rewards/rejected": 3.310204267501831, + "step": 6455 + }, + { + "epoch": 1.05, + "learning_rate": 4.855452207152573e-06, + "logits/chosen": -1.2878981828689575, + "logits/rejected": -1.1875578165054321, + "logps/chosen": -101.8908920288086, + "logps/rejected": -97.41964721679688, + "loss": 1.4035, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.162135362625122, + "rewards/margins": 1.9222679138183594, + "rewards/rejected": 0.2398674041032791, + "step": 6456 + }, + { + "epoch": 1.05, + "learning_rate": 4.854138507523263e-06, + "logits/chosen": -1.4360361099243164, + "logits/rejected": -1.3370038270950317, + "logps/chosen": -91.30281066894531, + "logps/rejected": -123.28526306152344, + "loss": 0.3973, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.882682800292969, + "rewards/margins": 1.58486008644104, + "rewards/rejected": 3.2978227138519287, + "step": 6457 + }, + { + "epoch": 1.05, + "learning_rate": 4.852824817971601e-06, + "logits/chosen": -1.2594114542007446, + "logits/rejected": -1.1986864805221558, + "logps/chosen": -169.50839233398438, + "logps/rejected": -96.68693542480469, + "loss": 1.2222, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0541839599609375, + "rewards/margins": -1.9066848754882812, + "rewards/rejected": 4.960868835449219, + "step": 6458 + }, + { + "epoch": 1.05, + "learning_rate": 4.851511138588352e-06, + "logits/chosen": -1.339413046836853, + "logits/rejected": -1.331370234489441, + "logps/chosen": -62.61689758300781, + "logps/rejected": -81.48820495605469, + "loss": 0.536, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8731788992881775, + "rewards/margins": -0.4560241103172302, + "rewards/rejected": 1.3292030096054077, + "step": 6459 + }, + { + "epoch": 1.05, + "learning_rate": 4.850197469464278e-06, + "logits/chosen": -1.1434885263442993, + "logits/rejected": -1.1503783464431763, + "logps/chosen": -68.31130981445312, + "logps/rejected": -82.37432861328125, + "loss": 0.6386, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.332897901535034, + "rewards/margins": -0.09012603759765625, + "rewards/rejected": 2.4230239391326904, + "step": 6460 + }, + { + "epoch": 1.05, + "learning_rate": 4.848883810690141e-06, + "logits/chosen": -1.2177163362503052, + "logits/rejected": -1.2457563877105713, + "logps/chosen": -61.285675048828125, + "logps/rejected": -93.15111541748047, + "loss": 0.6335, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7411606311798096, + "rewards/margins": 0.08750307559967041, + "rewards/rejected": 1.6536575555801392, + "step": 6461 + }, + { + "epoch": 1.05, + "learning_rate": 4.847570162356703e-06, + "logits/chosen": -1.5928298234939575, + "logits/rejected": -1.4638222455978394, + "logps/chosen": -132.80438232421875, + "logps/rejected": -66.58262634277344, + "loss": 0.0328, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.39736795425415, + "rewards/margins": 3.5520875453948975, + "rewards/rejected": 3.845280408859253, + "step": 6462 + }, + { + "epoch": 1.05, + "learning_rate": 4.846256524554725e-06, + "logits/chosen": -1.611138105392456, + "logits/rejected": -1.6189924478530884, + "logps/chosen": -80.20154571533203, + "logps/rejected": -135.00196838378906, + "loss": 2.5435, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4337754249572754, + "rewards/margins": -5.060539722442627, + "rewards/rejected": 8.494315147399902, + "step": 6463 + }, + { + "epoch": 1.05, + "learning_rate": 4.844942897374967e-06, + "logits/chosen": -1.1121035814285278, + "logits/rejected": -1.0503722429275513, + "logps/chosen": -21.046255111694336, + "logps/rejected": -71.13574981689453, + "loss": 0.4011, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8918001651763916, + "rewards/margins": -0.20445609092712402, + "rewards/rejected": 2.0962562561035156, + "step": 6464 + }, + { + "epoch": 1.05, + "learning_rate": 4.843629280908186e-06, + "logits/chosen": -1.2107173204421997, + "logits/rejected": -1.182341456413269, + "logps/chosen": -65.77835083007812, + "logps/rejected": -106.9570083618164, + "loss": 1.4802, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7588822841644287, + "rewards/margins": -1.8197762966156006, + "rewards/rejected": 4.578658580780029, + "step": 6465 + }, + { + "epoch": 1.05, + "learning_rate": 4.842315675245144e-06, + "logits/chosen": -1.2381454706192017, + "logits/rejected": -1.2038792371749878, + "logps/chosen": -56.93201446533203, + "logps/rejected": -72.34185028076172, + "loss": 0.1678, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3775062561035156, + "rewards/margins": 1.284233808517456, + "rewards/rejected": 2.0932724475860596, + "step": 6466 + }, + { + "epoch": 1.05, + "learning_rate": 4.841002080476595e-06, + "logits/chosen": -1.3808202743530273, + "logits/rejected": -1.0778464078903198, + "logps/chosen": -100.67396545410156, + "logps/rejected": -51.32212448120117, + "loss": 0.7879, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.061110019683838, + "rewards/margins": 5.274514198303223, + "rewards/rejected": 1.7865955829620361, + "step": 6467 + }, + { + "epoch": 1.05, + "learning_rate": 4.839688496693298e-06, + "logits/chosen": -1.1456199884414673, + "logits/rejected": -1.142012119293213, + "logps/chosen": -11.949335098266602, + "logps/rejected": -2.6782584190368652, + "loss": 0.667, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3520524203777313, + "rewards/margins": -0.02117207646369934, + "rewards/rejected": 0.37322449684143066, + "step": 6468 + }, + { + "epoch": 1.05, + "learning_rate": 4.83837492398601e-06, + "logits/chosen": -1.0376588106155396, + "logits/rejected": -1.0376588106155396, + "logps/chosen": -3.4339656829833984, + "logps/rejected": -3.4339656829833984, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1450618505477905, + "rewards/margins": 0.0, + "rewards/rejected": 1.1450618505477905, + "step": 6469 + }, + { + "epoch": 1.05, + "learning_rate": 4.837061362445485e-06, + "logits/chosen": -0.8785332441329956, + "logits/rejected": -0.8933691382408142, + "logps/chosen": -42.166221618652344, + "logps/rejected": -68.86048126220703, + "loss": 1.0527, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9707794189453125, + "rewards/margins": -1.2475152015686035, + "rewards/rejected": 4.218294620513916, + "step": 6470 + }, + { + "epoch": 1.05, + "learning_rate": 4.835747812162478e-06, + "logits/chosen": -1.2311710119247437, + "logits/rejected": -1.247727632522583, + "logps/chosen": -43.49906539916992, + "logps/rejected": -94.85591888427734, + "loss": 1.1324, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4017276763916016, + "rewards/margins": -0.8553669452667236, + "rewards/rejected": 2.257094621658325, + "step": 6471 + }, + { + "epoch": 1.05, + "learning_rate": 4.834434273227743e-06, + "logits/chosen": -1.1529103517532349, + "logits/rejected": -1.249712347984314, + "logps/chosen": -51.95398712158203, + "logps/rejected": -71.62893676757812, + "loss": 1.1306, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.078450918197632, + "rewards/margins": -2.046537160873413, + "rewards/rejected": 4.124988079071045, + "step": 6472 + }, + { + "epoch": 1.05, + "learning_rate": 4.8331207457320335e-06, + "logits/chosen": -1.1806740760803223, + "logits/rejected": -1.163901925086975, + "logps/chosen": -44.810123443603516, + "logps/rejected": -48.642791748046875, + "loss": 0.3537, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4149967432022095, + "rewards/margins": 0.3409057855606079, + "rewards/rejected": 1.0740909576416016, + "step": 6473 + }, + { + "epoch": 1.05, + "learning_rate": 4.831807229766101e-06, + "logits/chosen": -1.2340165376663208, + "logits/rejected": -1.2009965181350708, + "logps/chosen": -48.99141311645508, + "logps/rejected": -29.581417083740234, + "loss": 1.717, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8402279615402222, + "rewards/margins": 0.5865223407745361, + "rewards/rejected": 1.253705620765686, + "step": 6474 + }, + { + "epoch": 1.05, + "learning_rate": 4.830493725420697e-06, + "logits/chosen": -1.465463638305664, + "logits/rejected": -1.3841896057128906, + "logps/chosen": -195.6793212890625, + "logps/rejected": -34.58538055419922, + "loss": 0.1033, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.101715087890625, + "rewards/margins": 1.6314445734024048, + "rewards/rejected": 1.4702705144882202, + "step": 6475 + }, + { + "epoch": 1.05, + "learning_rate": 4.829180232786574e-06, + "logits/chosen": -1.2830290794372559, + "logits/rejected": -1.2830290794372559, + "logps/chosen": -100.10966491699219, + "logps/rejected": -100.10966491699219, + "loss": 0.3472, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.821141242980957, + "rewards/margins": 0.0, + "rewards/rejected": 4.821141242980957, + "step": 6476 + }, + { + "epoch": 1.05, + "learning_rate": 4.82786675195448e-06, + "logits/chosen": -1.251928687095642, + "logits/rejected": -1.3456976413726807, + "logps/chosen": -60.24993896484375, + "logps/rejected": -93.86402893066406, + "loss": 1.9907, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8610780239105225, + "rewards/margins": -3.8829057216644287, + "rewards/rejected": 6.743983745574951, + "step": 6477 + }, + { + "epoch": 1.05, + "learning_rate": 4.826553283015165e-06, + "logits/chosen": -1.1017884016036987, + "logits/rejected": -1.1116594076156616, + "logps/chosen": -81.31376647949219, + "logps/rejected": -79.55116271972656, + "loss": 0.2194, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.295640707015991, + "rewards/margins": 1.2936921119689941, + "rewards/rejected": 1.001948595046997, + "step": 6478 + }, + { + "epoch": 1.05, + "learning_rate": 4.8252398260593756e-06, + "logits/chosen": -1.6485223770141602, + "logits/rejected": -1.6105453968048096, + "logps/chosen": -72.06111145019531, + "logps/rejected": -48.19546127319336, + "loss": 0.1313, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2089524269104004, + "rewards/margins": 1.2792576551437378, + "rewards/rejected": 0.9296947717666626, + "step": 6479 + }, + { + "epoch": 1.05, + "learning_rate": 4.823926381177862e-06, + "logits/chosen": -1.3105132579803467, + "logits/rejected": -1.2912501096725464, + "logps/chosen": -70.50434875488281, + "logps/rejected": -69.869873046875, + "loss": 1.2161, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6582062244415283, + "rewards/margins": -2.283581018447876, + "rewards/rejected": 4.941787242889404, + "step": 6480 + }, + { + "epoch": 1.05, + "learning_rate": 4.82261294846137e-06, + "logits/chosen": -1.2173576354980469, + "logits/rejected": -1.1006232500076294, + "logps/chosen": -54.70503234863281, + "logps/rejected": -14.497011184692383, + "loss": 0.026, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8347694873809814, + "rewards/margins": 3.0050456523895264, + "rewards/rejected": 0.8297237753868103, + "step": 6481 + }, + { + "epoch": 1.05, + "learning_rate": 4.821299528000643e-06, + "logits/chosen": -0.9944360256195068, + "logits/rejected": -0.8741709589958191, + "logps/chosen": -96.03634643554688, + "logps/rejected": -31.290584564208984, + "loss": 0.1557, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8794572353363037, + "rewards/margins": 2.7559499740600586, + "rewards/rejected": 0.1235073134303093, + "step": 6482 + }, + { + "epoch": 1.05, + "learning_rate": 4.819986119886428e-06, + "logits/chosen": -0.7623192071914673, + "logits/rejected": -0.8014287948608398, + "logps/chosen": -16.619972229003906, + "logps/rejected": -33.25992202758789, + "loss": 0.5246, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5695610046386719, + "rewards/margins": -0.3754734396934509, + "rewards/rejected": 0.9450344443321228, + "step": 6483 + }, + { + "epoch": 1.05, + "learning_rate": 4.81867272420947e-06, + "logits/chosen": -1.3104785680770874, + "logits/rejected": -1.3320075273513794, + "logps/chosen": -53.481048583984375, + "logps/rejected": -70.41932678222656, + "loss": 0.4709, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.587355136871338, + "rewards/margins": -0.444549560546875, + "rewards/rejected": 4.031904697418213, + "step": 6484 + }, + { + "epoch": 1.05, + "learning_rate": 4.81735934106051e-06, + "logits/chosen": -1.0645720958709717, + "logits/rejected": -0.8963881134986877, + "logps/chosen": -87.63613891601562, + "logps/rejected": -36.12712478637695, + "loss": 0.137, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.303785800933838, + "rewards/margins": 1.547016978263855, + "rewards/rejected": 0.7567688226699829, + "step": 6485 + }, + { + "epoch": 1.05, + "learning_rate": 4.8160459705302916e-06, + "logits/chosen": -1.3168010711669922, + "logits/rejected": -1.3090629577636719, + "logps/chosen": -67.46724700927734, + "logps/rejected": -82.96236419677734, + "loss": 0.2876, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.116762638092041, + "rewards/margins": 0.37866830825805664, + "rewards/rejected": 1.7380943298339844, + "step": 6486 + }, + { + "epoch": 1.05, + "learning_rate": 4.814732612709557e-06, + "logits/chosen": -0.8976256847381592, + "logits/rejected": -0.9727209210395813, + "logps/chosen": -111.17266845703125, + "logps/rejected": -71.4486083984375, + "loss": 0.4696, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.010266304016113, + "rewards/margins": 2.098355293273926, + "rewards/rejected": 1.9119110107421875, + "step": 6487 + }, + { + "epoch": 1.05, + "learning_rate": 4.813419267689044e-06, + "logits/chosen": -1.1896475553512573, + "logits/rejected": -1.2250113487243652, + "logps/chosen": -91.97019958496094, + "logps/rejected": -98.71296691894531, + "loss": 0.3435, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.087395429611206, + "rewards/margins": 0.06442856788635254, + "rewards/rejected": 2.0229668617248535, + "step": 6488 + }, + { + "epoch": 1.05, + "learning_rate": 4.812105935559496e-06, + "logits/chosen": -1.2732701301574707, + "logits/rejected": -1.258976936340332, + "logps/chosen": -39.12212371826172, + "logps/rejected": -62.51409149169922, + "loss": 0.7431, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0597894191741943, + "rewards/margins": -1.0025155544281006, + "rewards/rejected": 4.062304973602295, + "step": 6489 + }, + { + "epoch": 1.05, + "learning_rate": 4.81079261641165e-06, + "logits/chosen": -1.405110239982605, + "logits/rejected": -1.3625469207763672, + "logps/chosen": -44.04051971435547, + "logps/rejected": -25.03060531616211, + "loss": 0.3116, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7188973426818848, + "rewards/margins": 0.9394302368164062, + "rewards/rejected": 2.7794671058654785, + "step": 6490 + }, + { + "epoch": 1.05, + "learning_rate": 4.8094793103362416e-06, + "logits/chosen": -1.2601492404937744, + "logits/rejected": -1.3014920949935913, + "logps/chosen": -85.32378387451172, + "logps/rejected": -83.90231323242188, + "loss": 0.126, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.661640167236328, + "rewards/margins": 1.393567681312561, + "rewards/rejected": 1.268072485923767, + "step": 6491 + }, + { + "epoch": 1.05, + "learning_rate": 4.808166017424011e-06, + "logits/chosen": -1.4756217002868652, + "logits/rejected": -1.3641350269317627, + "logps/chosen": -176.50144958496094, + "logps/rejected": -164.19500732421875, + "loss": 0.0571, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.155064582824707, + "rewards/margins": 2.1161742210388184, + "rewards/rejected": 7.038890361785889, + "step": 6492 + }, + { + "epoch": 1.05, + "learning_rate": 4.8068527377656946e-06, + "logits/chosen": -1.1027404069900513, + "logits/rejected": -1.0303305387496948, + "logps/chosen": -212.7397003173828, + "logps/rejected": -84.61530303955078, + "loss": 1.1765, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.497523784637451, + "rewards/margins": 1.6266696453094482, + "rewards/rejected": 3.870854139328003, + "step": 6493 + }, + { + "epoch": 1.05, + "learning_rate": 4.805539471452026e-06, + "logits/chosen": -1.4556970596313477, + "logits/rejected": -1.253773808479309, + "logps/chosen": -132.0422821044922, + "logps/rejected": -54.28465270996094, + "loss": 0.0291, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.684092998504639, + "rewards/margins": 2.954740285873413, + "rewards/rejected": 2.7293527126312256, + "step": 6494 + }, + { + "epoch": 1.05, + "learning_rate": 4.80422621857374e-06, + "logits/chosen": -0.7982803583145142, + "logits/rejected": -0.7452489733695984, + "logps/chosen": -88.93040466308594, + "logps/rejected": -48.42856979370117, + "loss": 0.2122, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.358662486076355, + "rewards/margins": 0.6680775284767151, + "rewards/rejected": 0.6905849575996399, + "step": 6495 + }, + { + "epoch": 1.05, + "learning_rate": 4.802912979221569e-06, + "logits/chosen": -0.885951042175293, + "logits/rejected": -0.885951042175293, + "logps/chosen": -31.302711486816406, + "logps/rejected": -31.302711486816406, + "loss": 0.3692, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6431159973144531, + "rewards/margins": 0.0, + "rewards/rejected": 0.6431159973144531, + "step": 6496 + }, + { + "epoch": 1.05, + "learning_rate": 4.801599753486247e-06, + "logits/chosen": -1.319798231124878, + "logits/rejected": -0.9190500378608704, + "logps/chosen": -98.92115783691406, + "logps/rejected": -46.87965393066406, + "loss": 1.5719, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8927582502365112, + "rewards/margins": -2.9977102279663086, + "rewards/rejected": 4.890468597412109, + "step": 6497 + }, + { + "epoch": 1.05, + "learning_rate": 4.8002865414585055e-06, + "logits/chosen": -1.6037712097167969, + "logits/rejected": -1.4190406799316406, + "logps/chosen": -143.3637237548828, + "logps/rejected": -38.99486541748047, + "loss": 0.0254, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.988365173339844, + "rewards/margins": 3.2177600860595703, + "rewards/rejected": 1.7706050872802734, + "step": 6498 + }, + { + "epoch": 1.05, + "learning_rate": 4.798973343229073e-06, + "logits/chosen": -0.8256576657295227, + "logits/rejected": -0.8787571787834167, + "logps/chosen": -52.052879333496094, + "logps/rejected": -62.753639221191406, + "loss": 1.5465, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5490745306015015, + "rewards/margins": -0.6472550630569458, + "rewards/rejected": 2.1963295936584473, + "step": 6499 + }, + { + "epoch": 1.06, + "learning_rate": 4.797660158888681e-06, + "logits/chosen": -1.3566555976867676, + "logits/rejected": -1.3057433366775513, + "logps/chosen": -71.11402893066406, + "logps/rejected": -34.6716194152832, + "loss": 0.2104, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5239899158477783, + "rewards/margins": 0.8543964624404907, + "rewards/rejected": 1.6695934534072876, + "step": 6500 + }, + { + "epoch": 1.06, + "learning_rate": 4.796346988528057e-06, + "logits/chosen": -1.4043986797332764, + "logits/rejected": -1.1013623476028442, + "logps/chosen": -135.24497985839844, + "logps/rejected": -88.0010757446289, + "loss": 0.0568, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.623007297515869, + "rewards/margins": 2.919919490814209, + "rewards/rejected": 4.70308780670166, + "step": 6501 + }, + { + "epoch": 1.06, + "learning_rate": 4.79503383223793e-06, + "logits/chosen": -1.2580164670944214, + "logits/rejected": -1.2762749195098877, + "logps/chosen": -81.40645599365234, + "logps/rejected": -116.44535064697266, + "loss": 0.5667, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1060585975646973, + "rewards/margins": 0.6340255737304688, + "rewards/rejected": 2.4720330238342285, + "step": 6502 + }, + { + "epoch": 1.06, + "learning_rate": 4.793720690109025e-06, + "logits/chosen": -1.3445827960968018, + "logits/rejected": -1.2443461418151855, + "logps/chosen": -72.29444885253906, + "logps/rejected": -74.54843139648438, + "loss": 0.3102, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.586691379547119, + "rewards/margins": 0.24340438842773438, + "rewards/rejected": 2.3432869911193848, + "step": 6503 + }, + { + "epoch": 1.06, + "learning_rate": 4.7924075622320686e-06, + "logits/chosen": -0.9934960603713989, + "logits/rejected": -0.9073818922042847, + "logps/chosen": -71.77430725097656, + "logps/rejected": -91.33192443847656, + "loss": 0.845, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1818008422851562, + "rewards/margins": -1.3143470287322998, + "rewards/rejected": 3.496147871017456, + "step": 6504 + }, + { + "epoch": 1.06, + "learning_rate": 4.791094448697786e-06, + "logits/chosen": -1.116545557975769, + "logits/rejected": -1.0815136432647705, + "logps/chosen": -37.87824249267578, + "logps/rejected": -37.53253936767578, + "loss": 1.6924, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6973931789398193, + "rewards/margins": 1.2416459321975708, + "rewards/rejected": 1.4557472467422485, + "step": 6505 + }, + { + "epoch": 1.06, + "learning_rate": 4.7897813495969e-06, + "logits/chosen": -0.9543560147285461, + "logits/rejected": -0.9532399773597717, + "logps/chosen": -4.20976448059082, + "logps/rejected": -28.775251388549805, + "loss": 1.9127, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4957098066806793, + "rewards/margins": -0.5591683387756348, + "rewards/rejected": 1.0548781156539917, + "step": 6506 + }, + { + "epoch": 1.06, + "learning_rate": 4.788468265020135e-06, + "logits/chosen": -1.232758641242981, + "logits/rejected": -1.3002197742462158, + "logps/chosen": -70.63710021972656, + "logps/rejected": -89.23400115966797, + "loss": 0.9033, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1801650524139404, + "rewards/margins": -1.3322007656097412, + "rewards/rejected": 3.5123658180236816, + "step": 6507 + }, + { + "epoch": 1.06, + "learning_rate": 4.7871551950582105e-06, + "logits/chosen": -1.106977105140686, + "logits/rejected": -1.1249933242797852, + "logps/chosen": -50.08368682861328, + "logps/rejected": -70.63286590576172, + "loss": 0.5918, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.675335645675659, + "rewards/margins": -0.3489038944244385, + "rewards/rejected": 4.024239540100098, + "step": 6508 + }, + { + "epoch": 1.06, + "learning_rate": 4.785842139801848e-06, + "logits/chosen": -1.2569700479507446, + "logits/rejected": -1.2927320003509521, + "logps/chosen": -43.013214111328125, + "logps/rejected": -142.37721252441406, + "loss": 1.0239, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.657566070556641, + "rewards/margins": -1.893064022064209, + "rewards/rejected": 6.55063009262085, + "step": 6509 + }, + { + "epoch": 1.06, + "learning_rate": 4.784529099341766e-06, + "logits/chosen": -0.9103237390518188, + "logits/rejected": -0.9445439577102661, + "logps/chosen": -48.31901550292969, + "logps/rejected": -37.91740417480469, + "loss": 0.564, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1324639320373535, + "rewards/margins": 0.6509907245635986, + "rewards/rejected": 2.481473207473755, + "step": 6510 + }, + { + "epoch": 1.06, + "learning_rate": 4.783216073768685e-06, + "logits/chosen": -1.0448648929595947, + "logits/rejected": -1.033910870552063, + "logps/chosen": -85.31449890136719, + "logps/rejected": -41.52388000488281, + "loss": 1.0796, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9786674976348877, + "rewards/margins": -0.16478776931762695, + "rewards/rejected": 2.1434552669525146, + "step": 6511 + }, + { + "epoch": 1.06, + "learning_rate": 4.781903063173321e-06, + "logits/chosen": -1.308661937713623, + "logits/rejected": -1.3083667755126953, + "logps/chosen": -65.24032592773438, + "logps/rejected": -45.74067306518555, + "loss": 0.355, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3675460815429688, + "rewards/margins": 0.09857439994812012, + "rewards/rejected": 2.2689716815948486, + "step": 6512 + }, + { + "epoch": 1.06, + "learning_rate": 4.780590067646393e-06, + "logits/chosen": -1.0877548456192017, + "logits/rejected": -1.0373526811599731, + "logps/chosen": -42.31359100341797, + "logps/rejected": -68.62820434570312, + "loss": 0.5774, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3439643383026123, + "rewards/margins": -0.489898681640625, + "rewards/rejected": 2.8338630199432373, + "step": 6513 + }, + { + "epoch": 1.06, + "learning_rate": 4.779277087278615e-06, + "logits/chosen": -1.6249221563339233, + "logits/rejected": -1.5142618417739868, + "logps/chosen": -82.51429748535156, + "logps/rejected": -27.82317352294922, + "loss": 0.0467, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.908015489578247, + "rewards/margins": 3.612462282180786, + "rewards/rejected": 0.29555320739746094, + "step": 6514 + }, + { + "epoch": 1.06, + "learning_rate": 4.777964122160702e-06, + "logits/chosen": -1.4928375482559204, + "logits/rejected": -1.497240424156189, + "logps/chosen": -44.422481536865234, + "logps/rejected": -103.70210266113281, + "loss": 0.6213, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.408036470413208, + "rewards/margins": 0.348812460899353, + "rewards/rejected": 1.059224009513855, + "step": 6515 + }, + { + "epoch": 1.06, + "learning_rate": 4.776651172383367e-06, + "logits/chosen": -0.9124853610992432, + "logits/rejected": -0.8532117605209351, + "logps/chosen": -86.82827758789062, + "logps/rejected": -45.41252136230469, + "loss": 0.4437, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.299957275390625, + "rewards/margins": 0.05680537223815918, + "rewards/rejected": 3.243151903152466, + "step": 6516 + }, + { + "epoch": 1.06, + "learning_rate": 4.775338238037323e-06, + "logits/chosen": -1.2929610013961792, + "logits/rejected": -1.2264916896820068, + "logps/chosen": -94.1190185546875, + "logps/rejected": -39.588497161865234, + "loss": 0.4043, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0970749855041504, + "rewards/margins": -0.08717989921569824, + "rewards/rejected": 2.1842548847198486, + "step": 6517 + }, + { + "epoch": 1.06, + "learning_rate": 4.77402531921328e-06, + "logits/chosen": -1.4006236791610718, + "logits/rejected": -1.407576322555542, + "logps/chosen": -46.173728942871094, + "logps/rejected": -53.01529312133789, + "loss": 1.4123, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.043830156326294, + "rewards/margins": -1.2794108390808105, + "rewards/rejected": 2.3232409954071045, + "step": 6518 + }, + { + "epoch": 1.06, + "learning_rate": 4.772712416001951e-06, + "logits/chosen": -1.243045449256897, + "logits/rejected": -1.2243824005126953, + "logps/chosen": -56.783172607421875, + "logps/rejected": -84.48759460449219, + "loss": 0.1302, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9966201782226562, + "rewards/margins": 1.2815475463867188, + "rewards/rejected": 2.7150726318359375, + "step": 6519 + }, + { + "epoch": 1.06, + "learning_rate": 4.771399528494042e-06, + "logits/chosen": -1.3260878324508667, + "logits/rejected": -1.3963028192520142, + "logps/chosen": -46.6849250793457, + "logps/rejected": -30.309450149536133, + "loss": 0.979, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.356278657913208, + "rewards/margins": -0.32460761070251465, + "rewards/rejected": 2.6808862686157227, + "step": 6520 + }, + { + "epoch": 1.06, + "learning_rate": 4.770086656780263e-06, + "logits/chosen": -1.2427666187286377, + "logits/rejected": -1.2412878274917603, + "logps/chosen": -79.70747375488281, + "logps/rejected": -119.2444839477539, + "loss": 0.321, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9072479009628296, + "rewards/margins": 0.6663154363632202, + "rewards/rejected": 1.2409324645996094, + "step": 6521 + }, + { + "epoch": 1.06, + "learning_rate": 4.76877380095132e-06, + "logits/chosen": -0.9389485716819763, + "logits/rejected": -1.054299235343933, + "logps/chosen": -91.47853088378906, + "logps/rejected": -137.7535400390625, + "loss": 1.1748, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.127302646636963, + "rewards/margins": -2.2009949684143066, + "rewards/rejected": 5.3282976150512695, + "step": 6522 + }, + { + "epoch": 1.06, + "learning_rate": 4.76746096109792e-06, + "logits/chosen": -1.1586871147155762, + "logits/rejected": -1.1765251159667969, + "logps/chosen": -103.18930053710938, + "logps/rejected": -114.08988189697266, + "loss": 1.712, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.213525295257568, + "rewards/margins": -2.51986026763916, + "rewards/rejected": 6.7333855628967285, + "step": 6523 + }, + { + "epoch": 1.06, + "learning_rate": 4.7661481373107675e-06, + "logits/chosen": -0.9362010359764099, + "logits/rejected": -0.9408255815505981, + "logps/chosen": -45.23147201538086, + "logps/rejected": -78.01634216308594, + "loss": 0.4546, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0932743549346924, + "rewards/margins": -0.38514137268066406, + "rewards/rejected": 2.4784157276153564, + "step": 6524 + }, + { + "epoch": 1.06, + "learning_rate": 4.764835329680566e-06, + "logits/chosen": -1.257189393043518, + "logits/rejected": -1.257189393043518, + "logps/chosen": -41.495235443115234, + "logps/rejected": -41.495235443115234, + "loss": 0.3938, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0490268468856812, + "rewards/margins": 0.0, + "rewards/rejected": 1.0490268468856812, + "step": 6525 + }, + { + "epoch": 1.06, + "learning_rate": 4.763522538298018e-06, + "logits/chosen": -1.1073966026306152, + "logits/rejected": -1.106271743774414, + "logps/chosen": -12.725833892822266, + "logps/rejected": -13.101028442382812, + "loss": 0.6388, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2436152547597885, + "rewards/margins": -0.32842665910720825, + "rewards/rejected": 0.572041928768158, + "step": 6526 + }, + { + "epoch": 1.06, + "learning_rate": 4.762209763253824e-06, + "logits/chosen": -0.9973675012588501, + "logits/rejected": -0.9866356253623962, + "logps/chosen": -112.64250946044922, + "logps/rejected": -70.89266204833984, + "loss": 0.4673, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3124611377716064, + "rewards/margins": -0.40369272232055664, + "rewards/rejected": 2.716153860092163, + "step": 6527 + }, + { + "epoch": 1.06, + "learning_rate": 4.760897004638687e-06, + "logits/chosen": -1.00779390335083, + "logits/rejected": -1.00779390335083, + "logps/chosen": -23.169471740722656, + "logps/rejected": -23.169471740722656, + "loss": 0.5136, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9180443286895752, + "rewards/margins": 0.0, + "rewards/rejected": 1.9180443286895752, + "step": 6528 + }, + { + "epoch": 1.06, + "learning_rate": 4.759584262543304e-06, + "logits/chosen": -1.1516237258911133, + "logits/rejected": -0.8282742500305176, + "logps/chosen": -104.45359802246094, + "logps/rejected": -20.065013885498047, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.6768999099731445, + "rewards/margins": 5.782273769378662, + "rewards/rejected": -0.10537376254796982, + "step": 6529 + }, + { + "epoch": 1.06, + "learning_rate": 4.758271537058373e-06, + "logits/chosen": -1.3241702318191528, + "logits/rejected": -1.338007926940918, + "logps/chosen": -65.11415100097656, + "logps/rejected": -65.6692886352539, + "loss": 0.1734, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.102508544921875, + "rewards/margins": 0.9681640863418579, + "rewards/rejected": 1.134344458580017, + "step": 6530 + }, + { + "epoch": 1.06, + "learning_rate": 4.756958828274592e-06, + "logits/chosen": -0.8320307731628418, + "logits/rejected": -0.8308770060539246, + "logps/chosen": -74.52702331542969, + "logps/rejected": -95.36282348632812, + "loss": 0.8814, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.202838182449341, + "rewards/margins": -1.5739014148712158, + "rewards/rejected": 4.776739597320557, + "step": 6531 + }, + { + "epoch": 1.06, + "learning_rate": 4.755646136282656e-06, + "logits/chosen": -1.2329788208007812, + "logits/rejected": -1.1896625757217407, + "logps/chosen": -93.51847076416016, + "logps/rejected": -104.11188507080078, + "loss": 0.1209, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.61438512802124, + "rewards/margins": 1.3169918060302734, + "rewards/rejected": 6.297393321990967, + "step": 6532 + }, + { + "epoch": 1.06, + "learning_rate": 4.75433346117326e-06, + "logits/chosen": -1.245411992073059, + "logits/rejected": -1.1372525691986084, + "logps/chosen": -104.2221450805664, + "logps/rejected": -28.021984100341797, + "loss": 0.1405, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7846946716308594, + "rewards/margins": 1.19019615650177, + "rewards/rejected": 1.5944985151290894, + "step": 6533 + }, + { + "epoch": 1.06, + "learning_rate": 4.753020803037098e-06, + "logits/chosen": -0.9790078401565552, + "logits/rejected": -0.9306809902191162, + "logps/chosen": -42.983177185058594, + "logps/rejected": -62.08069610595703, + "loss": 0.5024, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3854633569717407, + "rewards/margins": -0.48489952087402344, + "rewards/rejected": 1.8703628778457642, + "step": 6534 + }, + { + "epoch": 1.06, + "learning_rate": 4.751708161964861e-06, + "logits/chosen": -1.090887188911438, + "logits/rejected": -1.128860354423523, + "logps/chosen": -102.40060424804688, + "logps/rejected": -86.95832824707031, + "loss": 1.7814, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4972336292266846, + "rewards/margins": -3.5104644298553467, + "rewards/rejected": 7.007698059082031, + "step": 6535 + }, + { + "epoch": 1.06, + "learning_rate": 4.75039553804724e-06, + "logits/chosen": -1.369516134262085, + "logits/rejected": -1.2543808221817017, + "logps/chosen": -61.147186279296875, + "logps/rejected": -18.01943588256836, + "loss": 0.738, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3777740001678467, + "rewards/margins": 2.1281633377075195, + "rewards/rejected": 0.24961070716381073, + "step": 6536 + }, + { + "epoch": 1.06, + "learning_rate": 4.7490829313749264e-06, + "logits/chosen": -1.421759843826294, + "logits/rejected": -1.3667155504226685, + "logps/chosen": -132.44131469726562, + "logps/rejected": -89.72100830078125, + "loss": 0.1977, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.596823215484619, + "rewards/margins": 3.0336601734161377, + "rewards/rejected": 2.5631630420684814, + "step": 6537 + }, + { + "epoch": 1.06, + "learning_rate": 4.747770342038608e-06, + "logits/chosen": -1.1558334827423096, + "logits/rejected": -1.2570562362670898, + "logps/chosen": -42.76451873779297, + "logps/rejected": -63.043785095214844, + "loss": 0.6903, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6654961109161377, + "rewards/margins": -1.0849900245666504, + "rewards/rejected": 3.750486135482788, + "step": 6538 + }, + { + "epoch": 1.06, + "learning_rate": 4.746457770128972e-06, + "logits/chosen": -1.2764943838119507, + "logits/rejected": -1.3615068197250366, + "logps/chosen": -38.86470031738281, + "logps/rejected": -153.95944213867188, + "loss": 0.5848, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.235083818435669, + "rewards/margins": -0.7236106395721436, + "rewards/rejected": 2.9586944580078125, + "step": 6539 + }, + { + "epoch": 1.06, + "learning_rate": 4.745145215736705e-06, + "logits/chosen": -1.3636724948883057, + "logits/rejected": -1.2875466346740723, + "logps/chosen": -102.27792358398438, + "logps/rejected": -83.38359832763672, + "loss": 0.5442, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.8604812622070312, + "rewards/margins": -0.6724543571472168, + "rewards/rejected": 4.532935619354248, + "step": 6540 + }, + { + "epoch": 1.06, + "learning_rate": 4.743832678952492e-06, + "logits/chosen": -1.3301503658294678, + "logits/rejected": -1.2476046085357666, + "logps/chosen": -59.26828384399414, + "logps/rejected": -20.709014892578125, + "loss": 0.7889, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2958309650421143, + "rewards/margins": 0.7895485162734985, + "rewards/rejected": 1.5062824487686157, + "step": 6541 + }, + { + "epoch": 1.06, + "learning_rate": 4.742520159867018e-06, + "logits/chosen": -1.0853711366653442, + "logits/rejected": -1.0403169393539429, + "logps/chosen": -87.13645935058594, + "logps/rejected": -41.006080627441406, + "loss": 0.3907, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.496328830718994, + "rewards/margins": 0.8826638460159302, + "rewards/rejected": 1.613664984703064, + "step": 6542 + }, + { + "epoch": 1.06, + "learning_rate": 4.741207658570965e-06, + "logits/chosen": -1.6142159700393677, + "logits/rejected": -1.5386180877685547, + "logps/chosen": -87.451904296875, + "logps/rejected": -35.16889190673828, + "loss": 0.124, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.006121873855591, + "rewards/margins": 1.8425472974777222, + "rewards/rejected": 1.1635745763778687, + "step": 6543 + }, + { + "epoch": 1.06, + "learning_rate": 4.739895175155012e-06, + "logits/chosen": -0.7742065191268921, + "logits/rejected": -0.7742065191268921, + "logps/chosen": -61.8714714050293, + "logps/rejected": -61.8714714050293, + "loss": 0.7392, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.669713258743286, + "rewards/margins": 0.0, + "rewards/rejected": 2.669713258743286, + "step": 6544 + }, + { + "epoch": 1.06, + "learning_rate": 4.738582709709842e-06, + "logits/chosen": -1.2395507097244263, + "logits/rejected": -1.1718071699142456, + "logps/chosen": -62.91096496582031, + "logps/rejected": -68.35401916503906, + "loss": 0.4475, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.812703013420105, + "rewards/margins": -0.35010993480682373, + "rewards/rejected": 2.1628129482269287, + "step": 6545 + }, + { + "epoch": 1.06, + "learning_rate": 4.737270262326134e-06, + "logits/chosen": -1.492334008216858, + "logits/rejected": -1.1486668586730957, + "logps/chosen": -123.49992370605469, + "logps/rejected": -59.67544937133789, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.319328308105469, + "rewards/margins": 5.882218360900879, + "rewards/rejected": 3.4371097087860107, + "step": 6546 + }, + { + "epoch": 1.06, + "learning_rate": 4.7359578330945635e-06, + "logits/chosen": -1.1412004232406616, + "logits/rejected": -1.1412004232406616, + "logps/chosen": -81.92539978027344, + "logps/rejected": -81.92539978027344, + "loss": 0.3611, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.0851030349731445, + "rewards/margins": 0.0, + "rewards/rejected": 6.0851030349731445, + "step": 6547 + }, + { + "epoch": 1.06, + "learning_rate": 4.734645422105809e-06, + "logits/chosen": -1.1566544771194458, + "logits/rejected": -1.0721147060394287, + "logps/chosen": -46.94453430175781, + "logps/rejected": -47.539222717285156, + "loss": 0.1021, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.310049533843994, + "rewards/margins": 2.0020265579223633, + "rewards/rejected": 0.308023065328598, + "step": 6548 + }, + { + "epoch": 1.06, + "learning_rate": 4.733333029450546e-06, + "logits/chosen": -1.489411473274231, + "logits/rejected": -1.2810882329940796, + "logps/chosen": -91.31227111816406, + "logps/rejected": -14.571237564086914, + "loss": 0.2667, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.234591960906982, + "rewards/margins": 4.356247425079346, + "rewards/rejected": 0.8783445358276367, + "step": 6549 + }, + { + "epoch": 1.06, + "learning_rate": 4.732020655219447e-06, + "logits/chosen": -1.141518473625183, + "logits/rejected": -0.9942345023155212, + "logps/chosen": -52.98875427246094, + "logps/rejected": -52.35334014892578, + "loss": 0.3277, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6353347301483154, + "rewards/margins": 0.7236464023590088, + "rewards/rejected": 2.9116883277893066, + "step": 6550 + }, + { + "epoch": 1.06, + "learning_rate": 4.730708299503184e-06, + "logits/chosen": -1.1299729347229004, + "logits/rejected": -0.9683458209037781, + "logps/chosen": -78.49105834960938, + "logps/rejected": -43.40711212158203, + "loss": 0.084, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6992013454437256, + "rewards/margins": 1.7748894691467285, + "rewards/rejected": 1.924311876296997, + "step": 6551 + }, + { + "epoch": 1.06, + "learning_rate": 4.729395962392431e-06, + "logits/chosen": -1.0418485403060913, + "logits/rejected": -0.8831194043159485, + "logps/chosen": -49.17070007324219, + "logps/rejected": -85.04034423828125, + "loss": 1.0241, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2780091762542725, + "rewards/margins": -1.8298065662384033, + "rewards/rejected": 5.107815742492676, + "step": 6552 + }, + { + "epoch": 1.06, + "learning_rate": 4.728083643977855e-06, + "logits/chosen": -1.1452950239181519, + "logits/rejected": -1.1235226392745972, + "logps/chosen": -102.49638366699219, + "logps/rejected": -43.41981506347656, + "loss": 0.7991, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1710205078125, + "rewards/margins": -1.0573441982269287, + "rewards/rejected": 2.2283647060394287, + "step": 6553 + }, + { + "epoch": 1.06, + "learning_rate": 4.7267713443501274e-06, + "logits/chosen": -1.1650470495224, + "logits/rejected": -1.1109516620635986, + "logps/chosen": -42.035377502441406, + "logps/rejected": -80.34274291992188, + "loss": 0.4513, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.912113904953003, + "rewards/margins": -0.023745059967041016, + "rewards/rejected": 2.935858964920044, + "step": 6554 + }, + { + "epoch": 1.06, + "learning_rate": 4.725459063599915e-06, + "logits/chosen": -1.273927927017212, + "logits/rejected": -1.2370368242263794, + "logps/chosen": -48.23456954956055, + "logps/rejected": -61.746864318847656, + "loss": 1.0646, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5347554683685303, + "rewards/margins": -1.3497004508972168, + "rewards/rejected": 3.884455919265747, + "step": 6555 + }, + { + "epoch": 1.06, + "learning_rate": 4.724146801817882e-06, + "logits/chosen": -1.1026430130004883, + "logits/rejected": -1.133388876914978, + "logps/chosen": -99.49472045898438, + "logps/rejected": -90.78915405273438, + "loss": 0.4744, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9034087657928467, + "rewards/margins": 0.29192566871643066, + "rewards/rejected": 2.611483097076416, + "step": 6556 + }, + { + "epoch": 1.06, + "learning_rate": 4.722834559094696e-06, + "logits/chosen": -0.8494104743003845, + "logits/rejected": -0.8535729646682739, + "logps/chosen": -2.7962493896484375, + "logps/rejected": -2.2815449237823486, + "loss": 0.3493, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5772773623466492, + "rewards/margins": 0.0486907958984375, + "rewards/rejected": 0.5285865664482117, + "step": 6557 + }, + { + "epoch": 1.06, + "learning_rate": 4.721522335521019e-06, + "logits/chosen": -1.1303421258926392, + "logits/rejected": -0.832528293132782, + "logps/chosen": -51.539100646972656, + "logps/rejected": -88.05289459228516, + "loss": 0.3044, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7042595148086548, + "rewards/margins": 0.4976249933242798, + "rewards/rejected": 1.206634521484375, + "step": 6558 + }, + { + "epoch": 1.06, + "learning_rate": 4.720210131187514e-06, + "logits/chosen": -0.7565922141075134, + "logits/rejected": -0.7565922141075134, + "logps/chosen": -36.42223358154297, + "logps/rejected": -36.42223358154297, + "loss": 0.4551, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5240013599395752, + "rewards/margins": 0.0, + "rewards/rejected": 1.5240013599395752, + "step": 6559 + }, + { + "epoch": 1.06, + "learning_rate": 4.718897946184842e-06, + "logits/chosen": -1.1603138446807861, + "logits/rejected": -1.161959171295166, + "logps/chosen": -61.764869689941406, + "logps/rejected": -77.26786041259766, + "loss": 0.7814, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4244468212127686, + "rewards/margins": -0.32664966583251953, + "rewards/rejected": 2.751096487045288, + "step": 6560 + }, + { + "epoch": 1.06, + "learning_rate": 4.717585780603661e-06, + "logits/chosen": -1.3301200866699219, + "logits/rejected": -1.3301200866699219, + "logps/chosen": -64.07244110107422, + "logps/rejected": -64.07244110107422, + "loss": 0.6122, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.456498861312866, + "rewards/margins": 0.0, + "rewards/rejected": 3.456498861312866, + "step": 6561 + }, + { + "epoch": 1.07, + "learning_rate": 4.71627363453463e-06, + "logits/chosen": -1.04978609085083, + "logits/rejected": -0.6857849359512329, + "logps/chosen": -116.85163879394531, + "logps/rejected": -43.26177215576172, + "loss": 0.1679, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.956608533859253, + "rewards/margins": 1.1333694458007812, + "rewards/rejected": 2.8232390880584717, + "step": 6562 + }, + { + "epoch": 1.07, + "learning_rate": 4.7149615080684075e-06, + "logits/chosen": -1.1324849128723145, + "logits/rejected": -1.1348366737365723, + "logps/chosen": -20.290292739868164, + "logps/rejected": -21.284086227416992, + "loss": 0.6834, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6112818121910095, + "rewards/margins": 0.03935474157333374, + "rewards/rejected": 0.5719270706176758, + "step": 6563 + }, + { + "epoch": 1.07, + "learning_rate": 4.7136494012956475e-06, + "logits/chosen": -1.55475914478302, + "logits/rejected": -1.4987260103225708, + "logps/chosen": -52.73223114013672, + "logps/rejected": -30.305469512939453, + "loss": 0.099, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.654144287109375, + "rewards/margins": 1.988106369972229, + "rewards/rejected": -0.3339620530605316, + "step": 6564 + }, + { + "epoch": 1.07, + "learning_rate": 4.712337314307004e-06, + "logits/chosen": -1.0391985177993774, + "logits/rejected": -1.0391985177993774, + "logps/chosen": -20.145118713378906, + "logps/rejected": -20.145118713378906, + "loss": 0.3903, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2568687200546265, + "rewards/margins": 0.0, + "rewards/rejected": 1.2568687200546265, + "step": 6565 + }, + { + "epoch": 1.07, + "learning_rate": 4.71102524719313e-06, + "logits/chosen": -1.4565067291259766, + "logits/rejected": -1.4129084348678589, + "logps/chosen": -84.96803283691406, + "logps/rejected": -126.47542572021484, + "loss": 0.5382, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.440324306488037, + "rewards/margins": -0.6027798652648926, + "rewards/rejected": 8.04310417175293, + "step": 6566 + }, + { + "epoch": 1.07, + "learning_rate": 4.709713200044678e-06, + "logits/chosen": -1.4937701225280762, + "logits/rejected": -1.3811280727386475, + "logps/chosen": -106.74356079101562, + "logps/rejected": -38.571903228759766, + "loss": 1.4345, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9065948724746704, + "rewards/margins": 1.8069419860839844, + "rewards/rejected": 0.09965286403894424, + "step": 6567 + }, + { + "epoch": 1.07, + "learning_rate": 4.708401172952296e-06, + "logits/chosen": -1.2006173133850098, + "logits/rejected": -1.2567907571792603, + "logps/chosen": -81.58956909179688, + "logps/rejected": -51.991722106933594, + "loss": 1.8889, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0277650356292725, + "rewards/margins": -0.0949544906616211, + "rewards/rejected": 3.1227195262908936, + "step": 6568 + }, + { + "epoch": 1.07, + "learning_rate": 4.707089166006634e-06, + "logits/chosen": -1.6604621410369873, + "logits/rejected": -1.6602685451507568, + "logps/chosen": -52.38167190551758, + "logps/rejected": -77.36260986328125, + "loss": 0.404, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8869214057922363, + "rewards/margins": 0.825092077255249, + "rewards/rejected": 2.0618293285369873, + "step": 6569 + }, + { + "epoch": 1.07, + "learning_rate": 4.70577717929834e-06, + "logits/chosen": -1.4984548091888428, + "logits/rejected": -1.411242961883545, + "logps/chosen": -80.22644805908203, + "logps/rejected": -57.8603401184082, + "loss": 0.6354, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6386420726776123, + "rewards/margins": -0.3929784297943115, + "rewards/rejected": 4.031620502471924, + "step": 6570 + }, + { + "epoch": 1.07, + "learning_rate": 4.704465212918058e-06, + "logits/chosen": -0.6216042637825012, + "logits/rejected": -0.6835646629333496, + "logps/chosen": -95.00663757324219, + "logps/rejected": -68.99656677246094, + "loss": 0.2436, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0034127235412598, + "rewards/margins": 1.0842018127441406, + "rewards/rejected": 0.9192108511924744, + "step": 6571 + }, + { + "epoch": 1.07, + "learning_rate": 4.703153266956434e-06, + "logits/chosen": -1.2601215839385986, + "logits/rejected": -1.2803248167037964, + "logps/chosen": -4.747920513153076, + "logps/rejected": -39.596885681152344, + "loss": 1.4055, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5001217722892761, + "rewards/margins": -0.5385550856590271, + "rewards/rejected": 1.0386768579483032, + "step": 6572 + }, + { + "epoch": 1.07, + "learning_rate": 4.70184134150411e-06, + "logits/chosen": -1.5517070293426514, + "logits/rejected": -1.4747424125671387, + "logps/chosen": -81.64964294433594, + "logps/rejected": -81.11898040771484, + "loss": 0.7878, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1304147243499756, + "rewards/margins": 0.31911182403564453, + "rewards/rejected": 2.811302900314331, + "step": 6573 + }, + { + "epoch": 1.07, + "learning_rate": 4.700529436651729e-06, + "logits/chosen": -0.9611296057701111, + "logits/rejected": -0.9679630398750305, + "logps/chosen": -13.725763320922852, + "logps/rejected": -4.371006965637207, + "loss": 0.2031, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4840457439422607, + "rewards/margins": 1.986855149269104, + "rewards/rejected": 0.49719056487083435, + "step": 6574 + }, + { + "epoch": 1.07, + "learning_rate": 4.69921755248993e-06, + "logits/chosen": -1.027182936668396, + "logits/rejected": -0.9452677965164185, + "logps/chosen": -70.69080352783203, + "logps/rejected": -34.67771911621094, + "loss": 0.2308, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7681617736816406, + "rewards/margins": 1.2002830505371094, + "rewards/rejected": 0.5678787231445312, + "step": 6575 + }, + { + "epoch": 1.07, + "learning_rate": 4.697905689109351e-06, + "logits/chosen": -0.8666919469833374, + "logits/rejected": -0.8666919469833374, + "logps/chosen": -40.57185363769531, + "logps/rejected": -40.57185363769531, + "loss": 0.4485, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.309788227081299, + "rewards/margins": 0.0, + "rewards/rejected": 2.309788227081299, + "step": 6576 + }, + { + "epoch": 1.07, + "learning_rate": 4.696593846600632e-06, + "logits/chosen": -1.2800381183624268, + "logits/rejected": -1.432404637336731, + "logps/chosen": -148.42059326171875, + "logps/rejected": -130.60601806640625, + "loss": 1.3357, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.84307861328125, + "rewards/margins": -2.5959901809692383, + "rewards/rejected": 8.439068794250488, + "step": 6577 + }, + { + "epoch": 1.07, + "learning_rate": 4.695282025054406e-06, + "logits/chosen": -1.3946205377578735, + "logits/rejected": -1.3284653425216675, + "logps/chosen": -38.024871826171875, + "logps/rejected": -43.208187103271484, + "loss": 2.1144, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.986171007156372, + "rewards/margins": 1.126214623451233, + "rewards/rejected": 1.8599563837051392, + "step": 6578 + }, + { + "epoch": 1.07, + "learning_rate": 4.693970224561309e-06, + "logits/chosen": -1.0990667343139648, + "logits/rejected": -1.047093152999878, + "logps/chosen": -54.126136779785156, + "logps/rejected": -54.96305847167969, + "loss": 0.3912, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.224327802658081, + "rewards/margins": 1.1185522079467773, + "rewards/rejected": 2.1057755947113037, + "step": 6579 + }, + { + "epoch": 1.07, + "learning_rate": 4.692658445211974e-06, + "logits/chosen": -1.3138115406036377, + "logits/rejected": -1.2559492588043213, + "logps/chosen": -87.06907653808594, + "logps/rejected": -14.781620979309082, + "loss": 0.1143, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.415724277496338, + "rewards/margins": 1.8773906230926514, + "rewards/rejected": 0.5383335947990417, + "step": 6580 + }, + { + "epoch": 1.07, + "learning_rate": 4.6913466870970335e-06, + "logits/chosen": -1.1228889226913452, + "logits/rejected": -1.0871282815933228, + "logps/chosen": -51.06203079223633, + "logps/rejected": -49.023990631103516, + "loss": 0.3687, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.745975971221924, + "rewards/margins": -0.04834890365600586, + "rewards/rejected": 2.7943248748779297, + "step": 6581 + }, + { + "epoch": 1.07, + "learning_rate": 4.6900349503071154e-06, + "logits/chosen": -1.159613013267517, + "logits/rejected": -1.159613013267517, + "logps/chosen": -54.97592544555664, + "logps/rejected": -54.97592544555664, + "loss": 0.5477, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.325336933135986, + "rewards/margins": 0.0, + "rewards/rejected": 4.325336933135986, + "step": 6582 + }, + { + "epoch": 1.07, + "learning_rate": 4.6887232349328474e-06, + "logits/chosen": -1.0273462533950806, + "logits/rejected": -0.9790910482406616, + "logps/chosen": -44.29191589355469, + "logps/rejected": -41.87919616699219, + "loss": 0.9662, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.044668674468994, + "rewards/margins": -0.6333982944488525, + "rewards/rejected": 2.6780669689178467, + "step": 6583 + }, + { + "epoch": 1.07, + "learning_rate": 4.68741154106486e-06, + "logits/chosen": -0.9958425164222717, + "logits/rejected": -1.0524336099624634, + "logps/chosen": -62.001373291015625, + "logps/rejected": -112.68089294433594, + "loss": 0.4261, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2324204444885254, + "rewards/margins": -0.2926788330078125, + "rewards/rejected": 2.525099277496338, + "step": 6584 + }, + { + "epoch": 1.07, + "learning_rate": 4.686099868793779e-06, + "logits/chosen": -1.2402764558792114, + "logits/rejected": -1.2393211126327515, + "logps/chosen": -1.6805410385131836, + "logps/rejected": -4.374803066253662, + "loss": 0.383, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3139099180698395, + "rewards/margins": -0.10824370384216309, + "rewards/rejected": 0.42215362191200256, + "step": 6585 + }, + { + "epoch": 1.07, + "learning_rate": 4.684788218210226e-06, + "logits/chosen": -1.1456705331802368, + "logits/rejected": -1.0129064321517944, + "logps/chosen": -123.7073974609375, + "logps/rejected": -109.19913482666016, + "loss": 0.1041, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.119449138641357, + "rewards/margins": 2.6697306632995605, + "rewards/rejected": 3.449718475341797, + "step": 6586 + }, + { + "epoch": 1.07, + "learning_rate": 4.683476589404824e-06, + "logits/chosen": -1.5808937549591064, + "logits/rejected": -1.5946542024612427, + "logps/chosen": -87.75482940673828, + "logps/rejected": -93.55754089355469, + "loss": 0.4462, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0194528102874756, + "rewards/margins": -0.06570053100585938, + "rewards/rejected": 3.085153341293335, + "step": 6587 + }, + { + "epoch": 1.07, + "learning_rate": 4.682164982468195e-06, + "logits/chosen": -1.193227767944336, + "logits/rejected": -1.2661833763122559, + "logps/chosen": -71.17384338378906, + "logps/rejected": -92.91357421875, + "loss": 0.773, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2956924438476562, + "rewards/margins": -1.303022861480713, + "rewards/rejected": 3.598715305328369, + "step": 6588 + }, + { + "epoch": 1.07, + "learning_rate": 4.680853397490958e-06, + "logits/chosen": -1.307112216949463, + "logits/rejected": -1.3569008111953735, + "logps/chosen": -39.16029357910156, + "logps/rejected": -34.01323699951172, + "loss": 0.3523, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.138343095779419, + "rewards/margins": 0.2255244255065918, + "rewards/rejected": 2.912818670272827, + "step": 6589 + }, + { + "epoch": 1.07, + "learning_rate": 4.679541834563732e-06, + "logits/chosen": -0.6665695905685425, + "logits/rejected": -0.6951653361320496, + "logps/chosen": -11.851712226867676, + "logps/rejected": -67.8897705078125, + "loss": 0.765, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32260581851005554, + "rewards/margins": 0.07921820878982544, + "rewards/rejected": 0.2433876097202301, + "step": 6590 + }, + { + "epoch": 1.07, + "learning_rate": 4.678230293777133e-06, + "logits/chosen": -1.2846434116363525, + "logits/rejected": -1.1392347812652588, + "logps/chosen": -82.16547393798828, + "logps/rejected": -58.09696578979492, + "loss": 0.8333, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.024567604064941, + "rewards/margins": 4.415031909942627, + "rewards/rejected": 1.609535574913025, + "step": 6591 + }, + { + "epoch": 1.07, + "learning_rate": 4.676918775221776e-06, + "logits/chosen": -1.539717197418213, + "logits/rejected": -1.4200235605239868, + "logps/chosen": -53.15216064453125, + "logps/rejected": -8.528120040893555, + "loss": 0.2041, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4741958379745483, + "rewards/margins": 0.7316871285438538, + "rewards/rejected": 0.7425087094306946, + "step": 6592 + }, + { + "epoch": 1.07, + "learning_rate": 4.675607278988274e-06, + "logits/chosen": -1.4275022745132446, + "logits/rejected": -1.436208963394165, + "logps/chosen": -99.80909729003906, + "logps/rejected": -80.903076171875, + "loss": 0.3525, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.838003635406494, + "rewards/margins": 0.7345871925354004, + "rewards/rejected": 2.1034164428710938, + "step": 6593 + }, + { + "epoch": 1.07, + "learning_rate": 4.67429580516724e-06, + "logits/chosen": -1.1758705377578735, + "logits/rejected": -1.022350549697876, + "logps/chosen": -147.1761016845703, + "logps/rejected": -94.54826354980469, + "loss": 0.0193, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.970658779144287, + "rewards/margins": 3.2756574153900146, + "rewards/rejected": 2.6950013637542725, + "step": 6594 + }, + { + "epoch": 1.07, + "learning_rate": 4.672984353849285e-06, + "logits/chosen": -1.3235571384429932, + "logits/rejected": -1.220242977142334, + "logps/chosen": -113.37942504882812, + "logps/rejected": -79.88983917236328, + "loss": 0.3535, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.905131340026855, + "rewards/margins": 3.655670642852783, + "rewards/rejected": 5.249460697174072, + "step": 6595 + }, + { + "epoch": 1.07, + "learning_rate": 4.671672925125016e-06, + "logits/chosen": -0.9801514744758606, + "logits/rejected": -1.0237702131271362, + "logps/chosen": -61.65945816040039, + "logps/rejected": -51.46452331542969, + "loss": 1.1316, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2216625213623047, + "rewards/margins": 0.5404384136199951, + "rewards/rejected": 1.6812241077423096, + "step": 6596 + }, + { + "epoch": 1.07, + "learning_rate": 4.670361519085041e-06, + "logits/chosen": -0.9588947296142578, + "logits/rejected": -0.9687012434005737, + "logps/chosen": -52.02553176879883, + "logps/rejected": -42.599510192871094, + "loss": 0.3171, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.24505877494812, + "rewards/margins": 0.16198492050170898, + "rewards/rejected": 2.083073854446411, + "step": 6597 + }, + { + "epoch": 1.07, + "learning_rate": 4.669050135819966e-06, + "logits/chosen": -1.243080496788025, + "logits/rejected": -1.1667847633361816, + "logps/chosen": -99.0798568725586, + "logps/rejected": -73.84772491455078, + "loss": 0.4777, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4118232727050781, + "rewards/margins": -0.14972078800201416, + "rewards/rejected": 1.5615440607070923, + "step": 6598 + }, + { + "epoch": 1.07, + "learning_rate": 4.667738775420396e-06, + "logits/chosen": -1.3575458526611328, + "logits/rejected": -1.2356210947036743, + "logps/chosen": -87.32242584228516, + "logps/rejected": -66.0682144165039, + "loss": 0.1615, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.733591556549072, + "rewards/margins": 3.2438416481018066, + "rewards/rejected": 3.4897499084472656, + "step": 6599 + }, + { + "epoch": 1.07, + "learning_rate": 4.666427437976932e-06, + "logits/chosen": -1.0996719598770142, + "logits/rejected": -1.109248161315918, + "logps/chosen": -60.336021423339844, + "logps/rejected": -73.33831787109375, + "loss": 1.1908, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6423051357269287, + "rewards/margins": -1.202265739440918, + "rewards/rejected": 3.8445708751678467, + "step": 6600 + }, + { + "epoch": 1.07, + "learning_rate": 4.665116123580176e-06, + "logits/chosen": -1.101615071296692, + "logits/rejected": -1.101615071296692, + "logps/chosen": -78.86721801757812, + "logps/rejected": -78.86721801757812, + "loss": 0.3753, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.8622512817382812, + "rewards/margins": 0.0, + "rewards/rejected": 3.8622512817382812, + "step": 6601 + }, + { + "epoch": 1.07, + "learning_rate": 4.663804832320726e-06, + "logits/chosen": -1.274473786354065, + "logits/rejected": -1.3442407846450806, + "logps/chosen": -172.39129638671875, + "logps/rejected": -231.5268096923828, + "loss": 1.3189, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.016452312469482, + "rewards/margins": -2.515113353729248, + "rewards/rejected": 8.53156566619873, + "step": 6602 + }, + { + "epoch": 1.07, + "learning_rate": 4.662493564289182e-06, + "logits/chosen": -1.3764221668243408, + "logits/rejected": -1.4705595970153809, + "logps/chosen": -275.11224365234375, + "logps/rejected": -49.01157760620117, + "loss": 0.0739, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.809515476226807, + "rewards/margins": 4.234330177307129, + "rewards/rejected": 3.5751850605010986, + "step": 6603 + }, + { + "epoch": 1.07, + "learning_rate": 4.661182319576139e-06, + "logits/chosen": -1.1330868005752563, + "logits/rejected": -1.068993091583252, + "logps/chosen": -89.52169799804688, + "logps/rejected": -50.8475341796875, + "loss": 0.8721, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7157570123672485, + "rewards/margins": 0.5060539245605469, + "rewards/rejected": 1.2097030878067017, + "step": 6604 + }, + { + "epoch": 1.07, + "learning_rate": 4.659871098272192e-06, + "logits/chosen": -1.1864345073699951, + "logits/rejected": -1.252159833908081, + "logps/chosen": -211.4307098388672, + "logps/rejected": -57.47125244140625, + "loss": 0.2586, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.89797830581665, + "rewards/margins": 0.41146326065063477, + "rewards/rejected": 4.486515045166016, + "step": 6605 + }, + { + "epoch": 1.07, + "learning_rate": 4.6585599004679344e-06, + "logits/chosen": -1.1557552814483643, + "logits/rejected": -1.1731733083724976, + "logps/chosen": -176.61019897460938, + "logps/rejected": -88.88459777832031, + "loss": 0.0245, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.797576904296875, + "rewards/margins": 3.1477341651916504, + "rewards/rejected": 1.6498428583145142, + "step": 6606 + }, + { + "epoch": 1.07, + "learning_rate": 4.6572487262539566e-06, + "logits/chosen": -1.3255739212036133, + "logits/rejected": -1.3409063816070557, + "logps/chosen": -61.63133239746094, + "logps/rejected": -55.5471076965332, + "loss": 0.6594, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1547698974609375, + "rewards/margins": -0.497971773147583, + "rewards/rejected": 2.6527416706085205, + "step": 6607 + }, + { + "epoch": 1.07, + "learning_rate": 4.655937575720848e-06, + "logits/chosen": -1.5698219537734985, + "logits/rejected": -1.5390878915786743, + "logps/chosen": -86.01935577392578, + "logps/rejected": -124.94822692871094, + "loss": 0.1714, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5703072547912598, + "rewards/margins": 0.9452294111251831, + "rewards/rejected": 1.6250778436660767, + "step": 6608 + }, + { + "epoch": 1.07, + "learning_rate": 4.6546264489591976e-06, + "logits/chosen": -0.8264154195785522, + "logits/rejected": -0.8634931445121765, + "logps/chosen": -41.43502426147461, + "logps/rejected": -84.65111541748047, + "loss": 1.3407, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9345936179161072, + "rewards/margins": -2.5865976810455322, + "rewards/rejected": 3.521191358566284, + "step": 6609 + }, + { + "epoch": 1.07, + "learning_rate": 4.653315346059592e-06, + "logits/chosen": -1.3400179147720337, + "logits/rejected": -1.35874342918396, + "logps/chosen": -98.79753112792969, + "logps/rejected": -133.24594116210938, + "loss": 3.5864, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.08200216293335, + "rewards/margins": -5.491727352142334, + "rewards/rejected": 9.573729515075684, + "step": 6610 + }, + { + "epoch": 1.07, + "learning_rate": 4.652004267112615e-06, + "logits/chosen": -1.0280418395996094, + "logits/rejected": -1.0280418395996094, + "logps/chosen": -16.375673294067383, + "logps/rejected": -16.375673294067383, + "loss": 0.402, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4305365085601807, + "rewards/margins": 0.0, + "rewards/rejected": 2.4305365085601807, + "step": 6611 + }, + { + "epoch": 1.07, + "learning_rate": 4.65069321220885e-06, + "logits/chosen": -1.127773642539978, + "logits/rejected": -1.0834141969680786, + "logps/chosen": -39.48698806762695, + "logps/rejected": -41.816123962402344, + "loss": 0.4782, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4616024494171143, + "rewards/margins": -0.3511013984680176, + "rewards/rejected": 2.812703847885132, + "step": 6612 + }, + { + "epoch": 1.07, + "learning_rate": 4.649382181438882e-06, + "logits/chosen": -1.1324986219406128, + "logits/rejected": -1.0527968406677246, + "logps/chosen": -31.092147827148438, + "logps/rejected": -8.366747856140137, + "loss": 0.296, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6877411007881165, + "rewards/margins": 0.47372692823410034, + "rewards/rejected": 0.21401415765285492, + "step": 6613 + }, + { + "epoch": 1.07, + "learning_rate": 4.648071174893285e-06, + "logits/chosen": -1.2874423265457153, + "logits/rejected": -1.1769136190414429, + "logps/chosen": -77.95025634765625, + "logps/rejected": -39.055179595947266, + "loss": 0.3673, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.715754747390747, + "rewards/margins": -0.018786191940307617, + "rewards/rejected": 2.7345409393310547, + "step": 6614 + }, + { + "epoch": 1.07, + "learning_rate": 4.6467601926626395e-06, + "logits/chosen": -1.0166758298873901, + "logits/rejected": -1.0206300020217896, + "logps/chosen": -45.588096618652344, + "logps/rejected": -78.53343200683594, + "loss": 3.2875, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.352393388748169, + "rewards/margins": -1.950542688369751, + "rewards/rejected": 5.30293607711792, + "step": 6615 + }, + { + "epoch": 1.07, + "learning_rate": 4.645449234837523e-06, + "logits/chosen": -1.1551039218902588, + "logits/rejected": -1.1858645677566528, + "logps/chosen": -65.1166000366211, + "logps/rejected": -54.16722106933594, + "loss": 0.2172, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1013741493225098, + "rewards/margins": 0.6832244396209717, + "rewards/rejected": 2.418149709701538, + "step": 6616 + }, + { + "epoch": 1.07, + "learning_rate": 4.6441383015085095e-06, + "logits/chosen": -1.1891504526138306, + "logits/rejected": -1.2121623754501343, + "logps/chosen": -77.97549438476562, + "logps/rejected": -90.8051528930664, + "loss": 0.6844, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4272689819335938, + "rewards/margins": -0.41340410709381104, + "rewards/rejected": 1.8406730890274048, + "step": 6617 + }, + { + "epoch": 1.07, + "learning_rate": 4.642827392766173e-06, + "logits/chosen": -1.2378616333007812, + "logits/rejected": -1.1985121965408325, + "logps/chosen": -43.08565139770508, + "logps/rejected": -26.96412467956543, + "loss": 0.484, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8333081007003784, + "rewards/margins": -0.4348989725112915, + "rewards/rejected": 2.26820707321167, + "step": 6618 + }, + { + "epoch": 1.07, + "learning_rate": 4.641516508701083e-06, + "logits/chosen": -1.1036195755004883, + "logits/rejected": -1.1036195755004883, + "logps/chosen": -31.765033721923828, + "logps/rejected": -31.765033721923828, + "loss": 0.3962, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3554210662841797, + "rewards/margins": 0.0, + "rewards/rejected": 2.3554210662841797, + "step": 6619 + }, + { + "epoch": 1.07, + "learning_rate": 4.64020564940381e-06, + "logits/chosen": -1.178268551826477, + "logits/rejected": -1.1900666952133179, + "logps/chosen": -40.42091369628906, + "logps/rejected": -50.99083709716797, + "loss": 0.6849, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.507467746734619, + "rewards/margins": -1.0409140586853027, + "rewards/rejected": 3.548381805419922, + "step": 6620 + }, + { + "epoch": 1.07, + "learning_rate": 4.638894814964923e-06, + "logits/chosen": -0.9352649450302124, + "logits/rejected": -0.9403455257415771, + "logps/chosen": -102.02951049804688, + "logps/rejected": -94.51531982421875, + "loss": 0.933, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2810096740722656, + "rewards/margins": 0.007457733154296875, + "rewards/rejected": 1.2735519409179688, + "step": 6621 + }, + { + "epoch": 1.07, + "learning_rate": 4.637584005474987e-06, + "logits/chosen": -0.5555336475372314, + "logits/rejected": -0.6064723134040833, + "logps/chosen": -63.25408172607422, + "logps/rejected": -97.92387390136719, + "loss": 0.6119, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7554336786270142, + "rewards/margins": -0.02695918083190918, + "rewards/rejected": 1.7823928594589233, + "step": 6622 + }, + { + "epoch": 1.07, + "learning_rate": 4.6362732210245675e-06, + "logits/chosen": -1.523498773574829, + "logits/rejected": -1.4228782653808594, + "logps/chosen": -353.6809997558594, + "logps/rejected": -145.10336303710938, + "loss": 0.02, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.576773166656494, + "rewards/margins": 3.403656005859375, + "rewards/rejected": 3.173117160797119, + "step": 6623 + }, + { + "epoch": 1.08, + "learning_rate": 4.634962461704226e-06, + "logits/chosen": -0.9414727687835693, + "logits/rejected": -0.927937388420105, + "logps/chosen": -2.424400568008423, + "logps/rejected": -4.650032043457031, + "loss": 0.4569, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.15133364498615265, + "rewards/margins": -0.18403805792331696, + "rewards/rejected": 0.3353717029094696, + "step": 6624 + }, + { + "epoch": 1.08, + "learning_rate": 4.633651727604525e-06, + "logits/chosen": -1.2800551652908325, + "logits/rejected": -1.2800551652908325, + "logps/chosen": -51.46063232421875, + "logps/rejected": -51.46063232421875, + "loss": 0.4579, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.78668212890625, + "rewards/margins": 0.0, + "rewards/rejected": 3.78668212890625, + "step": 6625 + }, + { + "epoch": 1.08, + "learning_rate": 4.632341018816023e-06, + "logits/chosen": -1.0279576778411865, + "logits/rejected": -1.0298140048980713, + "logps/chosen": -59.62187194824219, + "logps/rejected": -88.3784408569336, + "loss": 0.3267, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5567033290863037, + "rewards/margins": 0.22414326667785645, + "rewards/rejected": 2.3325600624084473, + "step": 6626 + }, + { + "epoch": 1.08, + "learning_rate": 4.631030335429278e-06, + "logits/chosen": -1.0684378147125244, + "logits/rejected": -1.1052227020263672, + "logps/chosen": -64.03561401367188, + "logps/rejected": -71.49075317382812, + "loss": 2.4576, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.532025098800659, + "rewards/margins": -3.1378495693206787, + "rewards/rejected": 5.669874668121338, + "step": 6627 + }, + { + "epoch": 1.08, + "learning_rate": 4.6297196775348454e-06, + "logits/chosen": -1.394356369972229, + "logits/rejected": -1.394356369972229, + "logps/chosen": -59.337257385253906, + "logps/rejected": -59.337257385253906, + "loss": 1.15, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.8010871410369873, + "rewards/margins": 0.0, + "rewards/rejected": 3.8010871410369873, + "step": 6628 + }, + { + "epoch": 1.08, + "learning_rate": 4.62840904522328e-06, + "logits/chosen": -1.265653371810913, + "logits/rejected": -1.1069000959396362, + "logps/chosen": -122.86518859863281, + "logps/rejected": -65.4593505859375, + "loss": 0.0265, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.1052093505859375, + "rewards/margins": 3.677281141281128, + "rewards/rejected": 2.4279282093048096, + "step": 6629 + }, + { + "epoch": 1.08, + "learning_rate": 4.627098438585133e-06, + "logits/chosen": -1.4112153053283691, + "logits/rejected": -1.3217031955718994, + "logps/chosen": -111.42817687988281, + "logps/rejected": -25.338069915771484, + "loss": 0.1448, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.588949680328369, + "rewards/margins": 1.0928658246994019, + "rewards/rejected": 1.4960838556289673, + "step": 6630 + }, + { + "epoch": 1.08, + "learning_rate": 4.6257878577109555e-06, + "logits/chosen": -1.359235167503357, + "logits/rejected": -1.3679745197296143, + "logps/chosen": -122.7569351196289, + "logps/rejected": -124.28302001953125, + "loss": 1.1244, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.022084951400757, + "rewards/margins": -1.4488272666931152, + "rewards/rejected": 3.470912218093872, + "step": 6631 + }, + { + "epoch": 1.08, + "learning_rate": 4.624477302691296e-06, + "logits/chosen": -1.0050328969955444, + "logits/rejected": -0.9438474774360657, + "logps/chosen": -46.995460510253906, + "logps/rejected": -18.227659225463867, + "loss": 0.4549, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.000218152999878, + "rewards/margins": 1.283294439315796, + "rewards/rejected": 0.716923713684082, + "step": 6632 + }, + { + "epoch": 1.08, + "learning_rate": 4.6231667736167015e-06, + "logits/chosen": -1.6738295555114746, + "logits/rejected": -1.7145826816558838, + "logps/chosen": -61.44255447387695, + "logps/rejected": -54.41838073730469, + "loss": 1.1549, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5085934400558472, + "rewards/margins": -1.3160182237625122, + "rewards/rejected": 2.8246116638183594, + "step": 6633 + }, + { + "epoch": 1.08, + "learning_rate": 4.6218562705777185e-06, + "logits/chosen": -1.415705680847168, + "logits/rejected": -0.9174434542655945, + "logps/chosen": -120.58621215820312, + "logps/rejected": -102.05746459960938, + "loss": 0.4022, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.388316631317139, + "rewards/margins": -0.15515565872192383, + "rewards/rejected": 6.5434722900390625, + "step": 6634 + }, + { + "epoch": 1.08, + "learning_rate": 4.6205457936648875e-06, + "logits/chosen": -1.171465516090393, + "logits/rejected": -1.3024438619613647, + "logps/chosen": -97.38060760498047, + "logps/rejected": -93.76969909667969, + "loss": 0.5024, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4313361644744873, + "rewards/margins": -0.31736230850219727, + "rewards/rejected": 3.7486984729766846, + "step": 6635 + }, + { + "epoch": 1.08, + "learning_rate": 4.619235342968753e-06, + "logits/chosen": -1.4819127321243286, + "logits/rejected": -1.3518927097320557, + "logps/chosen": -91.36614227294922, + "logps/rejected": -45.65569305419922, + "loss": 0.8111, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.247910261154175, + "rewards/margins": 0.2360060214996338, + "rewards/rejected": 2.011904239654541, + "step": 6636 + }, + { + "epoch": 1.08, + "learning_rate": 4.617924918579853e-06, + "logits/chosen": -1.066072702407837, + "logits/rejected": -1.1131292581558228, + "logps/chosen": -61.47710037231445, + "logps/rejected": -99.76185607910156, + "loss": 0.3978, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1660611629486084, + "rewards/margins": 1.3200443983078003, + "rewards/rejected": 1.846016764640808, + "step": 6637 + }, + { + "epoch": 1.08, + "learning_rate": 4.616614520588726e-06, + "logits/chosen": -1.0502469539642334, + "logits/rejected": -1.0514674186706543, + "logps/chosen": -1.9206421375274658, + "logps/rejected": -1.5901683568954468, + "loss": 1.0461, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.297890305519104, + "rewards/margins": 0.07237903773784637, + "rewards/rejected": 0.22551126778125763, + "step": 6638 + }, + { + "epoch": 1.08, + "learning_rate": 4.615304149085907e-06, + "logits/chosen": -1.523158073425293, + "logits/rejected": -1.534525990486145, + "logps/chosen": -187.79751586914062, + "logps/rejected": -44.36449432373047, + "loss": 0.7495, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.098990440368652, + "rewards/margins": 4.358355522155762, + "rewards/rejected": 3.7406349182128906, + "step": 6639 + }, + { + "epoch": 1.08, + "learning_rate": 4.6139938041619325e-06, + "logits/chosen": -1.1539727449417114, + "logits/rejected": -1.2248502969741821, + "logps/chosen": -57.73644256591797, + "logps/rejected": -93.85687255859375, + "loss": 0.7833, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.947167158126831, + "rewards/margins": -0.9020471572875977, + "rewards/rejected": 3.8492143154144287, + "step": 6640 + }, + { + "epoch": 1.08, + "learning_rate": 4.612683485907333e-06, + "logits/chosen": -0.895248293876648, + "logits/rejected": -0.8775206804275513, + "logps/chosen": -86.88113403320312, + "logps/rejected": -43.270721435546875, + "loss": 0.3372, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.913647413253784, + "rewards/margins": 1.9762252569198608, + "rewards/rejected": 1.9374221563339233, + "step": 6641 + }, + { + "epoch": 1.08, + "learning_rate": 4.61137319441264e-06, + "logits/chosen": -1.305893063545227, + "logits/rejected": -1.3242952823638916, + "logps/chosen": -100.11279296875, + "logps/rejected": -74.41183471679688, + "loss": 0.4147, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.585322618484497, + "rewards/margins": 0.4721541404724121, + "rewards/rejected": 3.113168478012085, + "step": 6642 + }, + { + "epoch": 1.08, + "learning_rate": 4.610062929768383e-06, + "logits/chosen": -1.1102349758148193, + "logits/rejected": -1.1469178199768066, + "logps/chosen": -61.290000915527344, + "logps/rejected": -102.50892639160156, + "loss": 0.2117, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.313718557357788, + "rewards/margins": 0.6448739767074585, + "rewards/rejected": 1.6688445806503296, + "step": 6643 + }, + { + "epoch": 1.08, + "learning_rate": 4.6087526920650874e-06, + "logits/chosen": -1.1788743734359741, + "logits/rejected": -1.146799921989441, + "logps/chosen": -61.3741455078125, + "logps/rejected": -38.245296478271484, + "loss": 0.4622, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.171454668045044, + "rewards/margins": 0.28686797618865967, + "rewards/rejected": 1.8845866918563843, + "step": 6644 + }, + { + "epoch": 1.08, + "learning_rate": 4.607442481393279e-06, + "logits/chosen": -0.9312102198600769, + "logits/rejected": -0.8354589939117432, + "logps/chosen": -95.65843200683594, + "logps/rejected": -61.397071838378906, + "loss": 0.2671, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8826615810394287, + "rewards/margins": 0.7547707557678223, + "rewards/rejected": 3.1278908252716064, + "step": 6645 + }, + { + "epoch": 1.08, + "learning_rate": 4.60613229784348e-06, + "logits/chosen": -0.7514485120773315, + "logits/rejected": -0.7514485120773315, + "logps/chosen": -16.814208984375, + "logps/rejected": -16.814208984375, + "loss": 2.2595, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.36969757080078125, + "rewards/margins": 0.0, + "rewards/rejected": 0.36969757080078125, + "step": 6646 + }, + { + "epoch": 1.08, + "learning_rate": 4.604822141506212e-06, + "logits/chosen": -1.4709455966949463, + "logits/rejected": -1.356412410736084, + "logps/chosen": -114.12529754638672, + "logps/rejected": -44.621978759765625, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.841468334197998, + "rewards/margins": 4.319858551025391, + "rewards/rejected": 2.5216095447540283, + "step": 6647 + }, + { + "epoch": 1.08, + "learning_rate": 4.6035120124719956e-06, + "logits/chosen": -1.2615330219268799, + "logits/rejected": -1.1449207067489624, + "logps/chosen": -289.20733642578125, + "logps/rejected": -21.77156639099121, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.14623737335205, + "rewards/margins": 7.5817365646362305, + "rewards/rejected": 0.564500629901886, + "step": 6648 + }, + { + "epoch": 1.08, + "learning_rate": 4.602201910831348e-06, + "logits/chosen": -1.369819164276123, + "logits/rejected": -1.141287922859192, + "logps/chosen": -122.53348541259766, + "logps/rejected": -14.514885902404785, + "loss": 0.2713, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.6562676429748535, + "rewards/margins": 4.3250932693481445, + "rewards/rejected": 1.331174612045288, + "step": 6649 + }, + { + "epoch": 1.08, + "learning_rate": 4.600891836674784e-06, + "logits/chosen": -1.1676336526870728, + "logits/rejected": -0.9393070340156555, + "logps/chosen": -42.19490051269531, + "logps/rejected": -27.896089553833008, + "loss": 0.2029, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5663392543792725, + "rewards/margins": 1.547844648361206, + "rewards/rejected": 1.0184946060180664, + "step": 6650 + }, + { + "epoch": 1.08, + "learning_rate": 4.599581790092817e-06, + "logits/chosen": -1.1633353233337402, + "logits/rejected": -1.1580661535263062, + "logps/chosen": -299.8184814453125, + "logps/rejected": -102.65969848632812, + "loss": 0.1234, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.821679592132568, + "rewards/margins": 1.278820514678955, + "rewards/rejected": 6.542859077453613, + "step": 6651 + }, + { + "epoch": 1.08, + "learning_rate": 4.59827177117596e-06, + "logits/chosen": -1.134225845336914, + "logits/rejected": -1.1102321147918701, + "logps/chosen": -68.84291076660156, + "logps/rejected": -47.57225799560547, + "loss": 0.6331, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5386314392089844, + "rewards/margins": 0.46190643310546875, + "rewards/rejected": 2.0767250061035156, + "step": 6652 + }, + { + "epoch": 1.08, + "learning_rate": 4.596961780014722e-06, + "logits/chosen": -1.0010088682174683, + "logits/rejected": -0.9992319345474243, + "logps/chosen": -32.2484245300293, + "logps/rejected": -22.630935668945312, + "loss": 0.5778, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6920116543769836, + "rewards/margins": -0.1864059567451477, + "rewards/rejected": 0.8784176111221313, + "step": 6653 + }, + { + "epoch": 1.08, + "learning_rate": 4.595651816699612e-06, + "logits/chosen": -1.035569429397583, + "logits/rejected": -1.0241338014602661, + "logps/chosen": -94.40406036376953, + "logps/rejected": -63.88792419433594, + "loss": 0.5221, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0391533374786377, + "rewards/margins": -0.17043161392211914, + "rewards/rejected": 3.209584951400757, + "step": 6654 + }, + { + "epoch": 1.08, + "learning_rate": 4.5943418813211364e-06, + "logits/chosen": -0.9013950228691101, + "logits/rejected": -0.9013950228691101, + "logps/chosen": -32.06034851074219, + "logps/rejected": -32.06034851074219, + "loss": 1.1943, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5310676097869873, + "rewards/margins": 0.0, + "rewards/rejected": 2.5310676097869873, + "step": 6655 + }, + { + "epoch": 1.08, + "learning_rate": 4.593031973969797e-06, + "logits/chosen": -1.4180490970611572, + "logits/rejected": -1.300398826599121, + "logps/chosen": -144.08416748046875, + "logps/rejected": -108.49723052978516, + "loss": 0.6434, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.793632507324219, + "rewards/margins": 2.2927756309509277, + "rewards/rejected": 3.500856876373291, + "step": 6656 + }, + { + "epoch": 1.08, + "learning_rate": 4.591722094736098e-06, + "logits/chosen": -1.3416165113449097, + "logits/rejected": -1.368428111076355, + "logps/chosen": -90.52174377441406, + "logps/rejected": -118.54517364501953, + "loss": 2.2614, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.273266553878784, + "rewards/margins": -4.394850730895996, + "rewards/rejected": 7.668117523193359, + "step": 6657 + }, + { + "epoch": 1.08, + "learning_rate": 4.5904122437105384e-06, + "logits/chosen": -1.1424474716186523, + "logits/rejected": -1.1609320640563965, + "logps/chosen": -56.466552734375, + "logps/rejected": -92.32759094238281, + "loss": 1.8944, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.195197343826294, + "rewards/margins": -2.130603790283203, + "rewards/rejected": 3.325801134109497, + "step": 6658 + }, + { + "epoch": 1.08, + "learning_rate": 4.589102420983618e-06, + "logits/chosen": -1.2999387979507446, + "logits/rejected": -1.2162448167800903, + "logps/chosen": -104.89431762695312, + "logps/rejected": -134.3607177734375, + "loss": 0.3264, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.935272216796875, + "rewards/margins": 0.11971735954284668, + "rewards/rejected": 2.8155548572540283, + "step": 6659 + }, + { + "epoch": 1.08, + "learning_rate": 4.587792626645833e-06, + "logits/chosen": -1.8298763036727905, + "logits/rejected": -1.8120896816253662, + "logps/chosen": -65.17742919921875, + "logps/rejected": -61.411949157714844, + "loss": 0.6654, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.023668050765991, + "rewards/margins": 0.5512306690216064, + "rewards/rejected": 2.4724373817443848, + "step": 6660 + }, + { + "epoch": 1.08, + "learning_rate": 4.5864828607876755e-06, + "logits/chosen": -1.1453800201416016, + "logits/rejected": -1.1376185417175293, + "logps/chosen": -46.440834045410156, + "logps/rejected": -50.094303131103516, + "loss": 1.0473, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2660744190216064, + "rewards/margins": -0.22939491271972656, + "rewards/rejected": 2.495469331741333, + "step": 6661 + }, + { + "epoch": 1.08, + "learning_rate": 4.58517312349964e-06, + "logits/chosen": -1.4532098770141602, + "logits/rejected": -1.2826777696609497, + "logps/chosen": -148.10272216796875, + "logps/rejected": -42.03019332885742, + "loss": 0.1839, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.819283962249756, + "rewards/margins": 5.702444076538086, + "rewards/rejected": 2.116840124130249, + "step": 6662 + }, + { + "epoch": 1.08, + "learning_rate": 4.583863414872217e-06, + "logits/chosen": -0.7887252569198608, + "logits/rejected": -0.7937542796134949, + "logps/chosen": -6.876173496246338, + "logps/rejected": -8.583403587341309, + "loss": 0.7037, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6888693571090698, + "rewards/margins": -0.5380439758300781, + "rewards/rejected": 1.226913332939148, + "step": 6663 + }, + { + "epoch": 1.08, + "learning_rate": 4.582553734995894e-06, + "logits/chosen": -0.649030327796936, + "logits/rejected": -0.649030327796936, + "logps/chosen": -10.984675407409668, + "logps/rejected": -10.984675407409668, + "loss": 0.379, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1546380519866943, + "rewards/margins": 0.0, + "rewards/rejected": 1.1546380519866943, + "step": 6664 + }, + { + "epoch": 1.08, + "learning_rate": 4.581244083961159e-06, + "logits/chosen": -0.9341375231742859, + "logits/rejected": -0.9263944625854492, + "logps/chosen": -56.498779296875, + "logps/rejected": -55.798126220703125, + "loss": 0.2197, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9191880226135254, + "rewards/margins": 0.6660659313201904, + "rewards/rejected": 2.253122091293335, + "step": 6665 + }, + { + "epoch": 1.08, + "learning_rate": 4.5799344618584945e-06, + "logits/chosen": -1.1123522520065308, + "logits/rejected": -1.0801981687545776, + "logps/chosen": -68.36036682128906, + "logps/rejected": -96.45968627929688, + "loss": 0.276, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9750031232833862, + "rewards/margins": 0.32207489013671875, + "rewards/rejected": 1.6529282331466675, + "step": 6666 + }, + { + "epoch": 1.08, + "learning_rate": 4.578624868778385e-06, + "logits/chosen": -1.0967199802398682, + "logits/rejected": -1.0864185094833374, + "logps/chosen": -14.904836654663086, + "logps/rejected": -9.53652572631836, + "loss": 1.135, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.370976060628891, + "rewards/margins": -0.1424335539340973, + "rewards/rejected": 0.5134096145629883, + "step": 6667 + }, + { + "epoch": 1.08, + "learning_rate": 4.577315304811309e-06, + "logits/chosen": -1.1778792142868042, + "logits/rejected": -1.1842682361602783, + "logps/chosen": -163.09664916992188, + "logps/rejected": -43.07517623901367, + "loss": 0.2801, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.9537811279296875, + "rewards/margins": 3.6323962211608887, + "rewards/rejected": 2.321384906768799, + "step": 6668 + }, + { + "epoch": 1.08, + "learning_rate": 4.576005770047746e-06, + "logits/chosen": -1.2847048044204712, + "logits/rejected": -0.9871689677238464, + "logps/chosen": -133.98089599609375, + "logps/rejected": -56.350067138671875, + "loss": 0.0427, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.086169719696045, + "rewards/margins": 3.3590762615203857, + "rewards/rejected": 3.727093458175659, + "step": 6669 + }, + { + "epoch": 1.08, + "learning_rate": 4.574696264578173e-06, + "logits/chosen": -1.1119506359100342, + "logits/rejected": -1.0521671772003174, + "logps/chosen": -57.97968292236328, + "logps/rejected": -78.16650390625, + "loss": 0.7028, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.818697452545166, + "rewards/margins": -0.10122895240783691, + "rewards/rejected": 2.919926404953003, + "step": 6670 + }, + { + "epoch": 1.08, + "learning_rate": 4.573386788493063e-06, + "logits/chosen": -1.394669532775879, + "logits/rejected": -1.271734356880188, + "logps/chosen": -92.75833129882812, + "logps/rejected": -25.414812088012695, + "loss": 0.0282, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.87095046043396, + "rewards/margins": 3.193084716796875, + "rewards/rejected": 0.6778658032417297, + "step": 6671 + }, + { + "epoch": 1.08, + "learning_rate": 4.57207734188289e-06, + "logits/chosen": -1.2497296333312988, + "logits/rejected": -1.1752829551696777, + "logps/chosen": -39.789634704589844, + "logps/rejected": -27.803829193115234, + "loss": 0.632, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.147942304611206, + "rewards/margins": 0.25863146781921387, + "rewards/rejected": 1.8893108367919922, + "step": 6672 + }, + { + "epoch": 1.08, + "learning_rate": 4.5707679248381235e-06, + "logits/chosen": -1.5378737449645996, + "logits/rejected": -1.0857359170913696, + "logps/chosen": -139.95957946777344, + "logps/rejected": -38.33872604370117, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.776668071746826, + "rewards/margins": 6.894120216369629, + "rewards/rejected": -0.1174522414803505, + "step": 6673 + }, + { + "epoch": 1.08, + "learning_rate": 4.5694585374492314e-06, + "logits/chosen": -1.226880669593811, + "logits/rejected": -1.1659871339797974, + "logps/chosen": -69.3616943359375, + "logps/rejected": -67.02880859375, + "loss": 0.0294, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.097625732421875, + "rewards/margins": 4.218345642089844, + "rewards/rejected": 0.8792800903320312, + "step": 6674 + }, + { + "epoch": 1.08, + "learning_rate": 4.5681491798066804e-06, + "logits/chosen": -1.3125441074371338, + "logits/rejected": -1.1733670234680176, + "logps/chosen": -118.61640167236328, + "logps/rejected": -30.35613250732422, + "loss": 0.0636, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1742241382598877, + "rewards/margins": 2.6264853477478027, + "rewards/rejected": 0.5477386713027954, + "step": 6675 + }, + { + "epoch": 1.08, + "learning_rate": 4.566839852000936e-06, + "logits/chosen": -1.0315452814102173, + "logits/rejected": -1.0610758066177368, + "logps/chosen": -72.40973663330078, + "logps/rejected": -91.79733276367188, + "loss": 0.5693, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9873383045196533, + "rewards/margins": 0.33550262451171875, + "rewards/rejected": 1.6518356800079346, + "step": 6676 + }, + { + "epoch": 1.08, + "learning_rate": 4.565530554122458e-06, + "logits/chosen": -0.8175614476203918, + "logits/rejected": -0.7769444584846497, + "logps/chosen": -21.236913681030273, + "logps/rejected": -4.190711498260498, + "loss": 0.3323, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1328519582748413, + "rewards/margins": 0.8512709736824036, + "rewards/rejected": 0.28158098459243774, + "step": 6677 + }, + { + "epoch": 1.08, + "learning_rate": 4.564221286261709e-06, + "logits/chosen": -0.9767076373100281, + "logits/rejected": -0.9726771712303162, + "logps/chosen": -5.97052526473999, + "logps/rejected": -2.4149065017700195, + "loss": 0.7255, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3825855851173401, + "rewards/margins": -0.22063487768173218, + "rewards/rejected": 0.6032204627990723, + "step": 6678 + }, + { + "epoch": 1.08, + "learning_rate": 4.562912048509145e-06, + "logits/chosen": -1.2645717859268188, + "logits/rejected": -1.2150722742080688, + "logps/chosen": -36.594905853271484, + "logps/rejected": -51.66334915161133, + "loss": 0.3798, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.804784059524536, + "rewards/margins": 0.3999443054199219, + "rewards/rejected": 2.4048397541046143, + "step": 6679 + }, + { + "epoch": 1.08, + "learning_rate": 4.561602840955223e-06, + "logits/chosen": -0.8581231236457825, + "logits/rejected": -0.8009253740310669, + "logps/chosen": -52.67215347290039, + "logps/rejected": -34.313289642333984, + "loss": 0.6686, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5799152851104736, + "rewards/margins": 0.38829731941223145, + "rewards/rejected": 1.1916179656982422, + "step": 6680 + }, + { + "epoch": 1.08, + "learning_rate": 4.560293663690397e-06, + "logits/chosen": -1.0940755605697632, + "logits/rejected": -1.0825603008270264, + "logps/chosen": -75.13099670410156, + "logps/rejected": -106.93538665771484, + "loss": 0.143, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9305648803710938, + "rewards/margins": 3.2980270385742188, + "rewards/rejected": -0.367462158203125, + "step": 6681 + }, + { + "epoch": 1.08, + "learning_rate": 4.5589845168051176e-06, + "logits/chosen": -1.269715666770935, + "logits/rejected": -1.0956404209136963, + "logps/chosen": -55.97981262207031, + "logps/rejected": -34.57887268066406, + "loss": 0.087, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.5820465087890625, + "rewards/margins": 2.3023414611816406, + "rewards/rejected": 3.279705047607422, + "step": 6682 + }, + { + "epoch": 1.08, + "learning_rate": 4.557675400389835e-06, + "logits/chosen": -1.4651226997375488, + "logits/rejected": -1.4740995168685913, + "logps/chosen": -111.46685791015625, + "logps/rejected": -67.173828125, + "loss": 0.1189, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.536386251449585, + "rewards/margins": 2.0267465114593506, + "rewards/rejected": 1.5096397399902344, + "step": 6683 + }, + { + "epoch": 1.08, + "learning_rate": 4.5563663145349975e-06, + "logits/chosen": -1.1057630777359009, + "logits/rejected": -0.997931182384491, + "logps/chosen": -65.27297973632812, + "logps/rejected": -9.365242958068848, + "loss": 0.5114, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8169769644737244, + "rewards/margins": -0.11336427927017212, + "rewards/rejected": 0.9303412437438965, + "step": 6684 + }, + { + "epoch": 1.09, + "learning_rate": 4.55505725933105e-06, + "logits/chosen": -1.3831707239151, + "logits/rejected": -1.4388211965560913, + "logps/chosen": -71.37874603271484, + "logps/rejected": -84.73861694335938, + "loss": 1.1258, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.576890707015991, + "rewards/margins": -2.1336562633514404, + "rewards/rejected": 5.710546970367432, + "step": 6685 + }, + { + "epoch": 1.09, + "learning_rate": 4.553748234868435e-06, + "logits/chosen": -1.0715774297714233, + "logits/rejected": -1.0985990762710571, + "logps/chosen": -70.09013366699219, + "logps/rejected": -72.60183715820312, + "loss": 0.4477, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.132188558578491, + "rewards/margins": 0.9600160121917725, + "rewards/rejected": 1.1721725463867188, + "step": 6686 + }, + { + "epoch": 1.09, + "learning_rate": 4.5524392412375955e-06, + "logits/chosen": -1.1380987167358398, + "logits/rejected": -0.6855527758598328, + "logps/chosen": -86.38656616210938, + "logps/rejected": -70.45741271972656, + "loss": 1.0107, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.429957628250122, + "rewards/margins": -0.9587035179138184, + "rewards/rejected": 3.3886611461639404, + "step": 6687 + }, + { + "epoch": 1.09, + "learning_rate": 4.551130278528968e-06, + "logits/chosen": -1.2096179723739624, + "logits/rejected": -1.1409298181533813, + "logps/chosen": -39.017337799072266, + "logps/rejected": -14.578397750854492, + "loss": 0.1004, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.476006031036377, + "rewards/margins": 1.5115374326705933, + "rewards/rejected": 0.9644685983657837, + "step": 6688 + }, + { + "epoch": 1.09, + "learning_rate": 4.549821346832993e-06, + "logits/chosen": -0.8103013634681702, + "logits/rejected": -0.8111482858657837, + "logps/chosen": -0.9248096346855164, + "logps/rejected": -11.675897598266602, + "loss": 0.4309, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26297691464424133, + "rewards/margins": 0.2811757028102875, + "rewards/rejected": -0.018198776990175247, + "step": 6689 + }, + { + "epoch": 1.09, + "learning_rate": 4.548512446240102e-06, + "logits/chosen": -1.0759214162826538, + "logits/rejected": -0.8725857734680176, + "logps/chosen": -60.409942626953125, + "logps/rejected": -14.566995620727539, + "loss": 0.2971, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1003174781799316, + "rewards/margins": 2.493464469909668, + "rewards/rejected": 0.6068531274795532, + "step": 6690 + }, + { + "epoch": 1.09, + "learning_rate": 4.54720357684073e-06, + "logits/chosen": -1.0883103609085083, + "logits/rejected": -1.138999104499817, + "logps/chosen": -102.397705078125, + "logps/rejected": -118.72489166259766, + "loss": 0.9803, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2828422784805298, + "rewards/margins": -0.9368408918380737, + "rewards/rejected": 2.2196831703186035, + "step": 6691 + }, + { + "epoch": 1.09, + "learning_rate": 4.545894738725305e-06, + "logits/chosen": -1.0997768640518188, + "logits/rejected": -1.0330487489700317, + "logps/chosen": -71.96583557128906, + "logps/rejected": -92.89021301269531, + "loss": 0.538, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5863709449768066, + "rewards/margins": -0.6574110984802246, + "rewards/rejected": 3.2437820434570312, + "step": 6692 + }, + { + "epoch": 1.09, + "learning_rate": 4.544585931984258e-06, + "logits/chosen": -1.1911379098892212, + "logits/rejected": -1.1430836915969849, + "logps/chosen": -54.87652587890625, + "logps/rejected": -73.91590118408203, + "loss": 0.2081, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6386048793792725, + "rewards/margins": 1.4608834981918335, + "rewards/rejected": 1.177721381187439, + "step": 6693 + }, + { + "epoch": 1.09, + "learning_rate": 4.543277156708013e-06, + "logits/chosen": -1.1652257442474365, + "logits/rejected": -1.2253491878509521, + "logps/chosen": -99.67396545410156, + "logps/rejected": -72.37681579589844, + "loss": 1.5247, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.592479228973389, + "rewards/margins": -2.9899306297302246, + "rewards/rejected": 8.582409858703613, + "step": 6694 + }, + { + "epoch": 1.09, + "learning_rate": 4.541968412986996e-06, + "logits/chosen": -1.248150110244751, + "logits/rejected": -1.4335232973098755, + "logps/chosen": -32.81850814819336, + "logps/rejected": -80.76164245605469, + "loss": 2.0534, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.8832783699035645, + "rewards/margins": -4.0262064933776855, + "rewards/rejected": 7.90948486328125, + "step": 6695 + }, + { + "epoch": 1.09, + "learning_rate": 4.540659700911626e-06, + "logits/chosen": -0.9789657592773438, + "logits/rejected": -1.0414499044418335, + "logps/chosen": -54.35150146484375, + "logps/rejected": -89.32579040527344, + "loss": 0.9071, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1261773109436035, + "rewards/margins": -0.7265770435333252, + "rewards/rejected": 3.8527543544769287, + "step": 6696 + }, + { + "epoch": 1.09, + "learning_rate": 4.5393510205723265e-06, + "logits/chosen": -0.9587330222129822, + "logits/rejected": -0.9594711065292358, + "logps/chosen": -1.4520490169525146, + "logps/rejected": -13.394149780273438, + "loss": 0.287, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3415564000606537, + "rewards/margins": 0.3273763060569763, + "rewards/rejected": 0.01418008841574192, + "step": 6697 + }, + { + "epoch": 1.09, + "learning_rate": 4.538042372059511e-06, + "logits/chosen": -1.2474063634872437, + "logits/rejected": -1.2543989419937134, + "logps/chosen": -92.00889587402344, + "logps/rejected": -47.05173110961914, + "loss": 0.1172, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.630202531814575, + "rewards/margins": 1.3322117328643799, + "rewards/rejected": 1.2979907989501953, + "step": 6698 + }, + { + "epoch": 1.09, + "learning_rate": 4.536733755463598e-06, + "logits/chosen": -1.3375673294067383, + "logits/rejected": -1.3151745796203613, + "logps/chosen": -156.451416015625, + "logps/rejected": -68.03443145751953, + "loss": 1.0893, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.3529205322265625, + "rewards/margins": 4.04352331161499, + "rewards/rejected": 1.3093971014022827, + "step": 6699 + }, + { + "epoch": 1.09, + "learning_rate": 4.535425170874998e-06, + "logits/chosen": -0.9497235417366028, + "logits/rejected": -1.01182222366333, + "logps/chosen": -57.809139251708984, + "logps/rejected": -70.00772857666016, + "loss": 0.2108, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1748225688934326, + "rewards/margins": 1.2943118810653687, + "rewards/rejected": 1.880510687828064, + "step": 6700 + }, + { + "epoch": 1.09, + "learning_rate": 4.534116618384123e-06, + "logits/chosen": -1.4266539812088013, + "logits/rejected": -1.2991104125976562, + "logps/chosen": -90.59642028808594, + "logps/rejected": -84.97396850585938, + "loss": 0.6527, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.123363018035889, + "rewards/margins": -0.9702267646789551, + "rewards/rejected": 5.093589782714844, + "step": 6701 + }, + { + "epoch": 1.09, + "learning_rate": 4.532808098081382e-06, + "logits/chosen": -1.2888315916061401, + "logits/rejected": -1.2888315916061401, + "logps/chosen": -48.972137451171875, + "logps/rejected": -48.972137451171875, + "loss": 0.6506, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3656983375549316, + "rewards/margins": 0.0, + "rewards/rejected": 3.3656983375549316, + "step": 6702 + }, + { + "epoch": 1.09, + "learning_rate": 4.5314996100571804e-06, + "logits/chosen": -0.8891595005989075, + "logits/rejected": -0.9384795427322388, + "logps/chosen": -48.65620803833008, + "logps/rejected": -58.68303680419922, + "loss": 3.0038, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.204810857772827, + "rewards/margins": -1.0506830215454102, + "rewards/rejected": 3.2554938793182373, + "step": 6703 + }, + { + "epoch": 1.09, + "learning_rate": 4.530191154401922e-06, + "logits/chosen": -1.3490145206451416, + "logits/rejected": -1.4659582376480103, + "logps/chosen": -61.16276931762695, + "logps/rejected": -127.28182220458984, + "loss": 2.125, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8956310749053955, + "rewards/margins": -3.5279476642608643, + "rewards/rejected": 6.42357873916626, + "step": 6704 + }, + { + "epoch": 1.09, + "learning_rate": 4.52888273120601e-06, + "logits/chosen": -1.279839038848877, + "logits/rejected": -0.8918639421463013, + "logps/chosen": -69.63899230957031, + "logps/rejected": -89.11824798583984, + "loss": 0.3038, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.001497745513916, + "rewards/margins": 0.2303025722503662, + "rewards/rejected": 2.77119517326355, + "step": 6705 + }, + { + "epoch": 1.09, + "learning_rate": 4.527574340559844e-06, + "logits/chosen": -1.401326298713684, + "logits/rejected": -1.3080521821975708, + "logps/chosen": -42.65980911254883, + "logps/rejected": -42.675575256347656, + "loss": 0.0877, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.986945867538452, + "rewards/margins": 1.681873083114624, + "rewards/rejected": 1.3050727844238281, + "step": 6706 + }, + { + "epoch": 1.09, + "learning_rate": 4.5262659825538204e-06, + "logits/chosen": -1.1392954587936401, + "logits/rejected": -1.1237596273422241, + "logps/chosen": -23.645944595336914, + "logps/rejected": -19.345184326171875, + "loss": 0.4047, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1407819986343384, + "rewards/margins": 0.37918204069137573, + "rewards/rejected": 0.7615999579429626, + "step": 6707 + }, + { + "epoch": 1.09, + "learning_rate": 4.524957657278336e-06, + "logits/chosen": -0.9477336406707764, + "logits/rejected": -0.9952590465545654, + "logps/chosen": -6.925082206726074, + "logps/rejected": -42.577880859375, + "loss": 1.0581, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.43803054094314575, + "rewards/margins": -0.8674039244651794, + "rewards/rejected": 1.3054344654083252, + "step": 6708 + }, + { + "epoch": 1.09, + "learning_rate": 4.523649364823781e-06, + "logits/chosen": -1.327945590019226, + "logits/rejected": -1.3030325174331665, + "logps/chosen": -35.27696228027344, + "logps/rejected": -25.641494750976562, + "loss": 0.5765, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.229746699333191, + "rewards/margins": -0.7723368406295776, + "rewards/rejected": 2.0020835399627686, + "step": 6709 + }, + { + "epoch": 1.09, + "learning_rate": 4.522341105280548e-06, + "logits/chosen": -1.552436351776123, + "logits/rejected": -1.6172221899032593, + "logps/chosen": -56.222076416015625, + "logps/rejected": -130.85128784179688, + "loss": 2.3944, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.8681442737579346, + "rewards/margins": -4.767401695251465, + "rewards/rejected": 8.63554573059082, + "step": 6710 + }, + { + "epoch": 1.09, + "learning_rate": 4.521032878739025e-06, + "logits/chosen": -1.289007306098938, + "logits/rejected": -1.2722246646881104, + "logps/chosen": -54.220924377441406, + "logps/rejected": -36.84585189819336, + "loss": 0.3802, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9706344604492188, + "rewards/margins": -0.09314119815826416, + "rewards/rejected": 1.063775658607483, + "step": 6711 + }, + { + "epoch": 1.09, + "learning_rate": 4.519724685289598e-06, + "logits/chosen": -1.1952017545700073, + "logits/rejected": -0.9161160588264465, + "logps/chosen": -218.8351287841797, + "logps/rejected": -24.188215255737305, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.188511848449707, + "rewards/margins": 6.873322486877441, + "rewards/rejected": 0.3151891827583313, + "step": 6712 + }, + { + "epoch": 1.09, + "learning_rate": 4.518416525022652e-06, + "logits/chosen": -1.0573248863220215, + "logits/rejected": -1.0416113138198853, + "logps/chosen": -84.26521301269531, + "logps/rejected": -95.79737091064453, + "loss": 0.2301, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4855568408966064, + "rewards/margins": 1.403436303138733, + "rewards/rejected": 1.0821205377578735, + "step": 6713 + }, + { + "epoch": 1.09, + "learning_rate": 4.517108398028566e-06, + "logits/chosen": -0.7297548055648804, + "logits/rejected": -1.0084593296051025, + "logps/chosen": -78.00971984863281, + "logps/rejected": -51.87192916870117, + "loss": 0.3798, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3531875610351562, + "rewards/margins": 0.5720477104187012, + "rewards/rejected": 2.781139850616455, + "step": 6714 + }, + { + "epoch": 1.09, + "learning_rate": 4.515800304397721e-06, + "logits/chosen": -1.5092709064483643, + "logits/rejected": -1.4576524496078491, + "logps/chosen": -59.87462615966797, + "logps/rejected": -29.946453094482422, + "loss": 0.1023, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.393360137939453, + "rewards/margins": 2.915675640106201, + "rewards/rejected": -0.5223154425621033, + "step": 6715 + }, + { + "epoch": 1.09, + "learning_rate": 4.514492244220494e-06, + "logits/chosen": -1.1007517576217651, + "logits/rejected": -1.0964637994766235, + "logps/chosen": -39.30403518676758, + "logps/rejected": -42.844261169433594, + "loss": 0.2684, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8986446857452393, + "rewards/margins": 0.3911929130554199, + "rewards/rejected": 3.5074517726898193, + "step": 6716 + }, + { + "epoch": 1.09, + "learning_rate": 4.513184217587258e-06, + "logits/chosen": -0.8744400143623352, + "logits/rejected": -0.8645219206809998, + "logps/chosen": -66.17510986328125, + "logps/rejected": -52.910099029541016, + "loss": 0.2372, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.661934852600098, + "rewards/margins": 0.842585563659668, + "rewards/rejected": 3.8193492889404297, + "step": 6717 + }, + { + "epoch": 1.09, + "learning_rate": 4.5118762245883865e-06, + "logits/chosen": -1.7018427848815918, + "logits/rejected": -1.5875893831253052, + "logps/chosen": -107.0755844116211, + "logps/rejected": -24.195350646972656, + "loss": 0.5386, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.822681427001953, + "rewards/margins": 2.0564041137695312, + "rewards/rejected": 1.7662773132324219, + "step": 6718 + }, + { + "epoch": 1.09, + "learning_rate": 4.510568265314249e-06, + "logits/chosen": -0.774561882019043, + "logits/rejected": -0.7550959587097168, + "logps/chosen": -91.27215576171875, + "logps/rejected": -119.84860229492188, + "loss": 1.1336, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3172829151153564, + "rewards/margins": 1.1938637495040894, + "rewards/rejected": 1.123419165611267, + "step": 6719 + }, + { + "epoch": 1.09, + "learning_rate": 4.509260339855217e-06, + "logits/chosen": -1.0380076169967651, + "logits/rejected": -0.9177587032318115, + "logps/chosen": -184.54563903808594, + "logps/rejected": -64.69182586669922, + "loss": 0.0972, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.827330112457275, + "rewards/margins": 3.697368621826172, + "rewards/rejected": 2.1299614906311035, + "step": 6720 + }, + { + "epoch": 1.09, + "learning_rate": 4.507952448301648e-06, + "logits/chosen": -1.432938575744629, + "logits/rejected": -1.319020390510559, + "logps/chosen": -83.71101379394531, + "logps/rejected": -68.86781311035156, + "loss": 0.0642, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.0284318923950195, + "rewards/margins": 2.87162184715271, + "rewards/rejected": 3.1568100452423096, + "step": 6721 + }, + { + "epoch": 1.09, + "learning_rate": 4.5066445907439104e-06, + "logits/chosen": -1.2563248872756958, + "logits/rejected": -1.1962450742721558, + "logps/chosen": -235.93504333496094, + "logps/rejected": -90.89606475830078, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.512794494628906, + "rewards/margins": 5.767053604125977, + "rewards/rejected": 2.7457406520843506, + "step": 6722 + }, + { + "epoch": 1.09, + "learning_rate": 4.505336767272363e-06, + "logits/chosen": -0.9518637657165527, + "logits/rejected": -0.9083629846572876, + "logps/chosen": -70.12092590332031, + "logps/rejected": -98.39358520507812, + "loss": 0.5329, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.539400577545166, + "rewards/margins": 0.17901849746704102, + "rewards/rejected": 3.360382080078125, + "step": 6723 + }, + { + "epoch": 1.09, + "learning_rate": 4.504028977977364e-06, + "logits/chosen": -1.538195252418518, + "logits/rejected": -1.5962707996368408, + "logps/chosen": -24.913368225097656, + "logps/rejected": -132.84388732910156, + "loss": 2.3097, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.551374912261963, + "rewards/margins": -3.9064207077026367, + "rewards/rejected": 6.4577956199646, + "step": 6724 + }, + { + "epoch": 1.09, + "learning_rate": 4.502721222949271e-06, + "logits/chosen": -1.436007022857666, + "logits/rejected": -1.2177674770355225, + "logps/chosen": -87.80116271972656, + "logps/rejected": -15.16191291809082, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.409816265106201, + "rewards/margins": 4.748342037200928, + "rewards/rejected": 0.6614742279052734, + "step": 6725 + }, + { + "epoch": 1.09, + "learning_rate": 4.501413502278435e-06, + "logits/chosen": -1.4289627075195312, + "logits/rejected": -1.4558959007263184, + "logps/chosen": -33.447105407714844, + "logps/rejected": -38.632232666015625, + "loss": 0.4111, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.175748586654663, + "rewards/margins": -0.14774227142333984, + "rewards/rejected": 3.323490858078003, + "step": 6726 + }, + { + "epoch": 1.09, + "learning_rate": 4.500105816055208e-06, + "logits/chosen": -0.743383526802063, + "logits/rejected": -0.7397435903549194, + "logps/chosen": -2.093337059020996, + "logps/rejected": -12.671627044677734, + "loss": 0.5052, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.335680216550827, + "rewards/margins": -0.32529279589653015, + "rewards/rejected": 0.6609730124473572, + "step": 6727 + }, + { + "epoch": 1.09, + "learning_rate": 4.4987981643699405e-06, + "logits/chosen": -1.1432443857192993, + "logits/rejected": -1.3368602991104126, + "logps/chosen": -57.3562126159668, + "logps/rejected": -85.4257583618164, + "loss": 2.8083, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6712467670440674, + "rewards/margins": -5.075572967529297, + "rewards/rejected": 6.746819496154785, + "step": 6728 + }, + { + "epoch": 1.09, + "learning_rate": 4.4974905473129755e-06, + "logits/chosen": -1.2534873485565186, + "logits/rejected": -1.2933109998703003, + "logps/chosen": -73.90245056152344, + "logps/rejected": -43.90712356567383, + "loss": 0.2992, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7394859790802, + "rewards/margins": 0.20973777770996094, + "rewards/rejected": 2.5297482013702393, + "step": 6729 + }, + { + "epoch": 1.09, + "learning_rate": 4.4961829649746595e-06, + "logits/chosen": -1.181061863899231, + "logits/rejected": -1.1895010471343994, + "logps/chosen": -7.170920372009277, + "logps/rejected": -17.30510902404785, + "loss": 0.6509, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12367945164442062, + "rewards/margins": 0.09735021740198135, + "rewards/rejected": 0.02632923237979412, + "step": 6730 + }, + { + "epoch": 1.09, + "learning_rate": 4.494875417445334e-06, + "logits/chosen": -1.4155573844909668, + "logits/rejected": -1.3396960496902466, + "logps/chosen": -124.9110336303711, + "logps/rejected": -68.99591064453125, + "loss": 0.062, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.049915313720703, + "rewards/margins": 3.6204824447631836, + "rewards/rejected": 4.4294328689575195, + "step": 6731 + }, + { + "epoch": 1.09, + "learning_rate": 4.493567904815337e-06, + "logits/chosen": -1.154212474822998, + "logits/rejected": -1.2287057638168335, + "logps/chosen": -76.30717468261719, + "logps/rejected": -83.74250793457031, + "loss": 0.5333, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.510150194168091, + "rewards/margins": -0.6428968906402588, + "rewards/rejected": 4.15304708480835, + "step": 6732 + }, + { + "epoch": 1.09, + "learning_rate": 4.492260427175007e-06, + "logits/chosen": -1.3418158292770386, + "logits/rejected": -1.3501839637756348, + "logps/chosen": -30.824045181274414, + "logps/rejected": -33.26891326904297, + "loss": 0.6169, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8789420127868652, + "rewards/margins": 0.04039287567138672, + "rewards/rejected": 2.8385491371154785, + "step": 6733 + }, + { + "epoch": 1.09, + "learning_rate": 4.490952984614676e-06, + "logits/chosen": -1.1000151634216309, + "logits/rejected": -1.0680824518203735, + "logps/chosen": -22.161638259887695, + "logps/rejected": -39.03544616699219, + "loss": 0.4346, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6415308117866516, + "rewards/margins": -0.14593636989593506, + "rewards/rejected": 0.7874671816825867, + "step": 6734 + }, + { + "epoch": 1.09, + "learning_rate": 4.489645577224679e-06, + "logits/chosen": -1.128153920173645, + "logits/rejected": -1.1609164476394653, + "logps/chosen": -51.44575500488281, + "logps/rejected": -64.02517700195312, + "loss": 1.4471, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0901596546173096, + "rewards/margins": -2.670013666152954, + "rewards/rejected": 4.760173320770264, + "step": 6735 + }, + { + "epoch": 1.09, + "learning_rate": 4.4883382050953415e-06, + "logits/chosen": -1.088317632675171, + "logits/rejected": -0.9471674561500549, + "logps/chosen": -53.424583435058594, + "logps/rejected": -45.04230499267578, + "loss": 0.3862, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.870374321937561, + "rewards/margins": 0.9536209106445312, + "rewards/rejected": 0.9167534112930298, + "step": 6736 + }, + { + "epoch": 1.09, + "learning_rate": 4.487030868316994e-06, + "logits/chosen": -0.9481267333030701, + "logits/rejected": -1.19120192527771, + "logps/chosen": -189.47348022460938, + "logps/rejected": -195.41856384277344, + "loss": 1.336, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.427304267883301, + "rewards/margins": -2.5426836013793945, + "rewards/rejected": 9.969987869262695, + "step": 6737 + }, + { + "epoch": 1.09, + "learning_rate": 4.485723566979959e-06, + "logits/chosen": -1.3556991815567017, + "logits/rejected": -1.2777595520019531, + "logps/chosen": -119.40521240234375, + "logps/rejected": -95.58431243896484, + "loss": 0.0362, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.472575187683105, + "rewards/margins": 2.839383602142334, + "rewards/rejected": 5.6331915855407715, + "step": 6738 + }, + { + "epoch": 1.09, + "learning_rate": 4.4844163011745595e-06, + "logits/chosen": -1.1430408954620361, + "logits/rejected": -1.1120009422302246, + "logps/chosen": -61.71249008178711, + "logps/rejected": -63.54572677612305, + "loss": 0.3689, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8489445447921753, + "rewards/margins": 0.25882720947265625, + "rewards/rejected": 1.590117335319519, + "step": 6739 + }, + { + "epoch": 1.09, + "learning_rate": 4.483109070991115e-06, + "logits/chosen": -1.304919958114624, + "logits/rejected": -1.149294137954712, + "logps/chosen": -102.51387023925781, + "logps/rejected": -57.64270782470703, + "loss": 0.417, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2176575660705566, + "rewards/margins": 0.44019389152526855, + "rewards/rejected": 2.777463674545288, + "step": 6740 + }, + { + "epoch": 1.09, + "learning_rate": 4.481801876519943e-06, + "logits/chosen": -1.3947575092315674, + "logits/rejected": -1.5059653520584106, + "logps/chosen": -74.27273559570312, + "logps/rejected": -140.07803344726562, + "loss": 2.3693, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4329193830490112, + "rewards/margins": -4.445184230804443, + "rewards/rejected": 5.878103733062744, + "step": 6741 + }, + { + "epoch": 1.09, + "learning_rate": 4.480494717851359e-06, + "logits/chosen": -0.8575092554092407, + "logits/rejected": -0.8526832461357117, + "logps/chosen": -4.434412002563477, + "logps/rejected": -4.76180362701416, + "loss": 2.1843, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40743446350097656, + "rewards/margins": 0.0758008062839508, + "rewards/rejected": 0.33163365721702576, + "step": 6742 + }, + { + "epoch": 1.09, + "learning_rate": 4.479187595075673e-06, + "logits/chosen": -0.8352102041244507, + "logits/rejected": -0.8670406937599182, + "logps/chosen": -63.128440856933594, + "logps/rejected": -37.716400146484375, + "loss": 0.8484, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8614692687988281, + "rewards/margins": -1.4653019905090332, + "rewards/rejected": 2.3267712593078613, + "step": 6743 + }, + { + "epoch": 1.09, + "learning_rate": 4.477880508283197e-06, + "logits/chosen": -1.3260904550552368, + "logits/rejected": -1.3442128896713257, + "logps/chosen": -114.57333374023438, + "logps/rejected": -81.39105224609375, + "loss": 0.3725, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.755429267883301, + "rewards/margins": -0.08861064910888672, + "rewards/rejected": 4.8440399169921875, + "step": 6744 + }, + { + "epoch": 1.09, + "learning_rate": 4.476573457564237e-06, + "logits/chosen": -0.8448460698127747, + "logits/rejected": -0.8448460698127747, + "logps/chosen": -67.97007751464844, + "logps/rejected": -67.97007751464844, + "loss": 0.6523, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.149458408355713, + "rewards/margins": 0.0, + "rewards/rejected": 2.149458408355713, + "step": 6745 + }, + { + "epoch": 1.09, + "learning_rate": 4.4752664430090985e-06, + "logits/chosen": -1.4974169731140137, + "logits/rejected": -1.4015172719955444, + "logps/chosen": -142.3094482421875, + "logps/rejected": -56.88167190551758, + "loss": 0.0505, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.079066753387451, + "rewards/margins": 2.3452818393707275, + "rewards/rejected": 1.7337849140167236, + "step": 6746 + }, + { + "epoch": 1.1, + "learning_rate": 4.473959464708084e-06, + "logits/chosen": -1.2688285112380981, + "logits/rejected": -1.4038044214248657, + "logps/chosen": -252.2886962890625, + "logps/rejected": -75.35002136230469, + "loss": 0.0889, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.173169136047363, + "rewards/margins": 1.75225830078125, + "rewards/rejected": 5.420910835266113, + "step": 6747 + }, + { + "epoch": 1.1, + "learning_rate": 4.472652522751493e-06, + "logits/chosen": -1.4502508640289307, + "logits/rejected": -1.111025094985962, + "logps/chosen": -148.38284301757812, + "logps/rejected": -90.12594604492188, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.449989318847656, + "rewards/margins": 3.380465507507324, + "rewards/rejected": 4.069523811340332, + "step": 6748 + }, + { + "epoch": 1.1, + "learning_rate": 4.471345617229623e-06, + "logits/chosen": -0.8287094235420227, + "logits/rejected": -0.8287094235420227, + "logps/chosen": -0.7274497151374817, + "logps/rejected": -0.7274497151374817, + "loss": 0.561, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.20124812424182892, + "rewards/margins": 0.0, + "rewards/rejected": 0.20124812424182892, + "step": 6749 + }, + { + "epoch": 1.1, + "learning_rate": 4.470038748232768e-06, + "logits/chosen": -1.2585113048553467, + "logits/rejected": -1.1505062580108643, + "logps/chosen": -118.21009826660156, + "logps/rejected": -69.02534484863281, + "loss": 0.0518, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.920362949371338, + "rewards/margins": 3.0489578247070312, + "rewards/rejected": 2.8714051246643066, + "step": 6750 + }, + { + "epoch": 1.1, + "learning_rate": 4.4687319158512215e-06, + "logits/chosen": -1.192931890487671, + "logits/rejected": -1.1642999649047852, + "logps/chosen": -29.241443634033203, + "logps/rejected": -63.714622497558594, + "loss": 0.4373, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.325606107711792, + "rewards/margins": -0.22738385200500488, + "rewards/rejected": 2.552989959716797, + "step": 6751 + }, + { + "epoch": 1.1, + "learning_rate": 4.4674251201752726e-06, + "logits/chosen": -1.0647119283676147, + "logits/rejected": -1.053313136100769, + "logps/chosen": -84.27780151367188, + "logps/rejected": -53.78053283691406, + "loss": 0.4049, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.430391788482666, + "rewards/margins": 1.5707662105560303, + "rewards/rejected": 0.8596256375312805, + "step": 6752 + }, + { + "epoch": 1.1, + "learning_rate": 4.466118361295208e-06, + "logits/chosen": -1.33326256275177, + "logits/rejected": -1.2621670961380005, + "logps/chosen": -90.79537963867188, + "logps/rejected": -112.40794372558594, + "loss": 0.0811, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.50390911102295, + "rewards/margins": 1.903031826019287, + "rewards/rejected": 6.600877285003662, + "step": 6753 + }, + { + "epoch": 1.1, + "learning_rate": 4.464811639301314e-06, + "logits/chosen": -0.760286271572113, + "logits/rejected": -0.2859535217285156, + "logps/chosen": -97.60033416748047, + "logps/rejected": -128.56973266601562, + "loss": 1.0125, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8996734619140625, + "rewards/margins": -1.6227891445159912, + "rewards/rejected": 3.5224626064300537, + "step": 6754 + }, + { + "epoch": 1.1, + "learning_rate": 4.463504954283872e-06, + "logits/chosen": -0.9361060857772827, + "logits/rejected": -0.9361060857772827, + "logps/chosen": -25.79846954345703, + "logps/rejected": -25.79846954345703, + "loss": 1.2803, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.171969175338745, + "rewards/margins": 0.0, + "rewards/rejected": 2.171969175338745, + "step": 6755 + }, + { + "epoch": 1.1, + "learning_rate": 4.46219830633316e-06, + "logits/chosen": -1.5071412324905396, + "logits/rejected": -1.4392883777618408, + "logps/chosen": -54.66106033325195, + "logps/rejected": -91.89210510253906, + "loss": 0.3215, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.351025104522705, + "rewards/margins": 0.6568653583526611, + "rewards/rejected": 2.694159746170044, + "step": 6756 + }, + { + "epoch": 1.1, + "learning_rate": 4.4608916955394575e-06, + "logits/chosen": -0.9455186128616333, + "logits/rejected": -0.8809769153594971, + "logps/chosen": -64.42868041992188, + "logps/rejected": -48.07789611816406, + "loss": 0.3026, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4527299404144287, + "rewards/margins": 0.18451929092407227, + "rewards/rejected": 2.2682106494903564, + "step": 6757 + }, + { + "epoch": 1.1, + "learning_rate": 4.459585121993037e-06, + "logits/chosen": -1.1338502168655396, + "logits/rejected": -1.1499730348587036, + "logps/chosen": -65.78155517578125, + "logps/rejected": -61.092124938964844, + "loss": 1.9864, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9048584699630737, + "rewards/margins": -0.7897316217422485, + "rewards/rejected": 2.6945900917053223, + "step": 6758 + }, + { + "epoch": 1.1, + "learning_rate": 4.458278585784171e-06, + "logits/chosen": -1.3333275318145752, + "logits/rejected": -1.2620980739593506, + "logps/chosen": -76.98198699951172, + "logps/rejected": -76.58464050292969, + "loss": 1.0419, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.468793511390686, + "rewards/margins": -1.4952071905136108, + "rewards/rejected": 2.964000701904297, + "step": 6759 + }, + { + "epoch": 1.1, + "learning_rate": 4.45697208700313e-06, + "logits/chosen": -1.026754379272461, + "logits/rejected": -1.0867431163787842, + "logps/chosen": -65.75305938720703, + "logps/rejected": -86.86775970458984, + "loss": 0.7207, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8586952686309814, + "rewards/margins": -0.3540070056915283, + "rewards/rejected": 2.2127022743225098, + "step": 6760 + }, + { + "epoch": 1.1, + "learning_rate": 4.4556656257401786e-06, + "logits/chosen": -1.4613244533538818, + "logits/rejected": -1.3692368268966675, + "logps/chosen": -88.01849365234375, + "logps/rejected": -27.772998809814453, + "loss": 0.0673, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5264594554901123, + "rewards/margins": 2.2163596153259277, + "rewards/rejected": 1.3100998401641846, + "step": 6761 + }, + { + "epoch": 1.1, + "learning_rate": 4.454359202085582e-06, + "logits/chosen": -1.089473009109497, + "logits/rejected": -1.0361454486846924, + "logps/chosen": -33.91242980957031, + "logps/rejected": -13.837575912475586, + "loss": 0.2442, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3475903272628784, + "rewards/margins": 0.6652018427848816, + "rewards/rejected": 0.6823884844779968, + "step": 6762 + }, + { + "epoch": 1.1, + "learning_rate": 4.453052816129602e-06, + "logits/chosen": -1.324844479560852, + "logits/rejected": -1.3549646139144897, + "logps/chosen": -88.71028137207031, + "logps/rejected": -186.7049102783203, + "loss": 0.4227, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.882150411605835, + "rewards/margins": -0.1376173496246338, + "rewards/rejected": 3.0197677612304688, + "step": 6763 + }, + { + "epoch": 1.1, + "learning_rate": 4.4517464679624964e-06, + "logits/chosen": -1.1415690183639526, + "logits/rejected": -1.0086982250213623, + "logps/chosen": -85.68791961669922, + "logps/rejected": -37.65821838378906, + "loss": 0.0531, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.165119171142578, + "rewards/margins": 2.2421348094940186, + "rewards/rejected": -0.07701569050550461, + "step": 6764 + }, + { + "epoch": 1.1, + "learning_rate": 4.450440157674523e-06, + "logits/chosen": -0.9238847494125366, + "logits/rejected": -0.8782241344451904, + "logps/chosen": -115.28758239746094, + "logps/rejected": -67.43071746826172, + "loss": 0.1278, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.06546950340271, + "rewards/margins": 1.328406572341919, + "rewards/rejected": 0.7370628714561462, + "step": 6765 + }, + { + "epoch": 1.1, + "learning_rate": 4.449133885355934e-06, + "logits/chosen": -1.2851393222808838, + "logits/rejected": -1.3001248836517334, + "logps/chosen": -67.24847412109375, + "logps/rejected": -52.77217102050781, + "loss": 0.2486, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6594483852386475, + "rewards/margins": 0.44912123680114746, + "rewards/rejected": 2.2103271484375, + "step": 6766 + }, + { + "epoch": 1.1, + "learning_rate": 4.4478276510969815e-06, + "logits/chosen": -1.26298189163208, + "logits/rejected": -1.2331088781356812, + "logps/chosen": -66.33831787109375, + "logps/rejected": -66.83629608154297, + "loss": 0.2737, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.726161241531372, + "rewards/margins": 0.3171684741973877, + "rewards/rejected": 1.4089927673339844, + "step": 6767 + }, + { + "epoch": 1.1, + "learning_rate": 4.446521454987913e-06, + "logits/chosen": -0.9954492449760437, + "logits/rejected": -0.9945208430290222, + "logps/chosen": -59.90924072265625, + "logps/rejected": -141.03221130371094, + "loss": 0.9721, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8430213928222656, + "rewards/margins": -0.028365373611450195, + "rewards/rejected": 1.8713867664337158, + "step": 6768 + }, + { + "epoch": 1.1, + "learning_rate": 4.445215297118976e-06, + "logits/chosen": -1.2836415767669678, + "logits/rejected": -1.133309006690979, + "logps/chosen": -126.69261169433594, + "logps/rejected": -52.75648880004883, + "loss": 0.3883, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.264212131500244, + "rewards/margins": 2.4266369342803955, + "rewards/rejected": 1.8375751972198486, + "step": 6769 + }, + { + "epoch": 1.1, + "learning_rate": 4.443909177580412e-06, + "logits/chosen": -1.6707589626312256, + "logits/rejected": -1.5698460340499878, + "logps/chosen": -65.11665344238281, + "logps/rejected": -14.900964736938477, + "loss": 0.1468, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.45290994644165, + "rewards/margins": 5.394360542297363, + "rewards/rejected": 1.0585492849349976, + "step": 6770 + }, + { + "epoch": 1.1, + "learning_rate": 4.442603096462463e-06, + "logits/chosen": -1.3225921392440796, + "logits/rejected": -1.1944774389266968, + "logps/chosen": -103.1007080078125, + "logps/rejected": -61.33127975463867, + "loss": 0.1701, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.673696994781494, + "rewards/margins": 0.9435421228408813, + "rewards/rejected": 1.7301548719406128, + "step": 6771 + }, + { + "epoch": 1.1, + "learning_rate": 4.441297053855365e-06, + "logits/chosen": -1.1893985271453857, + "logits/rejected": -1.1893095970153809, + "logps/chosen": -102.67512512207031, + "logps/rejected": -105.15171813964844, + "loss": 0.5241, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.69317626953125, + "rewards/margins": -0.5874404907226562, + "rewards/rejected": 3.2806167602539062, + "step": 6772 + }, + { + "epoch": 1.1, + "learning_rate": 4.4399910498493545e-06, + "logits/chosen": -1.157183051109314, + "logits/rejected": -1.1252552270889282, + "logps/chosen": -71.80230712890625, + "logps/rejected": -50.52772521972656, + "loss": 0.1207, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.48308801651001, + "rewards/margins": 3.213013172149658, + "rewards/rejected": 1.2700748443603516, + "step": 6773 + }, + { + "epoch": 1.1, + "learning_rate": 4.438685084534663e-06, + "logits/chosen": -1.0594934225082397, + "logits/rejected": -1.0594934225082397, + "logps/chosen": -14.019073486328125, + "logps/rejected": -14.019073486328125, + "loss": 0.3703, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5785732269287109, + "rewards/margins": 0.0, + "rewards/rejected": 0.5785732269287109, + "step": 6774 + }, + { + "epoch": 1.1, + "learning_rate": 4.437379158001521e-06, + "logits/chosen": -1.194894790649414, + "logits/rejected": -0.9462695717811584, + "logps/chosen": -57.129066467285156, + "logps/rejected": -150.61199951171875, + "loss": 0.5627, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2841804027557373, + "rewards/margins": -0.45157408714294434, + "rewards/rejected": 3.7357544898986816, + "step": 6775 + }, + { + "epoch": 1.1, + "learning_rate": 4.436073270340156e-06, + "logits/chosen": -1.2934623956680298, + "logits/rejected": -1.15598726272583, + "logps/chosen": -124.43293762207031, + "logps/rejected": -26.521652221679688, + "loss": 0.1815, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5576508045196533, + "rewards/margins": 0.8276348114013672, + "rewards/rejected": 1.7300159931182861, + "step": 6776 + }, + { + "epoch": 1.1, + "learning_rate": 4.434767421640792e-06, + "logits/chosen": -0.7643440961837769, + "logits/rejected": -0.759690523147583, + "logps/chosen": -3.1212010383605957, + "logps/rejected": -2.514126777648926, + "loss": 0.3488, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26945778727531433, + "rewards/margins": 0.07640959322452545, + "rewards/rejected": 0.19304819405078888, + "step": 6777 + }, + { + "epoch": 1.1, + "learning_rate": 4.4334616119936516e-06, + "logits/chosen": -1.4385449886322021, + "logits/rejected": -1.100414514541626, + "logps/chosen": -69.93912506103516, + "logps/rejected": -121.59541320800781, + "loss": 0.2813, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.667682647705078, + "rewards/margins": 0.3006293773651123, + "rewards/rejected": 2.367053270339966, + "step": 6778 + }, + { + "epoch": 1.1, + "learning_rate": 4.432155841488952e-06, + "logits/chosen": -1.0625346899032593, + "logits/rejected": -1.0201858282089233, + "logps/chosen": -70.83607482910156, + "logps/rejected": -38.17998504638672, + "loss": 0.7166, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.30289626121521, + "rewards/margins": -1.159651756286621, + "rewards/rejected": 3.462548017501831, + "step": 6779 + }, + { + "epoch": 1.1, + "learning_rate": 4.4308501102169115e-06, + "logits/chosen": -1.473790168762207, + "logits/rejected": -1.473790168762207, + "logps/chosen": -71.32538604736328, + "logps/rejected": -71.32538604736328, + "loss": 0.3486, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.228222846984863, + "rewards/margins": 0.0, + "rewards/rejected": 4.228222846984863, + "step": 6780 + }, + { + "epoch": 1.1, + "learning_rate": 4.4295444182677425e-06, + "logits/chosen": -1.2247581481933594, + "logits/rejected": -1.3230259418487549, + "logps/chosen": -64.00028228759766, + "logps/rejected": -125.44207763671875, + "loss": 1.7137, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.498205661773682, + "rewards/margins": -3.3779540061950684, + "rewards/rejected": 7.87615966796875, + "step": 6781 + }, + { + "epoch": 1.1, + "learning_rate": 4.428238765731657e-06, + "logits/chosen": -1.3052362203598022, + "logits/rejected": -1.2758907079696655, + "logps/chosen": -94.24893188476562, + "logps/rejected": -70.54418182373047, + "loss": 0.4913, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0678176879882812, + "rewards/margins": -0.4001481533050537, + "rewards/rejected": 3.467965841293335, + "step": 6782 + }, + { + "epoch": 1.1, + "learning_rate": 4.426933152698862e-06, + "logits/chosen": -1.8204853534698486, + "logits/rejected": -1.7600969076156616, + "logps/chosen": -131.3173828125, + "logps/rejected": -26.32298469543457, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.303184509277344, + "rewards/margins": 3.633065938949585, + "rewards/rejected": 1.6701185703277588, + "step": 6783 + }, + { + "epoch": 1.1, + "learning_rate": 4.425627579259565e-06, + "logits/chosen": -1.6322821378707886, + "logits/rejected": -1.5773714780807495, + "logps/chosen": -35.76370620727539, + "logps/rejected": -73.38018798828125, + "loss": 0.3339, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.696688413619995, + "rewards/margins": 0.05674314498901367, + "rewards/rejected": 2.6399452686309814, + "step": 6784 + }, + { + "epoch": 1.1, + "learning_rate": 4.424322045503966e-06, + "logits/chosen": -1.3365949392318726, + "logits/rejected": -1.3093245029449463, + "logps/chosen": -142.57415771484375, + "logps/rejected": -126.7238540649414, + "loss": 0.486, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.773037910461426, + "rewards/margins": -0.22435808181762695, + "rewards/rejected": 6.997395992279053, + "step": 6785 + }, + { + "epoch": 1.1, + "learning_rate": 4.423016551522268e-06, + "logits/chosen": -1.376299262046814, + "logits/rejected": -1.131480097770691, + "logps/chosen": -102.13497161865234, + "logps/rejected": -38.16696548461914, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.060221195220947, + "rewards/margins": 3.9853763580322266, + "rewards/rejected": 0.07484474033117294, + "step": 6786 + }, + { + "epoch": 1.1, + "learning_rate": 4.421711097404666e-06, + "logits/chosen": -1.3711680173873901, + "logits/rejected": -1.3199659585952759, + "logps/chosen": -100.08074951171875, + "logps/rejected": -44.230587005615234, + "loss": 0.0273, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.243779182434082, + "rewards/margins": 4.896875381469727, + "rewards/rejected": 0.34690362215042114, + "step": 6787 + }, + { + "epoch": 1.1, + "learning_rate": 4.420405683241356e-06, + "logits/chosen": -0.97002774477005, + "logits/rejected": -0.9701403975486755, + "logps/chosen": -3.8003928661346436, + "logps/rejected": -3.4505295753479004, + "loss": 1.8826, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.340399831533432, + "rewards/margins": 0.005716413259506226, + "rewards/rejected": 0.3346834182739258, + "step": 6788 + }, + { + "epoch": 1.1, + "learning_rate": 4.419100309122528e-06, + "logits/chosen": -1.1293566226959229, + "logits/rejected": -1.1077097654342651, + "logps/chosen": -92.08634948730469, + "logps/rejected": -41.04573440551758, + "loss": 0.296, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.619331359863281, + "rewards/margins": 2.514787197113037, + "rewards/rejected": 2.104544162750244, + "step": 6789 + }, + { + "epoch": 1.1, + "learning_rate": 4.4177949751383725e-06, + "logits/chosen": -1.152808427810669, + "logits/rejected": -1.197205901145935, + "logps/chosen": -63.21523666381836, + "logps/rejected": -59.27193832397461, + "loss": 0.2545, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1102025508880615, + "rewards/margins": 0.9901200532913208, + "rewards/rejected": 1.1200824975967407, + "step": 6790 + }, + { + "epoch": 1.1, + "learning_rate": 4.416489681379076e-06, + "logits/chosen": -1.0097920894622803, + "logits/rejected": -1.0835694074630737, + "logps/chosen": -63.803382873535156, + "logps/rejected": -120.78094482421875, + "loss": 0.8348, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.723775625228882, + "rewards/margins": -0.5057106018066406, + "rewards/rejected": 3.2294862270355225, + "step": 6791 + }, + { + "epoch": 1.1, + "learning_rate": 4.41518442793482e-06, + "logits/chosen": -1.2629190683364868, + "logits/rejected": -0.8577538728713989, + "logps/chosen": -84.36039733886719, + "logps/rejected": -218.131591796875, + "loss": 3.0674, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1541688442230225, + "rewards/margins": -6.130318641662598, + "rewards/rejected": 8.2844877243042, + "step": 6792 + }, + { + "epoch": 1.1, + "learning_rate": 4.413879214895788e-06, + "logits/chosen": -0.7376730442047119, + "logits/rejected": -0.7283216118812561, + "logps/chosen": -25.224624633789062, + "logps/rejected": -9.850682258605957, + "loss": 0.3085, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4070453643798828, + "rewards/margins": 0.38908377289772034, + "rewards/rejected": 0.017961597070097923, + "step": 6793 + }, + { + "epoch": 1.1, + "learning_rate": 4.412574042352156e-06, + "logits/chosen": -1.1166691780090332, + "logits/rejected": -1.1416265964508057, + "logps/chosen": -55.4713134765625, + "logps/rejected": -98.87601470947266, + "loss": 0.9955, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4940506219863892, + "rewards/margins": 0.4989723563194275, + "rewards/rejected": 0.9950782656669617, + "step": 6794 + }, + { + "epoch": 1.1, + "learning_rate": 4.411268910394099e-06, + "logits/chosen": -1.1380088329315186, + "logits/rejected": -1.1566087007522583, + "logps/chosen": -55.21065902709961, + "logps/rejected": -38.27534103393555, + "loss": 0.5821, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.558370590209961, + "rewards/margins": 0.28101420402526855, + "rewards/rejected": 2.2773563861846924, + "step": 6795 + }, + { + "epoch": 1.1, + "learning_rate": 4.409963819111789e-06, + "logits/chosen": -1.3131440877914429, + "logits/rejected": -1.2728363275527954, + "logps/chosen": -181.08079528808594, + "logps/rejected": -57.18280029296875, + "loss": 0.0207, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.860083103179932, + "rewards/margins": 4.0985918045043945, + "rewards/rejected": 2.761491537094116, + "step": 6796 + }, + { + "epoch": 1.1, + "learning_rate": 4.4086587685953966e-06, + "logits/chosen": -1.1186116933822632, + "logits/rejected": -1.0722835063934326, + "logps/chosen": -125.23373413085938, + "logps/rejected": -50.42388916015625, + "loss": 0.2823, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0042800903320312, + "rewards/margins": 0.5087646245956421, + "rewards/rejected": 1.4955154657363892, + "step": 6797 + }, + { + "epoch": 1.1, + "learning_rate": 4.4073537589350885e-06, + "logits/chosen": -1.4028886556625366, + "logits/rejected": -1.3680671453475952, + "logps/chosen": -135.54486083984375, + "logps/rejected": -101.60887145996094, + "loss": 0.3698, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5180542469024658, + "rewards/margins": 0.6066910028457642, + "rewards/rejected": 0.9113632440567017, + "step": 6798 + }, + { + "epoch": 1.1, + "learning_rate": 4.406048790221027e-06, + "logits/chosen": -1.22748601436615, + "logits/rejected": -1.158826231956482, + "logps/chosen": -107.654052734375, + "logps/rejected": -68.36703491210938, + "loss": 0.2324, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7223236560821533, + "rewards/margins": 1.7789772748947144, + "rewards/rejected": 1.943346381187439, + "step": 6799 + }, + { + "epoch": 1.1, + "learning_rate": 4.4047438625433755e-06, + "logits/chosen": -1.1701209545135498, + "logits/rejected": -1.1740710735321045, + "logps/chosen": -92.50645446777344, + "logps/rejected": -94.03874969482422, + "loss": 0.2005, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.090474843978882, + "rewards/margins": 0.7431763410568237, + "rewards/rejected": 1.347298502922058, + "step": 6800 + }, + { + "epoch": 1.1, + "learning_rate": 4.4034389759922894e-06, + "logits/chosen": -1.2635984420776367, + "logits/rejected": -1.1383370161056519, + "logps/chosen": -129.99563598632812, + "logps/rejected": -44.69415283203125, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.9933061599731445, + "rewards/margins": 3.6950173377990723, + "rewards/rejected": 2.2982888221740723, + "step": 6801 + }, + { + "epoch": 1.1, + "learning_rate": 4.402134130657925e-06, + "logits/chosen": -1.041214942932129, + "logits/rejected": -1.022341012954712, + "logps/chosen": -27.610958099365234, + "logps/rejected": -4.760165691375732, + "loss": 0.3603, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7772804498672485, + "rewards/margins": 0.05015653371810913, + "rewards/rejected": 0.7271239161491394, + "step": 6802 + }, + { + "epoch": 1.1, + "learning_rate": 4.400829326630437e-06, + "logits/chosen": -1.1789798736572266, + "logits/rejected": -1.2022842168807983, + "logps/chosen": -62.45269012451172, + "logps/rejected": -46.311248779296875, + "loss": 0.6972, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.149878740310669, + "rewards/margins": 0.35362327098846436, + "rewards/rejected": 1.7962554693222046, + "step": 6803 + }, + { + "epoch": 1.1, + "learning_rate": 4.399524563999972e-06, + "logits/chosen": -1.3594528436660767, + "logits/rejected": -1.2358028888702393, + "logps/chosen": -52.274391174316406, + "logps/rejected": -10.258996963500977, + "loss": 0.4513, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.692604899406433, + "rewards/margins": 0.11479663848876953, + "rewards/rejected": 1.5778082609176636, + "step": 6804 + }, + { + "epoch": 1.1, + "learning_rate": 4.3982198428566775e-06, + "logits/chosen": -1.0110735893249512, + "logits/rejected": -0.9872137308120728, + "logps/chosen": -25.347265243530273, + "logps/rejected": -19.948923110961914, + "loss": 0.2377, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6765932440757751, + "rewards/margins": 0.5502403378486633, + "rewards/rejected": 0.12635289132595062, + "step": 6805 + }, + { + "epoch": 1.1, + "learning_rate": 4.396915163290698e-06, + "logits/chosen": -1.0294804573059082, + "logits/rejected": -1.0439549684524536, + "logps/chosen": -64.91619110107422, + "logps/rejected": -63.1490478515625, + "loss": 0.2197, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7056641578674316, + "rewards/margins": 1.1705918312072754, + "rewards/rejected": 1.5350723266601562, + "step": 6806 + }, + { + "epoch": 1.1, + "learning_rate": 4.395610525392175e-06, + "logits/chosen": -1.0957483053207397, + "logits/rejected": -1.073763370513916, + "logps/chosen": -67.51863098144531, + "logps/rejected": -145.93536376953125, + "loss": 1.617, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.52569580078125, + "rewards/margins": -3.1772918701171875, + "rewards/rejected": 5.7029876708984375, + "step": 6807 + }, + { + "epoch": 1.11, + "learning_rate": 4.3943059292512455e-06, + "logits/chosen": -1.1542561054229736, + "logits/rejected": -1.090085506439209, + "logps/chosen": -112.06254577636719, + "logps/rejected": -57.53847122192383, + "loss": 0.2373, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0862350463867188, + "rewards/margins": 0.5610615015029907, + "rewards/rejected": 1.525173544883728, + "step": 6808 + }, + { + "epoch": 1.11, + "learning_rate": 4.3930013749580445e-06, + "logits/chosen": -1.3453644514083862, + "logits/rejected": -1.3282662630081177, + "logps/chosen": -21.95366096496582, + "logps/rejected": -58.519561767578125, + "loss": 2.1724, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.643363356590271, + "rewards/margins": 0.36637210845947266, + "rewards/rejected": 1.2769912481307983, + "step": 6809 + }, + { + "epoch": 1.11, + "learning_rate": 4.391696862602706e-06, + "logits/chosen": -1.1930570602416992, + "logits/rejected": -1.2258436679840088, + "logps/chosen": -72.30572509765625, + "logps/rejected": -103.21861267089844, + "loss": 0.4294, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1367478370666504, + "rewards/margins": -0.19288945198059082, + "rewards/rejected": 2.329637289047241, + "step": 6810 + }, + { + "epoch": 1.11, + "learning_rate": 4.390392392275358e-06, + "logits/chosen": -1.344212532043457, + "logits/rejected": -1.3050336837768555, + "logps/chosen": -68.44090270996094, + "logps/rejected": -25.41058349609375, + "loss": 0.1423, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.338029623031616, + "rewards/margins": 1.1349332332611084, + "rewards/rejected": 1.2030963897705078, + "step": 6811 + }, + { + "epoch": 1.11, + "learning_rate": 4.389087964066128e-06, + "logits/chosen": -1.4200413227081299, + "logits/rejected": -1.373999834060669, + "logps/chosen": -60.55481719970703, + "logps/rejected": -22.83332633972168, + "loss": 0.4212, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6307106018066406, + "rewards/margins": 1.0062536001205444, + "rewards/rejected": 1.6244570016860962, + "step": 6812 + }, + { + "epoch": 1.11, + "learning_rate": 4.387783578065139e-06, + "logits/chosen": -0.909615695476532, + "logits/rejected": -0.6913045048713684, + "logps/chosen": -48.948509216308594, + "logps/rejected": -15.834444046020508, + "loss": 0.0435, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.144228458404541, + "rewards/margins": 2.7014060020446777, + "rewards/rejected": 0.4428224563598633, + "step": 6813 + }, + { + "epoch": 1.11, + "learning_rate": 4.386479234362512e-06, + "logits/chosen": -1.15776526927948, + "logits/rejected": -1.1341711282730103, + "logps/chosen": -69.00128173828125, + "logps/rejected": -128.8772430419922, + "loss": 0.9115, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.048719882965088, + "rewards/margins": -0.6719710826873779, + "rewards/rejected": 2.720690965652466, + "step": 6814 + }, + { + "epoch": 1.11, + "learning_rate": 4.385174933048364e-06, + "logits/chosen": -1.6418726444244385, + "logits/rejected": -1.665967345237732, + "logps/chosen": -54.11920166015625, + "logps/rejected": -62.012271881103516, + "loss": 0.0912, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4631950855255127, + "rewards/margins": 1.7753658294677734, + "rewards/rejected": 1.6878292560577393, + "step": 6815 + }, + { + "epoch": 1.11, + "learning_rate": 4.383870674212811e-06, + "logits/chosen": -1.5039489269256592, + "logits/rejected": -1.4871469736099243, + "logps/chosen": -57.34559631347656, + "logps/rejected": -89.4647216796875, + "loss": 0.5914, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0371901988983154, + "rewards/margins": -0.7705116271972656, + "rewards/rejected": 3.807701826095581, + "step": 6816 + }, + { + "epoch": 1.11, + "learning_rate": 4.382566457945965e-06, + "logits/chosen": -1.0950621366500854, + "logits/rejected": -1.0950621366500854, + "logps/chosen": -57.93891906738281, + "logps/rejected": -57.93891906738281, + "loss": 0.3488, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0084190368652344, + "rewards/margins": 0.0, + "rewards/rejected": 3.0084190368652344, + "step": 6817 + }, + { + "epoch": 1.11, + "learning_rate": 4.381262284337934e-06, + "logits/chosen": -1.4011937379837036, + "logits/rejected": -1.3859333992004395, + "logps/chosen": -154.13792419433594, + "logps/rejected": -96.85901641845703, + "loss": 0.3363, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.756455898284912, + "rewards/margins": 0.10619401931762695, + "rewards/rejected": 6.650261878967285, + "step": 6818 + }, + { + "epoch": 1.11, + "learning_rate": 4.379958153478824e-06, + "logits/chosen": -1.194640040397644, + "logits/rejected": -1.1899183988571167, + "logps/chosen": -124.48612976074219, + "logps/rejected": -108.045166015625, + "loss": 0.4984, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.550685167312622, + "rewards/margins": -0.5200729370117188, + "rewards/rejected": 3.070758104324341, + "step": 6819 + }, + { + "epoch": 1.11, + "learning_rate": 4.37865406545874e-06, + "logits/chosen": -0.9701068997383118, + "logits/rejected": -0.9645780324935913, + "logps/chosen": -9.0750732421875, + "logps/rejected": -23.80504035949707, + "loss": 0.5397, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.931412935256958, + "rewards/margins": 0.052871108055114746, + "rewards/rejected": 1.8785418272018433, + "step": 6820 + }, + { + "epoch": 1.11, + "learning_rate": 4.37735002036778e-06, + "logits/chosen": -1.1661170721054077, + "logits/rejected": -1.1906424760818481, + "logps/chosen": -60.445526123046875, + "logps/rejected": -55.271358489990234, + "loss": 0.5933, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7510926723480225, + "rewards/margins": 0.8496320247650146, + "rewards/rejected": 1.9014606475830078, + "step": 6821 + }, + { + "epoch": 1.11, + "learning_rate": 4.376046018296043e-06, + "logits/chosen": -1.1598528623580933, + "logits/rejected": -0.9901909232139587, + "logps/chosen": -101.68435668945312, + "logps/rejected": -57.30104064941406, + "loss": 0.4562, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.602804660797119, + "rewards/margins": 2.588961124420166, + "rewards/rejected": 3.013843536376953, + "step": 6822 + }, + { + "epoch": 1.11, + "learning_rate": 4.374742059333621e-06, + "logits/chosen": -1.2662146091461182, + "logits/rejected": -1.2101129293441772, + "logps/chosen": -89.90301513671875, + "logps/rejected": -44.07776641845703, + "loss": 0.2396, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2788779735565186, + "rewards/margins": 0.8823665380477905, + "rewards/rejected": 1.396511435508728, + "step": 6823 + }, + { + "epoch": 1.11, + "learning_rate": 4.3734381435706075e-06, + "logits/chosen": -1.2820119857788086, + "logits/rejected": -1.3917765617370605, + "logps/chosen": -33.131568908691406, + "logps/rejected": -126.1250228881836, + "loss": 3.3147, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3734002113342285, + "rewards/margins": -4.1234540939331055, + "rewards/rejected": 6.496854305267334, + "step": 6824 + }, + { + "epoch": 1.11, + "learning_rate": 4.372134271097089e-06, + "logits/chosen": -1.343888282775879, + "logits/rejected": -1.1420974731445312, + "logps/chosen": -87.93860626220703, + "logps/rejected": -27.879390716552734, + "loss": 0.0485, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.110446929931641, + "rewards/margins": 3.7997496128082275, + "rewards/rejected": 0.31069737672805786, + "step": 6825 + }, + { + "epoch": 1.11, + "learning_rate": 4.370830442003152e-06, + "logits/chosen": -1.4536645412445068, + "logits/rejected": -1.5163527727127075, + "logps/chosen": -162.29244995117188, + "logps/rejected": -74.31719970703125, + "loss": 0.2454, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.71439528465271, + "rewards/margins": 0.9461060762405396, + "rewards/rejected": 1.7682892084121704, + "step": 6826 + }, + { + "epoch": 1.11, + "learning_rate": 4.369526656378878e-06, + "logits/chosen": -1.4598442316055298, + "logits/rejected": -1.3237448930740356, + "logps/chosen": -121.71757507324219, + "logps/rejected": -32.17552185058594, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.091189861297607, + "rewards/margins": 4.123333930969238, + "rewards/rejected": 2.967855930328369, + "step": 6827 + }, + { + "epoch": 1.11, + "learning_rate": 4.3682229143143465e-06, + "logits/chosen": -0.8785098791122437, + "logits/rejected": -0.8763087391853333, + "logps/chosen": -3.675513505935669, + "logps/rejected": -4.579911231994629, + "loss": 0.8837, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.41692811250686646, + "rewards/margins": -0.08610266447067261, + "rewards/rejected": 0.5030307769775391, + "step": 6828 + }, + { + "epoch": 1.11, + "learning_rate": 4.366919215899634e-06, + "logits/chosen": -1.260669469833374, + "logits/rejected": -1.251911997795105, + "logps/chosen": -51.69219207763672, + "logps/rejected": -90.85166931152344, + "loss": 0.2345, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9890968799591064, + "rewards/margins": 0.5667457580566406, + "rewards/rejected": 1.4223511219024658, + "step": 6829 + }, + { + "epoch": 1.11, + "learning_rate": 4.365615561224813e-06, + "logits/chosen": -1.2477885484695435, + "logits/rejected": -1.2862457036972046, + "logps/chosen": -45.728660583496094, + "logps/rejected": -78.30245971679688, + "loss": 0.2716, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.858188152313232, + "rewards/margins": 1.356339454650879, + "rewards/rejected": 3.5018486976623535, + "step": 6830 + }, + { + "epoch": 1.11, + "learning_rate": 4.364311950379954e-06, + "logits/chosen": -1.2337634563446045, + "logits/rejected": -1.1778924465179443, + "logps/chosen": -49.82264709472656, + "logps/rejected": -61.5137939453125, + "loss": 0.9139, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3498528003692627, + "rewards/margins": 0.5632469654083252, + "rewards/rejected": 2.7866058349609375, + "step": 6831 + }, + { + "epoch": 1.11, + "learning_rate": 4.363008383455124e-06, + "logits/chosen": -1.4209357500076294, + "logits/rejected": -1.5588796138763428, + "logps/chosen": -62.520118713378906, + "logps/rejected": -37.507022857666016, + "loss": 1.4089, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2786941528320312, + "rewards/margins": 3.0479583740234375, + "rewards/rejected": 0.23073577880859375, + "step": 6832 + }, + { + "epoch": 1.11, + "learning_rate": 4.361704860540388e-06, + "logits/chosen": -1.1573292016983032, + "logits/rejected": -1.1980693340301514, + "logps/chosen": -32.79582214355469, + "logps/rejected": -55.75285339355469, + "loss": 1.1517, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.561906099319458, + "rewards/margins": 0.22244536876678467, + "rewards/rejected": 1.3394607305526733, + "step": 6833 + }, + { + "epoch": 1.11, + "learning_rate": 4.360401381725806e-06, + "logits/chosen": -1.073061227798462, + "logits/rejected": -0.9915381073951721, + "logps/chosen": -129.97813415527344, + "logps/rejected": -85.31957244873047, + "loss": 0.2128, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9991517066955566, + "rewards/margins": 1.104128360748291, + "rewards/rejected": 1.8950233459472656, + "step": 6834 + }, + { + "epoch": 1.11, + "learning_rate": 4.359097947101437e-06, + "logits/chosen": -1.2615002393722534, + "logits/rejected": -1.0675052404403687, + "logps/chosen": -72.17353820800781, + "logps/rejected": -69.72776794433594, + "loss": 0.6033, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.326999187469482, + "rewards/margins": 1.7279994487762451, + "rewards/rejected": 2.5989997386932373, + "step": 6835 + }, + { + "epoch": 1.11, + "learning_rate": 4.357794556757336e-06, + "logits/chosen": -1.1596508026123047, + "logits/rejected": -1.1590179204940796, + "logps/chosen": -51.01642608642578, + "logps/rejected": -80.65576171875, + "loss": 0.4035, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0809807777404785, + "rewards/margins": 0.34162986278533936, + "rewards/rejected": 1.7393509149551392, + "step": 6836 + }, + { + "epoch": 1.11, + "learning_rate": 4.3564912107835536e-06, + "logits/chosen": -1.119215726852417, + "logits/rejected": -1.2720164060592651, + "logps/chosen": -92.29393768310547, + "logps/rejected": -129.698486328125, + "loss": 2.284, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.904064178466797, + "rewards/margins": -3.8539133071899414, + "rewards/rejected": 6.757977485656738, + "step": 6837 + }, + { + "epoch": 1.11, + "learning_rate": 4.35518790927014e-06, + "logits/chosen": -1.53080415725708, + "logits/rejected": -1.55734384059906, + "logps/chosen": -33.444190979003906, + "logps/rejected": -70.4629898071289, + "loss": 0.7888, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.141125440597534, + "rewards/margins": -0.9028680324554443, + "rewards/rejected": 3.0439934730529785, + "step": 6838 + }, + { + "epoch": 1.11, + "learning_rate": 4.353884652307141e-06, + "logits/chosen": -1.3331021070480347, + "logits/rejected": -1.3023349046707153, + "logps/chosen": -70.21812438964844, + "logps/rejected": -129.04881286621094, + "loss": 0.1479, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.084547519683838, + "rewards/margins": 2.9244821071624756, + "rewards/rejected": -0.8399345278739929, + "step": 6839 + }, + { + "epoch": 1.11, + "learning_rate": 4.352581439984598e-06, + "logits/chosen": -1.1872050762176514, + "logits/rejected": -1.225215196609497, + "logps/chosen": -179.88157653808594, + "logps/rejected": -101.82450103759766, + "loss": 0.0299, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.786924839019775, + "rewards/margins": 5.007479190826416, + "rewards/rejected": 1.7794456481933594, + "step": 6840 + }, + { + "epoch": 1.11, + "learning_rate": 4.351278272392552e-06, + "logits/chosen": -0.9532220363616943, + "logits/rejected": -0.9034401178359985, + "logps/chosen": -64.8678970336914, + "logps/rejected": -40.60527801513672, + "loss": 0.8483, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.107712507247925, + "rewards/margins": -0.8066589832305908, + "rewards/rejected": 2.9143714904785156, + "step": 6841 + }, + { + "epoch": 1.11, + "learning_rate": 4.349975149621039e-06, + "logits/chosen": -1.2234413623809814, + "logits/rejected": -1.163469672203064, + "logps/chosen": -70.09799194335938, + "logps/rejected": -63.97566604614258, + "loss": 0.8149, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5953445434570312, + "rewards/margins": 1.2268283367156982, + "rewards/rejected": 1.368516206741333, + "step": 6842 + }, + { + "epoch": 1.11, + "learning_rate": 4.348672071760093e-06, + "logits/chosen": -1.1298695802688599, + "logits/rejected": -1.0410860776901245, + "logps/chosen": -45.56436538696289, + "logps/rejected": -15.234123229980469, + "loss": 0.5025, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7191364765167236, + "rewards/margins": 0.9371044635772705, + "rewards/rejected": 0.7820320129394531, + "step": 6843 + }, + { + "epoch": 1.11, + "learning_rate": 4.347369038899744e-06, + "logits/chosen": -1.1742684841156006, + "logits/rejected": -1.1198501586914062, + "logps/chosen": -81.110595703125, + "logps/rejected": -19.40463638305664, + "loss": 0.2851, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9022026062011719, + "rewards/margins": 0.8549436330795288, + "rewards/rejected": 0.047258950769901276, + "step": 6844 + }, + { + "epoch": 1.11, + "learning_rate": 4.346066051130018e-06, + "logits/chosen": -1.596466302871704, + "logits/rejected": -1.5917606353759766, + "logps/chosen": -64.17574310302734, + "logps/rejected": -101.90451049804688, + "loss": 0.1977, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.6512980461120605, + "rewards/margins": 0.7713985443115234, + "rewards/rejected": 4.879899501800537, + "step": 6845 + }, + { + "epoch": 1.11, + "learning_rate": 4.344763108540941e-06, + "logits/chosen": -1.1920714378356934, + "logits/rejected": -1.2342971563339233, + "logps/chosen": -80.94847869873047, + "logps/rejected": -78.07820892333984, + "loss": 0.7948, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5536065101623535, + "rewards/margins": -0.9531059265136719, + "rewards/rejected": 3.5067124366760254, + "step": 6846 + }, + { + "epoch": 1.11, + "learning_rate": 4.343460211222534e-06, + "logits/chosen": -1.2223689556121826, + "logits/rejected": -1.2730956077575684, + "logps/chosen": -219.79147338867188, + "logps/rejected": -94.25846862792969, + "loss": 0.0402, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.657339572906494, + "rewards/margins": 4.110386848449707, + "rewards/rejected": 1.5469528436660767, + "step": 6847 + }, + { + "epoch": 1.11, + "learning_rate": 4.3421573592648135e-06, + "logits/chosen": -1.443312406539917, + "logits/rejected": -1.3144989013671875, + "logps/chosen": -97.18304443359375, + "logps/rejected": -59.84970474243164, + "loss": 0.5524, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.754825115203857, + "rewards/margins": 2.5228164196014404, + "rewards/rejected": 3.232008695602417, + "step": 6848 + }, + { + "epoch": 1.11, + "learning_rate": 4.340854552757795e-06, + "logits/chosen": -1.1731936931610107, + "logits/rejected": -1.1731936931610107, + "logps/chosen": -60.29231262207031, + "logps/rejected": -60.29231262207031, + "loss": 0.3756, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5194191932678223, + "rewards/margins": 0.0, + "rewards/rejected": 2.5194191932678223, + "step": 6849 + }, + { + "epoch": 1.11, + "learning_rate": 4.33955179179149e-06, + "logits/chosen": -1.8364241123199463, + "logits/rejected": -1.7547513246536255, + "logps/chosen": -82.51521301269531, + "logps/rejected": -90.71800231933594, + "loss": 0.0439, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.046008586883545, + "rewards/margins": 4.436811447143555, + "rewards/rejected": 2.6091973781585693, + "step": 6850 + }, + { + "epoch": 1.11, + "learning_rate": 4.338249076455907e-06, + "logits/chosen": -1.1714602708816528, + "logits/rejected": -1.1578409671783447, + "logps/chosen": -33.41166687011719, + "logps/rejected": -52.64512634277344, + "loss": 0.8836, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7897675037384033, + "rewards/margins": 0.5943381786346436, + "rewards/rejected": 2.1954293251037598, + "step": 6851 + }, + { + "epoch": 1.11, + "learning_rate": 4.336946406841051e-06, + "logits/chosen": -1.581578254699707, + "logits/rejected": -1.427483320236206, + "logps/chosen": -183.62939453125, + "logps/rejected": -85.6595687866211, + "loss": 0.1431, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.766208171844482, + "rewards/margins": 3.298673629760742, + "rewards/rejected": 4.46753454208374, + "step": 6852 + }, + { + "epoch": 1.11, + "learning_rate": 4.3356437830369244e-06, + "logits/chosen": -1.1516783237457275, + "logits/rejected": -1.094356894493103, + "logps/chosen": -65.27250671386719, + "logps/rejected": -41.41362380981445, + "loss": 0.4272, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.337592363357544, + "rewards/margins": -0.09610247611999512, + "rewards/rejected": 2.433694839477539, + "step": 6853 + }, + { + "epoch": 1.11, + "learning_rate": 4.334341205133527e-06, + "logits/chosen": -0.8976556062698364, + "logits/rejected": -0.8955286741256714, + "logps/chosen": -3.2292792797088623, + "logps/rejected": -2.3320083618164062, + "loss": 0.3612, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3609074354171753, + "rewards/margins": 0.06710842251777649, + "rewards/rejected": 0.2937990128993988, + "step": 6854 + }, + { + "epoch": 1.11, + "learning_rate": 4.333038673220853e-06, + "logits/chosen": -1.473107099533081, + "logits/rejected": -1.646229863166809, + "logps/chosen": -90.04241943359375, + "logps/rejected": -45.75151443481445, + "loss": 0.761, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2400436401367188, + "rewards/margins": 0.08744919300079346, + "rewards/rejected": 1.1525944471359253, + "step": 6855 + }, + { + "epoch": 1.11, + "learning_rate": 4.331736187388896e-06, + "logits/chosen": -1.2017927169799805, + "logits/rejected": -1.196938157081604, + "logps/chosen": -3.020627737045288, + "logps/rejected": -2.4576079845428467, + "loss": 0.4055, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.46244269609451294, + "rewards/margins": -0.09455764293670654, + "rewards/rejected": 0.5570003390312195, + "step": 6856 + }, + { + "epoch": 1.11, + "learning_rate": 4.330433747727644e-06, + "logits/chosen": -1.0111922025680542, + "logits/rejected": -1.0306646823883057, + "logps/chosen": -14.46401596069336, + "logps/rejected": -23.328378677368164, + "loss": 0.2904, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5160438418388367, + "rewards/margins": 0.4445180892944336, + "rewards/rejected": 0.07152576744556427, + "step": 6857 + }, + { + "epoch": 1.11, + "learning_rate": 4.329131354327087e-06, + "logits/chosen": -1.3354425430297852, + "logits/rejected": -1.082857370376587, + "logps/chosen": -167.2833251953125, + "logps/rejected": -51.624595642089844, + "loss": 0.2925, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.390689373016357, + "rewards/margins": 1.8954639434814453, + "rewards/rejected": 4.495225429534912, + "step": 6858 + }, + { + "epoch": 1.11, + "learning_rate": 4.3278290072772036e-06, + "logits/chosen": -0.8545904755592346, + "logits/rejected": -0.8545904755592346, + "logps/chosen": -35.21345520019531, + "logps/rejected": -35.21345520019531, + "loss": 0.4407, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.37029990553855896, + "rewards/margins": 0.0, + "rewards/rejected": 0.37029990553855896, + "step": 6859 + }, + { + "epoch": 1.11, + "learning_rate": 4.326526706667977e-06, + "logits/chosen": -1.110982060432434, + "logits/rejected": -1.0960818529129028, + "logps/chosen": -35.6422119140625, + "logps/rejected": -29.66208267211914, + "loss": 0.2998, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3497092723846436, + "rewards/margins": 0.20482349395751953, + "rewards/rejected": 3.144885778427124, + "step": 6860 + }, + { + "epoch": 1.11, + "learning_rate": 4.3252244525893815e-06, + "logits/chosen": -1.3255977630615234, + "logits/rejected": -1.3376364707946777, + "logps/chosen": -68.5281982421875, + "logps/rejected": -50.97449493408203, + "loss": 1.2365, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4266494512557983, + "rewards/margins": -2.329164981842041, + "rewards/rejected": 3.75581431388855, + "step": 6861 + }, + { + "epoch": 1.11, + "learning_rate": 4.323922245131392e-06, + "logits/chosen": -1.3430507183074951, + "logits/rejected": -1.3964412212371826, + "logps/chosen": -78.21533203125, + "logps/rejected": -118.43219757080078, + "loss": 0.3731, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.502737522125244, + "rewards/margins": 2.910703420639038, + "rewards/rejected": 0.5920341610908508, + "step": 6862 + }, + { + "epoch": 1.11, + "learning_rate": 4.322620084383979e-06, + "logits/chosen": -1.0816223621368408, + "logits/rejected": -1.1024583578109741, + "logps/chosen": -148.33343505859375, + "logps/rejected": -73.708984375, + "loss": 0.7306, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.881314277648926, + "rewards/margins": 2.405430793762207, + "rewards/rejected": 2.4758834838867188, + "step": 6863 + }, + { + "epoch": 1.11, + "learning_rate": 4.321317970437108e-06, + "logits/chosen": -1.119126319885254, + "logits/rejected": -1.1011112928390503, + "logps/chosen": -88.00411987304688, + "logps/rejected": -104.7934341430664, + "loss": 0.2565, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.188744306564331, + "rewards/margins": 0.6776450872421265, + "rewards/rejected": 1.5110992193222046, + "step": 6864 + }, + { + "epoch": 1.11, + "learning_rate": 4.320015903380743e-06, + "logits/chosen": -1.3530967235565186, + "logits/rejected": -1.3993645906448364, + "logps/chosen": -75.02638244628906, + "logps/rejected": -106.00697326660156, + "loss": 1.475, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7986695766448975, + "rewards/margins": 1.7824167013168335, + "rewards/rejected": 1.016252875328064, + "step": 6865 + }, + { + "epoch": 1.11, + "learning_rate": 4.318713883304846e-06, + "logits/chosen": -1.3221443891525269, + "logits/rejected": -1.3635767698287964, + "logps/chosen": -104.47821807861328, + "logps/rejected": -111.30778503417969, + "loss": 0.5619, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.214083194732666, + "rewards/margins": -0.14999914169311523, + "rewards/rejected": 5.364082336425781, + "step": 6866 + }, + { + "epoch": 1.11, + "learning_rate": 4.3174119102993725e-06, + "logits/chosen": -0.8331798911094666, + "logits/rejected": -0.8302367925643921, + "logps/chosen": -18.01605987548828, + "logps/rejected": -36.43516540527344, + "loss": 0.726, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5974010825157166, + "rewards/margins": 0.10608524084091187, + "rewards/rejected": 0.4913158416748047, + "step": 6867 + }, + { + "epoch": 1.11, + "learning_rate": 4.316109984454278e-06, + "logits/chosen": -1.3502600193023682, + "logits/rejected": -1.421445608139038, + "logps/chosen": -79.6621322631836, + "logps/rejected": -100.50794982910156, + "loss": 1.8842, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1998238563537598, + "rewards/margins": -2.8120641708374023, + "rewards/rejected": 6.011888027191162, + "step": 6868 + }, + { + "epoch": 1.11, + "learning_rate": 4.314808105859513e-06, + "logits/chosen": -0.8832705616950989, + "logits/rejected": -0.767979621887207, + "logps/chosen": -56.003597259521484, + "logps/rejected": -52.514564514160156, + "loss": 0.2666, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.820335865020752, + "rewards/margins": 1.3354694843292236, + "rewards/rejected": 2.4848663806915283, + "step": 6869 + }, + { + "epoch": 1.12, + "learning_rate": 4.313506274605023e-06, + "logits/chosen": -1.3877497911453247, + "logits/rejected": -1.2733330726623535, + "logps/chosen": -74.1142807006836, + "logps/rejected": -65.64128112792969, + "loss": 0.2258, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0918540954589844, + "rewards/margins": 0.8509140014648438, + "rewards/rejected": 1.2409400939941406, + "step": 6870 + }, + { + "epoch": 1.12, + "learning_rate": 4.312204490780755e-06, + "logits/chosen": -1.2058931589126587, + "logits/rejected": -1.1011453866958618, + "logps/chosen": -75.7384033203125, + "logps/rejected": -26.49429702758789, + "loss": 0.0919, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0972702503204346, + "rewards/margins": 1.6580051183700562, + "rewards/rejected": 1.4392651319503784, + "step": 6871 + }, + { + "epoch": 1.12, + "learning_rate": 4.310902754476648e-06, + "logits/chosen": -1.1661874055862427, + "logits/rejected": -1.1275224685668945, + "logps/chosen": -90.72457885742188, + "logps/rejected": -56.85540771484375, + "loss": 0.3102, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.041708469390869, + "rewards/margins": 1.5714333057403564, + "rewards/rejected": 2.4702751636505127, + "step": 6872 + }, + { + "epoch": 1.12, + "learning_rate": 4.30960106578264e-06, + "logits/chosen": -1.0672407150268555, + "logits/rejected": -1.0709978342056274, + "logps/chosen": -19.832950592041016, + "logps/rejected": -60.21503448486328, + "loss": 0.1836, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6825852394104004, + "rewards/margins": 0.8181229829788208, + "rewards/rejected": 1.8644622564315796, + "step": 6873 + }, + { + "epoch": 1.12, + "learning_rate": 4.308299424788667e-06, + "logits/chosen": -1.0508335828781128, + "logits/rejected": -1.1353161334991455, + "logps/chosen": -13.290420532226562, + "logps/rejected": -39.91722106933594, + "loss": 2.032, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1908527612686157, + "rewards/margins": -3.0494885444641113, + "rewards/rejected": 4.2403411865234375, + "step": 6874 + }, + { + "epoch": 1.12, + "learning_rate": 4.306997831584658e-06, + "logits/chosen": -0.9094317555427551, + "logits/rejected": -0.7994788885116577, + "logps/chosen": -58.00130081176758, + "logps/rejected": -43.89040756225586, + "loss": 0.5452, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.673823118209839, + "rewards/margins": 0.9177035093307495, + "rewards/rejected": 1.7561196088790894, + "step": 6875 + }, + { + "epoch": 1.12, + "learning_rate": 4.305696286260541e-06, + "logits/chosen": -0.9896169304847717, + "logits/rejected": -0.9978487491607666, + "logps/chosen": -9.249218940734863, + "logps/rejected": -1.2042099237442017, + "loss": 0.4315, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1958146095275879, + "rewards/margins": -0.10769400000572205, + "rewards/rejected": 0.30350860953330994, + "step": 6876 + }, + { + "epoch": 1.12, + "learning_rate": 4.304394788906242e-06, + "logits/chosen": -1.1832869052886963, + "logits/rejected": -1.2846145629882812, + "logps/chosen": -25.791717529296875, + "logps/rejected": -103.25616455078125, + "loss": 1.2777, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.52082896232605, + "rewards/margins": -1.7978646755218506, + "rewards/rejected": 4.3186936378479, + "step": 6877 + }, + { + "epoch": 1.12, + "learning_rate": 4.3030933396116815e-06, + "logits/chosen": -1.4917247295379639, + "logits/rejected": -1.2813639640808105, + "logps/chosen": -76.55999755859375, + "logps/rejected": -67.61070251464844, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.1181159019470215, + "rewards/margins": 3.692556619644165, + "rewards/rejected": 1.4255592823028564, + "step": 6878 + }, + { + "epoch": 1.12, + "learning_rate": 4.301791938466776e-06, + "logits/chosen": -1.719712257385254, + "logits/rejected": -1.6915984153747559, + "logps/chosen": -263.6766357421875, + "logps/rejected": -107.85566711425781, + "loss": 0.0579, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.274273872375488, + "rewards/margins": 2.0976595878601074, + "rewards/rejected": 4.176614284515381, + "step": 6879 + }, + { + "epoch": 1.12, + "learning_rate": 4.300490585561442e-06, + "logits/chosen": -1.1887778043746948, + "logits/rejected": -1.4138641357421875, + "logps/chosen": -31.837665557861328, + "logps/rejected": -63.81190490722656, + "loss": 0.5572, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5909489393234253, + "rewards/margins": -0.6657794713973999, + "rewards/rejected": 2.256728410720825, + "step": 6880 + }, + { + "epoch": 1.12, + "learning_rate": 4.299189280985589e-06, + "logits/chosen": -1.0199066400527954, + "logits/rejected": -0.9829941987991333, + "logps/chosen": -21.337848663330078, + "logps/rejected": -6.129405975341797, + "loss": 0.4377, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7301790118217468, + "rewards/margins": 0.07986617088317871, + "rewards/rejected": 0.6503128409385681, + "step": 6881 + }, + { + "epoch": 1.12, + "learning_rate": 4.297888024829126e-06, + "logits/chosen": -1.242845892906189, + "logits/rejected": -1.3720154762268066, + "logps/chosen": -201.59378051757812, + "logps/rejected": -134.29547119140625, + "loss": 0.08, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.705685615539551, + "rewards/margins": 2.2377395629882812, + "rewards/rejected": 5.4679460525512695, + "step": 6882 + }, + { + "epoch": 1.12, + "learning_rate": 4.296586817181957e-06, + "logits/chosen": -1.1503942012786865, + "logits/rejected": -1.1160104274749756, + "logps/chosen": -39.62464904785156, + "logps/rejected": -30.656726837158203, + "loss": 0.322, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2014553546905518, + "rewards/margins": 0.10905647277832031, + "rewards/rejected": 1.0923988819122314, + "step": 6883 + }, + { + "epoch": 1.12, + "learning_rate": 4.295285658133983e-06, + "logits/chosen": -1.3120933771133423, + "logits/rejected": -0.934941291809082, + "logps/chosen": -140.93833923339844, + "logps/rejected": -33.97393035888672, + "loss": 0.0398, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.088778972625732, + "rewards/margins": 3.1257612705230713, + "rewards/rejected": 1.9630177021026611, + "step": 6884 + }, + { + "epoch": 1.12, + "learning_rate": 4.293984547775102e-06, + "logits/chosen": -0.9164154529571533, + "logits/rejected": -0.9164154529571533, + "logps/chosen": -1.635940670967102, + "logps/rejected": -1.635940670967102, + "loss": 0.5535, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.543782651424408, + "rewards/margins": 0.0, + "rewards/rejected": 0.543782651424408, + "step": 6885 + }, + { + "epoch": 1.12, + "learning_rate": 4.292683486195208e-06, + "logits/chosen": -0.9920939803123474, + "logits/rejected": -1.050304889678955, + "logps/chosen": -91.06932067871094, + "logps/rejected": -70.54129028320312, + "loss": 0.4538, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.279985189437866, + "rewards/margins": 1.4304841756820679, + "rewards/rejected": 1.8495010137557983, + "step": 6886 + }, + { + "epoch": 1.12, + "learning_rate": 4.291382473484193e-06, + "logits/chosen": -1.407052993774414, + "logits/rejected": -1.397325038909912, + "logps/chosen": -70.86869812011719, + "logps/rejected": -44.68001937866211, + "loss": 0.4624, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9435081481933594, + "rewards/margins": -0.4180881977081299, + "rewards/rejected": 2.3615963459014893, + "step": 6887 + }, + { + "epoch": 1.12, + "learning_rate": 4.2900815097319436e-06, + "logits/chosen": -1.3646906614303589, + "logits/rejected": -1.3175128698349, + "logps/chosen": -60.52800750732422, + "logps/rejected": -42.56267166137695, + "loss": 0.2625, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.102541446685791, + "rewards/margins": 0.44907045364379883, + "rewards/rejected": 3.653470993041992, + "step": 6888 + }, + { + "epoch": 1.12, + "learning_rate": 4.2887805950283455e-06, + "logits/chosen": -0.9913368225097656, + "logits/rejected": -0.9913368225097656, + "logps/chosen": -44.185569763183594, + "logps/rejected": -44.185569763183594, + "loss": 0.3495, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.374196767807007, + "rewards/margins": 0.0, + "rewards/rejected": 2.374196767807007, + "step": 6889 + }, + { + "epoch": 1.12, + "learning_rate": 4.2874797294632776e-06, + "logits/chosen": -1.1244323253631592, + "logits/rejected": -1.1244323253631592, + "logps/chosen": -61.290374755859375, + "logps/rejected": -61.290374755859375, + "loss": 1.0742, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1644058227539062, + "rewards/margins": 0.0, + "rewards/rejected": 2.1644058227539062, + "step": 6890 + }, + { + "epoch": 1.12, + "learning_rate": 4.286178913126619e-06, + "logits/chosen": -0.9270116686820984, + "logits/rejected": -0.857367992401123, + "logps/chosen": -86.85011291503906, + "logps/rejected": -60.653594970703125, + "loss": 1.3295, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4382949769496918, + "rewards/margins": -2.5470879077911377, + "rewards/rejected": 2.9853827953338623, + "step": 6891 + }, + { + "epoch": 1.12, + "learning_rate": 4.284878146108244e-06, + "logits/chosen": -1.2973774671554565, + "logits/rejected": -0.9784399271011353, + "logps/chosen": -42.52461242675781, + "logps/rejected": -55.143585205078125, + "loss": 0.4104, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.83982253074646, + "rewards/margins": -0.22536301612854004, + "rewards/rejected": 4.065185546875, + "step": 6892 + }, + { + "epoch": 1.12, + "learning_rate": 4.28357742849802e-06, + "logits/chosen": -1.3154176473617554, + "logits/rejected": -1.3154176473617554, + "logps/chosen": -30.324861526489258, + "logps/rejected": -30.324861526489258, + "loss": 0.3663, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.929817795753479, + "rewards/margins": 0.0, + "rewards/rejected": 1.929817795753479, + "step": 6893 + }, + { + "epoch": 1.12, + "learning_rate": 4.2822767603858185e-06, + "logits/chosen": -1.4372596740722656, + "logits/rejected": -1.4136511087417603, + "logps/chosen": -75.38597106933594, + "logps/rejected": -95.7408676147461, + "loss": 0.7, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8539031744003296, + "rewards/margins": -1.0528992414474487, + "rewards/rejected": 2.9068024158477783, + "step": 6894 + }, + { + "epoch": 1.12, + "learning_rate": 4.280976141861501e-06, + "logits/chosen": -1.2741730213165283, + "logits/rejected": -1.0145530700683594, + "logps/chosen": -75.62435913085938, + "logps/rejected": -24.365365982055664, + "loss": 0.085, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2166366577148438, + "rewards/margins": 2.540457248687744, + "rewards/rejected": 0.6761795282363892, + "step": 6895 + }, + { + "epoch": 1.12, + "learning_rate": 4.27967557301493e-06, + "logits/chosen": -1.3163059949874878, + "logits/rejected": -1.2218841314315796, + "logps/chosen": -76.8527603149414, + "logps/rejected": -59.34417724609375, + "loss": 0.0901, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.570565223693848, + "rewards/margins": 1.846045732498169, + "rewards/rejected": 2.7245194911956787, + "step": 6896 + }, + { + "epoch": 1.12, + "learning_rate": 4.27837505393596e-06, + "logits/chosen": -1.4461488723754883, + "logits/rejected": -1.462786078453064, + "logps/chosen": -40.090057373046875, + "logps/rejected": -75.39604187011719, + "loss": 0.2056, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5809311866760254, + "rewards/margins": 0.7121247053146362, + "rewards/rejected": 1.8688064813613892, + "step": 6897 + }, + { + "epoch": 1.12, + "learning_rate": 4.277074584714447e-06, + "logits/chosen": -0.8188632130622864, + "logits/rejected": -0.8646069765090942, + "logps/chosen": -85.60713195800781, + "logps/rejected": -74.94338989257812, + "loss": 0.1764, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5235297679901123, + "rewards/margins": 1.2918967008590698, + "rewards/rejected": 1.2316330671310425, + "step": 6898 + }, + { + "epoch": 1.12, + "learning_rate": 4.275774165440239e-06, + "logits/chosen": -1.1714516878128052, + "logits/rejected": -1.2090973854064941, + "logps/chosen": -62.616031646728516, + "logps/rejected": -161.6927032470703, + "loss": 0.9908, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5284007787704468, + "rewards/margins": -0.9658223390579224, + "rewards/rejected": 2.494223117828369, + "step": 6899 + }, + { + "epoch": 1.12, + "learning_rate": 4.274473796203183e-06, + "logits/chosen": -1.0696561336517334, + "logits/rejected": -1.0211925506591797, + "logps/chosen": -59.06562805175781, + "logps/rejected": -27.76479721069336, + "loss": 0.4928, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5843642950057983, + "rewards/margins": 0.6845301985740662, + "rewards/rejected": 0.8998340964317322, + "step": 6900 + }, + { + "epoch": 1.12, + "learning_rate": 4.273173477093123e-06, + "logits/chosen": -0.9856323599815369, + "logits/rejected": -0.9856323599815369, + "logps/chosen": -3.839207887649536, + "logps/rejected": -3.839207887649536, + "loss": 2.1486, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.620053768157959, + "rewards/margins": 0.0, + "rewards/rejected": 0.620053768157959, + "step": 6901 + }, + { + "epoch": 1.12, + "learning_rate": 4.271873208199899e-06, + "logits/chosen": -1.2243573665618896, + "logits/rejected": -1.2215293645858765, + "logps/chosen": -3.7242610454559326, + "logps/rejected": -1.1325410604476929, + "loss": 0.9925, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.11868651211261749, + "rewards/margins": -0.2096203714609146, + "rewards/rejected": 0.3283068835735321, + "step": 6902 + }, + { + "epoch": 1.12, + "learning_rate": 4.270572989613346e-06, + "logits/chosen": -1.0075676441192627, + "logits/rejected": -0.8391982913017273, + "logps/chosen": -46.400299072265625, + "logps/rejected": -10.024365425109863, + "loss": 0.0809, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.665930986404419, + "rewards/margins": 1.8725895881652832, + "rewards/rejected": 0.793341338634491, + "step": 6903 + }, + { + "epoch": 1.12, + "learning_rate": 4.269272821423298e-06, + "logits/chosen": -1.12839674949646, + "logits/rejected": -1.12839674949646, + "logps/chosen": -95.82781982421875, + "logps/rejected": -95.82781982421875, + "loss": 0.5113, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0989227294921875, + "rewards/margins": 0.0, + "rewards/rejected": 2.0989227294921875, + "step": 6904 + }, + { + "epoch": 1.12, + "learning_rate": 4.2679727037195835e-06, + "logits/chosen": -1.3389984369277954, + "logits/rejected": -1.3589133024215698, + "logps/chosen": -83.40660858154297, + "logps/rejected": -79.93871307373047, + "loss": 1.1411, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5652260780334473, + "rewards/margins": -1.6365509033203125, + "rewards/rejected": 5.20177698135376, + "step": 6905 + }, + { + "epoch": 1.12, + "learning_rate": 4.266672636592029e-06, + "logits/chosen": -1.4389283657073975, + "logits/rejected": -1.2159318923950195, + "logps/chosen": -203.52459716796875, + "logps/rejected": -48.627113342285156, + "loss": 0.0376, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.4678955078125, + "rewards/margins": 2.5641913414001465, + "rewards/rejected": 3.9037041664123535, + "step": 6906 + }, + { + "epoch": 1.12, + "learning_rate": 4.265372620130457e-06, + "logits/chosen": -1.0551313161849976, + "logits/rejected": -1.0156307220458984, + "logps/chosen": -48.35731506347656, + "logps/rejected": -57.38264846801758, + "loss": 0.7695, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.841679334640503, + "rewards/margins": 0.5998201370239258, + "rewards/rejected": 3.241859197616577, + "step": 6907 + }, + { + "epoch": 1.12, + "learning_rate": 4.264072654424685e-06, + "logits/chosen": -1.4905747175216675, + "logits/rejected": -1.4660062789916992, + "logps/chosen": -100.43401336669922, + "logps/rejected": -82.51240539550781, + "loss": 0.8061, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.113328695297241, + "rewards/margins": -0.3412764072418213, + "rewards/rejected": 2.4546051025390625, + "step": 6908 + }, + { + "epoch": 1.12, + "learning_rate": 4.262772739564529e-06, + "logits/chosen": -1.0855435132980347, + "logits/rejected": -1.0776054859161377, + "logps/chosen": -59.0980224609375, + "logps/rejected": -40.611732482910156, + "loss": 0.367, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8337448835372925, + "rewards/margins": -0.07623207569122314, + "rewards/rejected": 1.9099769592285156, + "step": 6909 + }, + { + "epoch": 1.12, + "learning_rate": 4.261472875639801e-06, + "logits/chosen": -1.4753633737564087, + "logits/rejected": -1.504332423210144, + "logps/chosen": -53.10227584838867, + "logps/rejected": -110.8309326171875, + "loss": 1.9516, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3629748821258545, + "rewards/margins": -3.8798396587371826, + "rewards/rejected": 7.242814540863037, + "step": 6910 + }, + { + "epoch": 1.12, + "learning_rate": 4.2601730627403095e-06, + "logits/chosen": -1.3776354789733887, + "logits/rejected": -1.3191243410110474, + "logps/chosen": -60.61750411987305, + "logps/rejected": -84.6458740234375, + "loss": 1.7503, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1792042255401611, + "rewards/margins": -2.3687281608581543, + "rewards/rejected": 3.5479323863983154, + "step": 6911 + }, + { + "epoch": 1.12, + "learning_rate": 4.258873300955859e-06, + "logits/chosen": -0.7625229358673096, + "logits/rejected": -0.7109341621398926, + "logps/chosen": -44.15400695800781, + "logps/rejected": -30.715042114257812, + "loss": 0.9485, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.17362380027771, + "rewards/margins": -1.0032758712768555, + "rewards/rejected": 3.1768996715545654, + "step": 6912 + }, + { + "epoch": 1.12, + "learning_rate": 4.257573590376252e-06, + "logits/chosen": -1.2890996932983398, + "logits/rejected": -1.1884039640426636, + "logps/chosen": -62.293853759765625, + "logps/rejected": -49.953495025634766, + "loss": 0.3356, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6283090114593506, + "rewards/margins": 0.10782361030578613, + "rewards/rejected": 3.5204854011535645, + "step": 6913 + }, + { + "epoch": 1.12, + "learning_rate": 4.256273931091284e-06, + "logits/chosen": -1.2202281951904297, + "logits/rejected": -1.2711914777755737, + "logps/chosen": -51.69965362548828, + "logps/rejected": -74.5152816772461, + "loss": 0.668, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0509421825408936, + "rewards/margins": 0.23723363876342773, + "rewards/rejected": 1.8137085437774658, + "step": 6914 + }, + { + "epoch": 1.12, + "learning_rate": 4.254974323190749e-06, + "logits/chosen": -1.177742838859558, + "logits/rejected": -1.2039604187011719, + "logps/chosen": -94.64604187011719, + "logps/rejected": -78.04141235351562, + "loss": 0.955, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8543121814727783, + "rewards/margins": 1.314509630203247, + "rewards/rejected": 1.5398025512695312, + "step": 6915 + }, + { + "epoch": 1.12, + "learning_rate": 4.253674766764441e-06, + "logits/chosen": -1.1011322736740112, + "logits/rejected": -0.9847012758255005, + "logps/chosen": -66.17293548583984, + "logps/rejected": -40.56349182128906, + "loss": 0.1678, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5973427295684814, + "rewards/margins": 0.9659225940704346, + "rewards/rejected": 2.631420135498047, + "step": 6916 + }, + { + "epoch": 1.12, + "learning_rate": 4.252375261902143e-06, + "logits/chosen": -1.1879807710647583, + "logits/rejected": -1.1172711849212646, + "logps/chosen": -23.869441986083984, + "logps/rejected": -22.774433135986328, + "loss": 0.6433, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.357939600944519, + "rewards/margins": 0.7889881730079651, + "rewards/rejected": 0.568951427936554, + "step": 6917 + }, + { + "epoch": 1.12, + "learning_rate": 4.251075808693641e-06, + "logits/chosen": -1.064072608947754, + "logits/rejected": -1.0593140125274658, + "logps/chosen": -2.3165929317474365, + "logps/rejected": -9.726768493652344, + "loss": 0.4855, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16002248227596283, + "rewards/margins": 0.025449112057685852, + "rewards/rejected": 0.13457337021827698, + "step": 6918 + }, + { + "epoch": 1.12, + "learning_rate": 4.249776407228714e-06, + "logits/chosen": -1.1339616775512695, + "logits/rejected": -1.1339616775512695, + "logps/chosen": -0.7298092246055603, + "logps/rejected": -0.7298092246055603, + "loss": 0.4935, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.18315839767456055, + "rewards/margins": 0.0, + "rewards/rejected": 0.18315839767456055, + "step": 6919 + }, + { + "epoch": 1.12, + "learning_rate": 4.248477057597139e-06, + "logits/chosen": -1.4280281066894531, + "logits/rejected": -1.461668848991394, + "logps/chosen": -89.5540771484375, + "logps/rejected": -88.14300537109375, + "loss": 0.353, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1133346557617188, + "rewards/margins": -0.01819300651550293, + "rewards/rejected": 2.1315276622772217, + "step": 6920 + }, + { + "epoch": 1.12, + "learning_rate": 4.247177759888688e-06, + "logits/chosen": -0.8893902897834778, + "logits/rejected": -0.9184415936470032, + "logps/chosen": -68.59365844726562, + "logps/rejected": -48.72969055175781, + "loss": 1.2573, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.532721757888794, + "rewards/margins": 1.616681694984436, + "rewards/rejected": 1.916040062904358, + "step": 6921 + }, + { + "epoch": 1.12, + "learning_rate": 4.245878514193131e-06, + "logits/chosen": -1.2245019674301147, + "logits/rejected": -1.164666771888733, + "logps/chosen": -51.9730339050293, + "logps/rejected": -22.734310150146484, + "loss": 0.4484, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.190727710723877, + "rewards/margins": 1.4873048067092896, + "rewards/rejected": 1.7034229040145874, + "step": 6922 + }, + { + "epoch": 1.12, + "learning_rate": 4.2445793206002325e-06, + "logits/chosen": -0.7919672131538391, + "logits/rejected": -0.7924376726150513, + "logps/chosen": -1.58949613571167, + "logps/rejected": -1.676095962524414, + "loss": 0.4901, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.14702120423316956, + "rewards/margins": -0.3131367564201355, + "rewards/rejected": 0.46015796065330505, + "step": 6923 + }, + { + "epoch": 1.12, + "learning_rate": 4.243280179199756e-06, + "logits/chosen": -1.2490031719207764, + "logits/rejected": -1.251608967781067, + "logps/chosen": -25.092679977416992, + "logps/rejected": -44.517921447753906, + "loss": 0.5104, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.498929738998413, + "rewards/margins": -0.07293701171875, + "rewards/rejected": 3.571866750717163, + "step": 6924 + }, + { + "epoch": 1.12, + "learning_rate": 4.241981090081458e-06, + "logits/chosen": -1.285861849784851, + "logits/rejected": -1.2650777101516724, + "logps/chosen": -49.33240509033203, + "logps/rejected": -46.88638687133789, + "loss": 0.9262, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3179917335510254, + "rewards/margins": -0.582963228225708, + "rewards/rejected": 2.9009549617767334, + "step": 6925 + }, + { + "epoch": 1.12, + "learning_rate": 4.2406820533350955e-06, + "logits/chosen": -1.2783890962600708, + "logits/rejected": -1.2783890962600708, + "logps/chosen": -53.996131896972656, + "logps/rejected": -53.996131896972656, + "loss": 0.3483, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0739388465881348, + "rewards/margins": 0.0, + "rewards/rejected": 2.0739388465881348, + "step": 6926 + }, + { + "epoch": 1.12, + "learning_rate": 4.239383069050417e-06, + "logits/chosen": -1.2325594425201416, + "logits/rejected": -1.2655487060546875, + "logps/chosen": -69.87908935546875, + "logps/rejected": -79.35647583007812, + "loss": 0.4363, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9810959100723267, + "rewards/margins": -0.3204087018966675, + "rewards/rejected": 2.301504611968994, + "step": 6927 + }, + { + "epoch": 1.12, + "learning_rate": 4.238084137317171e-06, + "logits/chosen": -0.8709222078323364, + "logits/rejected": -0.8656173944473267, + "logps/chosen": -2.304049253463745, + "logps/rejected": -11.137962341308594, + "loss": 1.264, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32757553458213806, + "rewards/margins": 0.06430363655090332, + "rewards/rejected": 0.26327189803123474, + "step": 6928 + }, + { + "epoch": 1.12, + "learning_rate": 4.236785258225103e-06, + "logits/chosen": -1.2044016122817993, + "logits/rejected": -1.221441388130188, + "logps/chosen": -100.19564819335938, + "logps/rejected": -97.87274169921875, + "loss": 0.9827, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3267533779144287, + "rewards/margins": 0.7422349452972412, + "rewards/rejected": 1.5845184326171875, + "step": 6929 + }, + { + "epoch": 1.12, + "learning_rate": 4.235486431863951e-06, + "logits/chosen": -1.2666041851043701, + "logits/rejected": -1.204235553741455, + "logps/chosen": -76.05535888671875, + "logps/rejected": -75.13843536376953, + "loss": 0.451, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.260952949523926, + "rewards/margins": 2.208564281463623, + "rewards/rejected": 5.052388668060303, + "step": 6930 + }, + { + "epoch": 1.12, + "learning_rate": 4.234187658323454e-06, + "logits/chosen": -1.1287686824798584, + "logits/rejected": -1.0729128122329712, + "logps/chosen": -45.87712860107422, + "logps/rejected": -47.756629943847656, + "loss": 2.6292, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.439136505126953, + "rewards/margins": -1.1537034511566162, + "rewards/rejected": 3.5928399562835693, + "step": 6931 + }, + { + "epoch": 1.13, + "learning_rate": 4.232888937693343e-06, + "logits/chosen": -1.2665013074874878, + "logits/rejected": -1.39714515209198, + "logps/chosen": -61.254127502441406, + "logps/rejected": -48.897705078125, + "loss": 0.8105, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.191765546798706, + "rewards/margins": 0.8812460899353027, + "rewards/rejected": 2.3105194568634033, + "step": 6932 + }, + { + "epoch": 1.13, + "learning_rate": 4.2315902700633495e-06, + "logits/chosen": -1.1653937101364136, + "logits/rejected": -1.1653937101364136, + "logps/chosen": -34.97403335571289, + "logps/rejected": -34.97403335571289, + "loss": 1.5166, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0944173336029053, + "rewards/margins": 0.0, + "rewards/rejected": 3.0944173336029053, + "step": 6933 + }, + { + "epoch": 1.13, + "learning_rate": 4.230291655523197e-06, + "logits/chosen": -1.3873229026794434, + "logits/rejected": -1.2892019748687744, + "logps/chosen": -82.49620819091797, + "logps/rejected": -81.46575164794922, + "loss": 0.0309, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.1369853019714355, + "rewards/margins": 3.214160442352295, + "rewards/rejected": 2.9228248596191406, + "step": 6934 + }, + { + "epoch": 1.13, + "learning_rate": 4.228993094162607e-06, + "logits/chosen": -1.40541410446167, + "logits/rejected": -1.2912057638168335, + "logps/chosen": -48.44110107421875, + "logps/rejected": -27.570524215698242, + "loss": 0.2934, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6540565490722656, + "rewards/margins": 2.394474506378174, + "rewards/rejected": 0.25958195328712463, + "step": 6935 + }, + { + "epoch": 1.13, + "learning_rate": 4.227694586071298e-06, + "logits/chosen": -1.1116136312484741, + "logits/rejected": -1.0678153038024902, + "logps/chosen": -37.512489318847656, + "logps/rejected": -72.38665771484375, + "loss": 1.0784, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.13525390625, + "rewards/margins": -1.0743529796600342, + "rewards/rejected": 2.209606885910034, + "step": 6936 + }, + { + "epoch": 1.13, + "learning_rate": 4.226396131338986e-06, + "logits/chosen": -1.150103211402893, + "logits/rejected": -1.2314153909683228, + "logps/chosen": -58.680992126464844, + "logps/rejected": -57.28849792480469, + "loss": 0.5206, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.913698673248291, + "rewards/margins": -0.5249266624450684, + "rewards/rejected": 3.4386253356933594, + "step": 6937 + }, + { + "epoch": 1.13, + "learning_rate": 4.225097730055383e-06, + "logits/chosen": -1.6778556108474731, + "logits/rejected": -1.5796509981155396, + "logps/chosen": -150.94664001464844, + "logps/rejected": -102.10498809814453, + "loss": 1.6845, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.838400363922119, + "rewards/margins": -1.7659497261047363, + "rewards/rejected": 8.604350090026855, + "step": 6938 + }, + { + "epoch": 1.13, + "learning_rate": 4.223799382310193e-06, + "logits/chosen": -1.655394196510315, + "logits/rejected": -1.4983822107315063, + "logps/chosen": -141.04812622070312, + "logps/rejected": -15.530519485473633, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.745202541351318, + "rewards/margins": 5.879738807678223, + "rewards/rejected": 0.8654638528823853, + "step": 6939 + }, + { + "epoch": 1.13, + "learning_rate": 4.222501088193121e-06, + "logits/chosen": -0.8241935968399048, + "logits/rejected": -0.8177578449249268, + "logps/chosen": -44.28240966796875, + "logps/rejected": -56.794342041015625, + "loss": 1.3224, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7386879920959473, + "rewards/margins": -2.555344581604004, + "rewards/rejected": 5.294032573699951, + "step": 6940 + }, + { + "epoch": 1.13, + "learning_rate": 4.221202847793869e-06, + "logits/chosen": -1.0336647033691406, + "logits/rejected": -1.0635708570480347, + "logps/chosen": -109.89812469482422, + "logps/rejected": -75.53339385986328, + "loss": 0.25, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.394123077392578, + "rewards/margins": 0.4554961919784546, + "rewards/rejected": 1.9386268854141235, + "step": 6941 + }, + { + "epoch": 1.13, + "learning_rate": 4.21990466120213e-06, + "logits/chosen": -1.0581344366073608, + "logits/rejected": -1.0423821210861206, + "logps/chosen": -29.886199951171875, + "logps/rejected": -12.041162490844727, + "loss": 0.584, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.04566154628992081, + "rewards/margins": -0.5093391537666321, + "rewards/rejected": 0.5550007224082947, + "step": 6942 + }, + { + "epoch": 1.13, + "learning_rate": 4.218606528507597e-06, + "logits/chosen": -1.2164785861968994, + "logits/rejected": -1.0744779109954834, + "logps/chosen": -105.70454406738281, + "logps/rejected": -49.921974182128906, + "loss": 0.0372, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.47097635269165, + "rewards/margins": 3.4554665088653564, + "rewards/rejected": 2.015509843826294, + "step": 6943 + }, + { + "epoch": 1.13, + "learning_rate": 4.21730844979996e-06, + "logits/chosen": -1.2811394929885864, + "logits/rejected": -1.2437087297439575, + "logps/chosen": -51.54214096069336, + "logps/rejected": -38.02864074707031, + "loss": 0.1647, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7588276863098145, + "rewards/margins": 1.847605586051941, + "rewards/rejected": 0.9112221002578735, + "step": 6944 + }, + { + "epoch": 1.13, + "learning_rate": 4.2160104251689026e-06, + "logits/chosen": -1.3079172372817993, + "logits/rejected": -1.2614729404449463, + "logps/chosen": -130.4445343017578, + "logps/rejected": -56.664268493652344, + "loss": 0.3984, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.083940267562866, + "rewards/margins": 0.20173120498657227, + "rewards/rejected": 1.882209062576294, + "step": 6945 + }, + { + "epoch": 1.13, + "learning_rate": 4.214712454704107e-06, + "logits/chosen": -1.0981593132019043, + "logits/rejected": -1.0981593132019043, + "logps/chosen": -1.3960736989974976, + "logps/rejected": -1.3960736989974976, + "loss": 0.4162, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.27639323472976685, + "rewards/margins": 0.0, + "rewards/rejected": 0.27639323472976685, + "step": 6946 + }, + { + "epoch": 1.13, + "learning_rate": 4.213414538495251e-06, + "logits/chosen": -1.1170681715011597, + "logits/rejected": -1.0704312324523926, + "logps/chosen": -48.821651458740234, + "logps/rejected": -52.88683319091797, + "loss": 0.2075, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8214008808135986, + "rewards/margins": 0.6876673698425293, + "rewards/rejected": 2.1337335109710693, + "step": 6947 + }, + { + "epoch": 1.13, + "learning_rate": 4.212116676632007e-06, + "logits/chosen": -1.317942500114441, + "logits/rejected": -1.2060753107070923, + "logps/chosen": -47.9613037109375, + "logps/rejected": -52.269466400146484, + "loss": 0.2847, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.469473361968994, + "rewards/margins": 0.2995266914367676, + "rewards/rejected": 3.1699466705322266, + "step": 6948 + }, + { + "epoch": 1.13, + "learning_rate": 4.2108188692040444e-06, + "logits/chosen": -1.330739974975586, + "logits/rejected": -1.3334424495697021, + "logps/chosen": -125.92227172851562, + "logps/rejected": -49.83929443359375, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.529522895812988, + "rewards/margins": 3.858769416809082, + "rewards/rejected": 2.6707534790039062, + "step": 6949 + }, + { + "epoch": 1.13, + "learning_rate": 4.209521116301032e-06, + "logits/chosen": -0.8271573781967163, + "logits/rejected": -0.8271573781967163, + "logps/chosen": -66.13554382324219, + "logps/rejected": -66.13554382324219, + "loss": 2.7116, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9911407232284546, + "rewards/margins": 0.0, + "rewards/rejected": 1.9911407232284546, + "step": 6950 + }, + { + "epoch": 1.13, + "learning_rate": 4.20822341801263e-06, + "logits/chosen": -1.068924903869629, + "logits/rejected": -1.0329549312591553, + "logps/chosen": -45.00754928588867, + "logps/rejected": -49.28760528564453, + "loss": 0.6681, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.516404390335083, + "rewards/margins": 0.6994987726211548, + "rewards/rejected": 0.8169056177139282, + "step": 6951 + }, + { + "epoch": 1.13, + "learning_rate": 4.206925774428499e-06, + "logits/chosen": -1.418688416481018, + "logits/rejected": -1.393670678138733, + "logps/chosen": -29.48423957824707, + "logps/rejected": -13.195489883422852, + "loss": 0.3392, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9902631640434265, + "rewards/margins": 0.12053525447845459, + "rewards/rejected": 0.8697279095649719, + "step": 6952 + }, + { + "epoch": 1.13, + "learning_rate": 4.205628185638293e-06, + "logits/chosen": -1.1859863996505737, + "logits/rejected": -1.1648635864257812, + "logps/chosen": -89.27105712890625, + "logps/rejected": -158.1515350341797, + "loss": 0.2636, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.378847599029541, + "rewards/margins": 0.47744524478912354, + "rewards/rejected": 1.9014023542404175, + "step": 6953 + }, + { + "epoch": 1.13, + "learning_rate": 4.204330651731662e-06, + "logits/chosen": -1.547945499420166, + "logits/rejected": -1.4630769491195679, + "logps/chosen": -133.16921997070312, + "logps/rejected": -89.22898864746094, + "loss": 0.5618, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.399652004241943, + "rewards/margins": 0.16756105422973633, + "rewards/rejected": 5.232090950012207, + "step": 6954 + }, + { + "epoch": 1.13, + "learning_rate": 4.203033172798256e-06, + "logits/chosen": -1.2354462146759033, + "logits/rejected": -1.1926300525665283, + "logps/chosen": -70.32576751708984, + "logps/rejected": -51.65752029418945, + "loss": 0.4948, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.587361216545105, + "rewards/margins": -0.3116649389266968, + "rewards/rejected": 1.8990261554718018, + "step": 6955 + }, + { + "epoch": 1.13, + "learning_rate": 4.201735748927714e-06, + "logits/chosen": -1.403499960899353, + "logits/rejected": -1.3655219078063965, + "logps/chosen": -220.709716796875, + "logps/rejected": -50.01506042480469, + "loss": 0.2799, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.895849704742432, + "rewards/margins": 1.9499099254608154, + "rewards/rejected": 2.945939779281616, + "step": 6956 + }, + { + "epoch": 1.13, + "learning_rate": 4.200438380209681e-06, + "logits/chosen": -1.0472207069396973, + "logits/rejected": -1.0226879119873047, + "logps/chosen": -89.84654998779297, + "logps/rejected": -79.81137084960938, + "loss": 1.5321, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6499574184417725, + "rewards/margins": 2.1274490356445312, + "rewards/rejected": 0.5225082635879517, + "step": 6957 + }, + { + "epoch": 1.13, + "learning_rate": 4.1991410667337896e-06, + "logits/chosen": -1.1275699138641357, + "logits/rejected": -1.1117396354675293, + "logps/chosen": -170.5609130859375, + "logps/rejected": -68.22161102294922, + "loss": 0.13, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.811859130859375, + "rewards/margins": 1.8067800998687744, + "rewards/rejected": 2.0050790309906006, + "step": 6958 + }, + { + "epoch": 1.13, + "learning_rate": 4.1978438085896725e-06, + "logits/chosen": -0.9989480376243591, + "logits/rejected": -0.9730985164642334, + "logps/chosen": -45.536746978759766, + "logps/rejected": -16.226778030395508, + "loss": 0.6521, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6950168609619141, + "rewards/margins": 0.13756275177001953, + "rewards/rejected": 0.5574541091918945, + "step": 6959 + }, + { + "epoch": 1.13, + "learning_rate": 4.196546605866958e-06, + "logits/chosen": -1.5693824291229248, + "logits/rejected": -1.698830485343933, + "logps/chosen": -80.72077178955078, + "logps/rejected": -130.08494567871094, + "loss": 1.3261, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.962306261062622, + "rewards/margins": -2.328565835952759, + "rewards/rejected": 5.290872097015381, + "step": 6960 + }, + { + "epoch": 1.13, + "learning_rate": 4.19524945865527e-06, + "logits/chosen": -1.1509594917297363, + "logits/rejected": -1.1532243490219116, + "logps/chosen": -99.889404296875, + "logps/rejected": -53.0333251953125, + "loss": 0.5968, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.708017110824585, + "rewards/margins": 1.7433290481567383, + "rewards/rejected": 0.9646881222724915, + "step": 6961 + }, + { + "epoch": 1.13, + "learning_rate": 4.193952367044232e-06, + "logits/chosen": -1.0797913074493408, + "logits/rejected": -1.0517319440841675, + "logps/chosen": -131.67408752441406, + "logps/rejected": -37.82082748413086, + "loss": 0.0738, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.934950351715088, + "rewards/margins": 1.9243450164794922, + "rewards/rejected": 3.0106053352355957, + "step": 6962 + }, + { + "epoch": 1.13, + "learning_rate": 4.192655331123457e-06, + "logits/chosen": -1.006507158279419, + "logits/rejected": -1.0110740661621094, + "logps/chosen": -42.25281524658203, + "logps/rejected": -51.60976791381836, + "loss": 0.9399, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0219123363494873, + "rewards/margins": 0.7519038915634155, + "rewards/rejected": 1.2700084447860718, + "step": 6963 + }, + { + "epoch": 1.13, + "learning_rate": 4.191358350982559e-06, + "logits/chosen": -1.3968446254730225, + "logits/rejected": -1.4029115438461304, + "logps/chosen": -134.41531372070312, + "logps/rejected": -129.04763793945312, + "loss": 0.6788, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.18896484375, + "rewards/margins": -1.0543060302734375, + "rewards/rejected": 7.2432708740234375, + "step": 6964 + }, + { + "epoch": 1.13, + "learning_rate": 4.190061426711149e-06, + "logits/chosen": -1.3221131563186646, + "logits/rejected": -1.3232064247131348, + "logps/chosen": -118.54957580566406, + "logps/rejected": -111.35102081298828, + "loss": 1.0311, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0211013555526733, + "rewards/margins": -0.9056175947189331, + "rewards/rejected": 1.9267189502716064, + "step": 6965 + }, + { + "epoch": 1.13, + "learning_rate": 4.18876455839883e-06, + "logits/chosen": -1.269265055656433, + "logits/rejected": -1.3143748044967651, + "logps/chosen": -64.01731872558594, + "logps/rejected": -77.17363739013672, + "loss": 1.1197, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3387603759765625, + "rewards/margins": -1.8016853332519531, + "rewards/rejected": 4.140445709228516, + "step": 6966 + }, + { + "epoch": 1.13, + "learning_rate": 4.187467746135204e-06, + "logits/chosen": -1.13210928440094, + "logits/rejected": -1.1334463357925415, + "logps/chosen": -57.6578254699707, + "logps/rejected": -87.81983184814453, + "loss": 0.2316, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3368496894836426, + "rewards/margins": 1.663717269897461, + "rewards/rejected": 0.6731323599815369, + "step": 6967 + }, + { + "epoch": 1.13, + "learning_rate": 4.186170990009868e-06, + "logits/chosen": -1.2133235931396484, + "logits/rejected": -1.1455183029174805, + "logps/chosen": -87.80968475341797, + "logps/rejected": -143.13877868652344, + "loss": 0.321, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.870148658752441, + "rewards/margins": 0.13649845123291016, + "rewards/rejected": 7.733650207519531, + "step": 6968 + }, + { + "epoch": 1.13, + "learning_rate": 4.184874290112417e-06, + "logits/chosen": -0.8318223357200623, + "logits/rejected": -0.8875447511672974, + "logps/chosen": -41.33553695678711, + "logps/rejected": -65.1004409790039, + "loss": 0.4026, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0205745697021484, + "rewards/margins": -0.1564197540283203, + "rewards/rejected": 2.1769943237304688, + "step": 6969 + }, + { + "epoch": 1.13, + "learning_rate": 4.183577646532439e-06, + "logits/chosen": -1.1600747108459473, + "logits/rejected": -1.1600747108459473, + "logps/chosen": -77.10820007324219, + "logps/rejected": -77.10820007324219, + "loss": 0.3652, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.393711805343628, + "rewards/margins": 0.0, + "rewards/rejected": 2.393711805343628, + "step": 6970 + }, + { + "epoch": 1.13, + "learning_rate": 4.182281059359521e-06, + "logits/chosen": -1.22309148311615, + "logits/rejected": -1.3342660665512085, + "logps/chosen": -93.58984375, + "logps/rejected": -202.189697265625, + "loss": 0.7363, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.522006511688232, + "rewards/margins": -0.11091279983520508, + "rewards/rejected": 6.6329193115234375, + "step": 6971 + }, + { + "epoch": 1.13, + "learning_rate": 4.180984528683244e-06, + "logits/chosen": -0.9320636987686157, + "logits/rejected": -0.9998098611831665, + "logps/chosen": -47.38898468017578, + "logps/rejected": -122.51250457763672, + "loss": 0.3012, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7237160205841064, + "rewards/margins": 0.20399785041809082, + "rewards/rejected": 1.5197181701660156, + "step": 6972 + }, + { + "epoch": 1.13, + "learning_rate": 4.1796880545931865e-06, + "logits/chosen": -1.017838478088379, + "logits/rejected": -1.017838478088379, + "logps/chosen": -1.7240577936172485, + "logps/rejected": -1.7240577936172485, + "loss": 0.4947, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.36893510818481445, + "rewards/margins": 0.0, + "rewards/rejected": 0.36893510818481445, + "step": 6973 + }, + { + "epoch": 1.13, + "learning_rate": 4.178391637178923e-06, + "logits/chosen": -1.2956644296646118, + "logits/rejected": -1.229750633239746, + "logps/chosen": -98.7069091796875, + "logps/rejected": -102.56825256347656, + "loss": 0.9666, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.459887981414795, + "rewards/margins": 0.36614227294921875, + "rewards/rejected": 7.093745708465576, + "step": 6974 + }, + { + "epoch": 1.13, + "learning_rate": 4.177095276530023e-06, + "logits/chosen": -1.0280927419662476, + "logits/rejected": -0.8362683057785034, + "logps/chosen": -35.71530532836914, + "logps/rejected": -34.95492172241211, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5375466346740723, + "rewards/margins": 3.7208662033081055, + "rewards/rejected": -1.1833194494247437, + "step": 6975 + }, + { + "epoch": 1.13, + "learning_rate": 4.175798972736053e-06, + "logits/chosen": -1.8070132732391357, + "logits/rejected": -1.7461819648742676, + "logps/chosen": -87.70278930664062, + "logps/rejected": -15.89100170135498, + "loss": 0.1857, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.458184719085693, + "rewards/margins": 5.571269989013672, + "rewards/rejected": 0.8869149088859558, + "step": 6976 + }, + { + "epoch": 1.13, + "learning_rate": 4.174502725886576e-06, + "logits/chosen": -1.2537009716033936, + "logits/rejected": -1.2612168788909912, + "logps/chosen": -60.150299072265625, + "logps/rejected": -68.39739227294922, + "loss": 0.5271, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2072625160217285, + "rewards/margins": -0.1961042881011963, + "rewards/rejected": 2.403366804122925, + "step": 6977 + }, + { + "epoch": 1.13, + "learning_rate": 4.173206536071149e-06, + "logits/chosen": -1.264420986175537, + "logits/rejected": -1.2617645263671875, + "logps/chosen": -63.39069747924805, + "logps/rejected": -64.21232604980469, + "loss": 0.401, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6254031658172607, + "rewards/margins": 1.526943564414978, + "rewards/rejected": 1.0984596014022827, + "step": 6978 + }, + { + "epoch": 1.13, + "learning_rate": 4.171910403379327e-06, + "logits/chosen": -1.1431277990341187, + "logits/rejected": -1.1004618406295776, + "logps/chosen": -91.28462219238281, + "logps/rejected": -58.317909240722656, + "loss": 0.1423, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.248802185058594, + "rewards/margins": 1.1315560340881348, + "rewards/rejected": 4.117246150970459, + "step": 6979 + }, + { + "epoch": 1.13, + "learning_rate": 4.1706143279006615e-06, + "logits/chosen": -1.4213563203811646, + "logits/rejected": -1.4472686052322388, + "logps/chosen": -91.1543197631836, + "logps/rejected": -130.39892578125, + "loss": 0.5488, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6662148237228394, + "rewards/margins": -0.5281256437301636, + "rewards/rejected": 2.194340467453003, + "step": 6980 + }, + { + "epoch": 1.13, + "learning_rate": 4.169318309724697e-06, + "logits/chosen": -1.3220642805099487, + "logits/rejected": -1.2914332151412964, + "logps/chosen": -81.8591079711914, + "logps/rejected": -57.081661224365234, + "loss": 1.5347, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.460792064666748, + "rewards/margins": -0.8141398429870605, + "rewards/rejected": 5.274931907653809, + "step": 6981 + }, + { + "epoch": 1.13, + "learning_rate": 4.168022348940978e-06, + "logits/chosen": -1.1192573308944702, + "logits/rejected": -1.122225046157837, + "logps/chosen": -55.59294891357422, + "logps/rejected": -54.420284271240234, + "loss": 0.1771, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0206284523010254, + "rewards/margins": 1.4257718324661255, + "rewards/rejected": 1.5948566198349, + "step": 6982 + }, + { + "epoch": 1.13, + "learning_rate": 4.166726445639043e-06, + "logits/chosen": -1.0534663200378418, + "logits/rejected": -1.0561118125915527, + "logps/chosen": -5.638429641723633, + "logps/rejected": -4.1659088134765625, + "loss": 2.8557, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3438551127910614, + "rewards/margins": -0.15175962448120117, + "rewards/rejected": 0.4956147372722626, + "step": 6983 + }, + { + "epoch": 1.13, + "learning_rate": 4.165430599908424e-06, + "logits/chosen": -0.8845110535621643, + "logits/rejected": -0.8845110535621643, + "logps/chosen": -16.796871185302734, + "logps/rejected": -16.796871185302734, + "loss": 0.3969, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2966514527797699, + "rewards/margins": 0.0, + "rewards/rejected": 0.2966514527797699, + "step": 6984 + }, + { + "epoch": 1.13, + "learning_rate": 4.164134811838656e-06, + "logits/chosen": -1.3255149126052856, + "logits/rejected": -1.2257601022720337, + "logps/chosen": -97.99229431152344, + "logps/rejected": -27.331422805786133, + "loss": 0.3016, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9722917079925537, + "rewards/margins": 1.5228095054626465, + "rewards/rejected": 1.4494822025299072, + "step": 6985 + }, + { + "epoch": 1.13, + "learning_rate": 4.162839081519262e-06, + "logits/chosen": -1.0367215871810913, + "logits/rejected": -0.9117876887321472, + "logps/chosen": -110.39006805419922, + "logps/rejected": -18.577312469482422, + "loss": 0.1429, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7910438776016235, + "rewards/margins": 1.2321829795837402, + "rewards/rejected": 0.5588609576225281, + "step": 6986 + }, + { + "epoch": 1.13, + "learning_rate": 4.1615434090397675e-06, + "logits/chosen": -0.8641809821128845, + "logits/rejected": -1.0850383043289185, + "logps/chosen": -89.7506103515625, + "logps/rejected": -82.59894561767578, + "loss": 0.6611, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6635223627090454, + "rewards/margins": -1.00835120677948, + "rewards/rejected": 2.6718735694885254, + "step": 6987 + }, + { + "epoch": 1.13, + "learning_rate": 4.160247794489689e-06, + "logits/chosen": -1.1531637907028198, + "logits/rejected": -1.2375307083129883, + "logps/chosen": -63.349151611328125, + "logps/rejected": -99.19233703613281, + "loss": 1.68, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.39634108543396, + "rewards/margins": -2.9489285945892334, + "rewards/rejected": 5.345269680023193, + "step": 6988 + }, + { + "epoch": 1.13, + "learning_rate": 4.158952237958542e-06, + "logits/chosen": -0.7230464816093445, + "logits/rejected": -0.7197436094284058, + "logps/chosen": -5.575733661651611, + "logps/rejected": -9.24474048614502, + "loss": 1.1189, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40957188606262207, + "rewards/margins": 0.29071909189224243, + "rewards/rejected": 0.11885280907154083, + "step": 6989 + }, + { + "epoch": 1.13, + "learning_rate": 4.157656739535838e-06, + "logits/chosen": -1.3951491117477417, + "logits/rejected": -1.1763614416122437, + "logps/chosen": -138.69886779785156, + "logps/rejected": -49.048118591308594, + "loss": 0.0439, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.263453960418701, + "rewards/margins": 3.74239444732666, + "rewards/rejected": 2.521059513092041, + "step": 6990 + }, + { + "epoch": 1.13, + "learning_rate": 4.156361299311082e-06, + "logits/chosen": -1.0082732439041138, + "logits/rejected": -1.0082732439041138, + "logps/chosen": -20.314685821533203, + "logps/rejected": -20.314685821533203, + "loss": 0.4445, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9961239099502563, + "rewards/margins": 0.0, + "rewards/rejected": 0.9961239099502563, + "step": 6991 + }, + { + "epoch": 1.13, + "learning_rate": 4.155065917373779e-06, + "logits/chosen": -0.9495673179626465, + "logits/rejected": -0.8896148800849915, + "logps/chosen": -49.1603889465332, + "logps/rejected": -17.267942428588867, + "loss": 0.2841, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.169371485710144, + "rewards/margins": 0.45397764444351196, + "rewards/rejected": 0.7153938412666321, + "step": 6992 + }, + { + "epoch": 1.14, + "learning_rate": 4.1537705938134255e-06, + "logits/chosen": -1.359190583229065, + "logits/rejected": -1.3148518800735474, + "logps/chosen": -68.49468994140625, + "logps/rejected": -88.18548583984375, + "loss": 1.5567, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.765223026275635, + "rewards/margins": 1.3603074550628662, + "rewards/rejected": 3.4049155712127686, + "step": 6993 + }, + { + "epoch": 1.14, + "learning_rate": 4.152475328719517e-06, + "logits/chosen": -1.0342565774917603, + "logits/rejected": -1.0492253303527832, + "logps/chosen": -7.6940388679504395, + "logps/rejected": -24.13074493408203, + "loss": 0.811, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3478847146034241, + "rewards/margins": -0.5031757950782776, + "rewards/rejected": 0.8510605096817017, + "step": 6994 + }, + { + "epoch": 1.14, + "learning_rate": 4.1511801221815436e-06, + "logits/chosen": -1.3292735815048218, + "logits/rejected": -1.2609974145889282, + "logps/chosen": -82.32638549804688, + "logps/rejected": -58.13558578491211, + "loss": 0.183, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.239543437957764, + "rewards/margins": 1.3645124435424805, + "rewards/rejected": 2.875030994415283, + "step": 6995 + }, + { + "epoch": 1.14, + "learning_rate": 4.149884974288993e-06, + "logits/chosen": -1.3440794944763184, + "logits/rejected": -1.3067524433135986, + "logps/chosen": -74.55265808105469, + "logps/rejected": -63.160518646240234, + "loss": 0.7384, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2469193935394287, + "rewards/margins": 0.3191251754760742, + "rewards/rejected": 2.9277942180633545, + "step": 6996 + }, + { + "epoch": 1.14, + "learning_rate": 4.148589885131346e-06, + "logits/chosen": -1.3557900190353394, + "logits/rejected": -1.0346330404281616, + "logps/chosen": -123.79167938232422, + "logps/rejected": -35.7154541015625, + "loss": 1.2004, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.228577613830566, + "rewards/margins": 4.220998764038086, + "rewards/rejected": 1.0075790882110596, + "step": 6997 + }, + { + "epoch": 1.14, + "learning_rate": 4.1472948547980815e-06, + "logits/chosen": -1.258888840675354, + "logits/rejected": -1.2114168405532837, + "logps/chosen": -66.51395416259766, + "logps/rejected": -50.9241943359375, + "loss": 0.8482, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9519569873809814, + "rewards/margins": -1.374016523361206, + "rewards/rejected": 3.3259735107421875, + "step": 6998 + }, + { + "epoch": 1.14, + "learning_rate": 4.145999883378675e-06, + "logits/chosen": -1.0091074705123901, + "logits/rejected": -1.062704086303711, + "logps/chosen": -3.448850154876709, + "logps/rejected": -90.9859619140625, + "loss": 0.2958, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5575380921363831, + "rewards/margins": 0.255758136510849, + "rewards/rejected": 0.30177995562553406, + "step": 6999 + }, + { + "epoch": 1.14, + "learning_rate": 4.1447049709625965e-06, + "logits/chosen": -1.1718535423278809, + "logits/rejected": -1.1259280443191528, + "logps/chosen": -47.717857360839844, + "logps/rejected": -40.05204772949219, + "loss": 0.5559, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2244541645050049, + "rewards/margins": -0.043569207191467285, + "rewards/rejected": 1.2680233716964722, + "step": 7000 + }, + { + "epoch": 1.14, + "learning_rate": 4.14341011763931e-06, + "logits/chosen": -1.4176617860794067, + "logits/rejected": -1.5098341703414917, + "logps/chosen": -105.96202087402344, + "logps/rejected": -119.46073913574219, + "loss": 0.6261, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.66339111328125, + "rewards/margins": -0.8277268409729004, + "rewards/rejected": 4.49111795425415, + "step": 7001 + }, + { + "epoch": 1.14, + "learning_rate": 4.14211532349828e-06, + "logits/chosen": -0.9018633365631104, + "logits/rejected": -0.9004610776901245, + "logps/chosen": -2.673755168914795, + "logps/rejected": -12.917023658752441, + "loss": 0.6182, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8352506756782532, + "rewards/margins": 0.013893425464630127, + "rewards/rejected": 0.821357250213623, + "step": 7002 + }, + { + "epoch": 1.14, + "learning_rate": 4.140820588628964e-06, + "logits/chosen": -1.4135515689849854, + "logits/rejected": -1.34250807762146, + "logps/chosen": -60.417144775390625, + "logps/rejected": -50.02195358276367, + "loss": 0.4216, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0948853492736816, + "rewards/margins": -0.24229073524475098, + "rewards/rejected": 3.3371760845184326, + "step": 7003 + }, + { + "epoch": 1.14, + "learning_rate": 4.139525913120815e-06, + "logits/chosen": -1.3403537273406982, + "logits/rejected": -1.3117278814315796, + "logps/chosen": -42.809898376464844, + "logps/rejected": -22.40077018737793, + "loss": 1.2294, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4535026550292969, + "rewards/margins": -0.5550702810287476, + "rewards/rejected": 1.0085729360580444, + "step": 7004 + }, + { + "epoch": 1.14, + "learning_rate": 4.138231297063285e-06, + "logits/chosen": -1.040946364402771, + "logits/rejected": -1.0458675622940063, + "logps/chosen": -80.860595703125, + "logps/rejected": -78.35813903808594, + "loss": 1.0934, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8339874744415283, + "rewards/margins": -1.4953887462615967, + "rewards/rejected": 3.329376220703125, + "step": 7005 + }, + { + "epoch": 1.14, + "learning_rate": 4.136936740545817e-06, + "logits/chosen": -1.3071372509002686, + "logits/rejected": -1.2598267793655396, + "logps/chosen": -105.36112213134766, + "logps/rejected": -89.77227783203125, + "loss": 0.4886, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.509469032287598, + "rewards/margins": -0.4838385581970215, + "rewards/rejected": 5.993307590484619, + "step": 7006 + }, + { + "epoch": 1.14, + "learning_rate": 4.135642243657854e-06, + "logits/chosen": -1.2167500257492065, + "logits/rejected": -1.2403064966201782, + "logps/chosen": -73.65438842773438, + "logps/rejected": -161.49679565429688, + "loss": 0.1983, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4045867919921875, + "rewards/margins": 0.8164504766464233, + "rewards/rejected": 1.5881363153457642, + "step": 7007 + }, + { + "epoch": 1.14, + "learning_rate": 4.134347806488834e-06, + "logits/chosen": -1.1279834508895874, + "logits/rejected": -1.0993751287460327, + "logps/chosen": -47.74853515625, + "logps/rejected": -71.07090759277344, + "loss": 0.4122, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9096237421035767, + "rewards/margins": 0.4067268371582031, + "rewards/rejected": 1.5028969049453735, + "step": 7008 + }, + { + "epoch": 1.14, + "learning_rate": 4.133053429128189e-06, + "logits/chosen": -0.7480210661888123, + "logits/rejected": -0.753775954246521, + "logps/chosen": -1.5403307676315308, + "logps/rejected": -26.26323699951172, + "loss": 0.9074, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.287626177072525, + "rewards/margins": -0.35744258761405945, + "rewards/rejected": 0.6450687646865845, + "step": 7009 + }, + { + "epoch": 1.14, + "learning_rate": 4.131759111665349e-06, + "logits/chosen": -1.4156337976455688, + "logits/rejected": -1.345476508140564, + "logps/chosen": -49.54078674316406, + "logps/rejected": -60.45641326904297, + "loss": 0.1925, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.750072479248047, + "rewards/margins": 1.689762830734253, + "rewards/rejected": 3.060309648513794, + "step": 7010 + }, + { + "epoch": 1.14, + "learning_rate": 4.130464854189739e-06, + "logits/chosen": -0.7662110924720764, + "logits/rejected": -0.769603967666626, + "logps/chosen": -8.954389572143555, + "logps/rejected": -4.889614582061768, + "loss": 0.5013, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.11635780334472656, + "rewards/margins": -0.35862159729003906, + "rewards/rejected": 0.2422637939453125, + "step": 7011 + }, + { + "epoch": 1.14, + "learning_rate": 4.1291706567907794e-06, + "logits/chosen": -1.4593404531478882, + "logits/rejected": -1.362261414527893, + "logps/chosen": -70.17694091796875, + "logps/rejected": -58.203392028808594, + "loss": 0.9552, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7859985828399658, + "rewards/margins": -1.6577560901641846, + "rewards/rejected": 3.4437546730041504, + "step": 7012 + }, + { + "epoch": 1.14, + "learning_rate": 4.127876519557889e-06, + "logits/chosen": -1.208459496498108, + "logits/rejected": -1.1419012546539307, + "logps/chosen": -69.53726196289062, + "logps/rejected": -42.234867095947266, + "loss": 0.326, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1841933727264404, + "rewards/margins": 0.16368746757507324, + "rewards/rejected": 2.020505905151367, + "step": 7013 + }, + { + "epoch": 1.14, + "learning_rate": 4.126582442580478e-06, + "logits/chosen": -1.1607722043991089, + "logits/rejected": -1.1746076345443726, + "logps/chosen": -38.103759765625, + "logps/rejected": -59.31694793701172, + "loss": 1.159, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6795147061347961, + "rewards/margins": 0.23819810152053833, + "rewards/rejected": 0.4413166046142578, + "step": 7014 + }, + { + "epoch": 1.14, + "learning_rate": 4.125288425947956e-06, + "logits/chosen": -1.2852197885513306, + "logits/rejected": -1.2042111158370972, + "logps/chosen": -61.79191589355469, + "logps/rejected": -42.52774429321289, + "loss": 0.3053, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.523320198059082, + "rewards/margins": 2.4806416034698486, + "rewards/rejected": 2.0426785945892334, + "step": 7015 + }, + { + "epoch": 1.14, + "learning_rate": 4.123994469749727e-06, + "logits/chosen": -1.455438256263733, + "logits/rejected": -1.3983522653579712, + "logps/chosen": -82.75358581542969, + "logps/rejected": -56.81158447265625, + "loss": 0.6904, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5797653198242188, + "rewards/margins": 0.011202216148376465, + "rewards/rejected": 1.5685631036758423, + "step": 7016 + }, + { + "epoch": 1.14, + "learning_rate": 4.122700574075192e-06, + "logits/chosen": -1.2517592906951904, + "logits/rejected": -1.216326355934143, + "logps/chosen": -117.36457824707031, + "logps/rejected": -69.54757690429688, + "loss": 0.0794, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.470933437347412, + "rewards/margins": 2.177114725112915, + "rewards/rejected": 3.293818712234497, + "step": 7017 + }, + { + "epoch": 1.14, + "learning_rate": 4.121406739013746e-06, + "logits/chosen": -1.1612368822097778, + "logits/rejected": -1.2371076345443726, + "logps/chosen": -78.9263687133789, + "logps/rejected": -54.667152404785156, + "loss": 1.4755, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0904487371444702, + "rewards/margins": -2.7074198722839355, + "rewards/rejected": 3.797868490219116, + "step": 7018 + }, + { + "epoch": 1.14, + "learning_rate": 4.12011296465478e-06, + "logits/chosen": -2.478484630584717, + "logits/rejected": -2.4419479370117188, + "logps/chosen": -72.78363800048828, + "logps/rejected": -66.91197204589844, + "loss": 0.5247, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8586479425430298, + "rewards/margins": -0.5687600374221802, + "rewards/rejected": 2.42740797996521, + "step": 7019 + }, + { + "epoch": 1.14, + "learning_rate": 4.118819251087682e-06, + "logits/chosen": -1.2105478048324585, + "logits/rejected": -0.9894810318946838, + "logps/chosen": -91.65959167480469, + "logps/rejected": -13.254902839660645, + "loss": 0.1841, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.809832811355591, + "rewards/margins": 1.9204388856887817, + "rewards/rejected": 0.8893939256668091, + "step": 7020 + }, + { + "epoch": 1.14, + "learning_rate": 4.117525598401838e-06, + "logits/chosen": -1.2396743297576904, + "logits/rejected": -1.353581190109253, + "logps/chosen": -118.1949691772461, + "logps/rejected": -79.46885681152344, + "loss": 0.1762, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.87664270401001, + "rewards/margins": 2.1899445056915283, + "rewards/rejected": 2.6866981983184814, + "step": 7021 + }, + { + "epoch": 1.14, + "learning_rate": 4.116232006686624e-06, + "logits/chosen": -1.055827260017395, + "logits/rejected": -1.0005296468734741, + "logps/chosen": -66.55550384521484, + "logps/rejected": -72.01177215576172, + "loss": 0.4784, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8261741399765015, + "rewards/margins": -0.3583153486251831, + "rewards/rejected": 2.1844894886016846, + "step": 7022 + }, + { + "epoch": 1.14, + "learning_rate": 4.114938476031417e-06, + "logits/chosen": -1.1895865201950073, + "logits/rejected": -1.1613733768463135, + "logps/chosen": -33.85837936401367, + "logps/rejected": -21.014013290405273, + "loss": 0.6706, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6901401877403259, + "rewards/margins": -0.35398346185684204, + "rewards/rejected": 1.044123649597168, + "step": 7023 + }, + { + "epoch": 1.14, + "learning_rate": 4.1136450065255855e-06, + "logits/chosen": -1.6911553144454956, + "logits/rejected": -1.6948184967041016, + "logps/chosen": -214.66392517089844, + "logps/rejected": -149.08306884765625, + "loss": 0.1365, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.553825378417969, + "rewards/margins": 1.1957077980041504, + "rewards/rejected": 5.358117580413818, + "step": 7024 + }, + { + "epoch": 1.14, + "learning_rate": 4.112351598258498e-06, + "logits/chosen": -1.2442322969436646, + "logits/rejected": -1.2086715698242188, + "logps/chosen": -111.10447692871094, + "logps/rejected": -38.749977111816406, + "loss": 0.1343, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0468506813049316, + "rewards/margins": 1.9994995594024658, + "rewards/rejected": 1.0473511219024658, + "step": 7025 + }, + { + "epoch": 1.14, + "learning_rate": 4.111058251319517e-06, + "logits/chosen": -1.0803475379943848, + "logits/rejected": -1.1163287162780762, + "logps/chosen": -83.41999816894531, + "logps/rejected": -103.30630493164062, + "loss": 1.366, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1617707014083862, + "rewards/margins": -1.899481177330017, + "rewards/rejected": 3.0612518787384033, + "step": 7026 + }, + { + "epoch": 1.14, + "learning_rate": 4.109764965797997e-06, + "logits/chosen": -1.1322505474090576, + "logits/rejected": -1.1332279443740845, + "logps/chosen": -6.955081939697266, + "logps/rejected": -4.666154861450195, + "loss": 0.4859, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.03267192840576172, + "rewards/margins": -0.15272818505764008, + "rewards/rejected": 0.1854001134634018, + "step": 7027 + }, + { + "epoch": 1.14, + "learning_rate": 4.108471741783297e-06, + "logits/chosen": -1.131808876991272, + "logits/rejected": -0.7979869842529297, + "logps/chosen": -41.73822784423828, + "logps/rejected": -34.60118865966797, + "loss": 0.2789, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.812938690185547, + "rewards/margins": 0.3161032199859619, + "rewards/rejected": 2.496835470199585, + "step": 7028 + }, + { + "epoch": 1.14, + "learning_rate": 4.107178579364763e-06, + "logits/chosen": -1.0760796070098877, + "logits/rejected": -1.0760796070098877, + "logps/chosen": -63.74341583251953, + "logps/rejected": -63.74341583251953, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.829770803451538, + "rewards/margins": 0.0, + "rewards/rejected": 3.829770803451538, + "step": 7029 + }, + { + "epoch": 1.14, + "learning_rate": 4.105885478631741e-06, + "logits/chosen": -1.2955995798110962, + "logits/rejected": -1.2482250928878784, + "logps/chosen": -33.427268981933594, + "logps/rejected": -91.9774169921875, + "loss": 0.2459, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4769084453582764, + "rewards/margins": 0.7446131706237793, + "rewards/rejected": 1.732295274734497, + "step": 7030 + }, + { + "epoch": 1.14, + "learning_rate": 4.104592439673572e-06, + "logits/chosen": -1.3934855461120605, + "logits/rejected": -1.3367313146591187, + "logps/chosen": -72.30635833740234, + "logps/rejected": -122.3299560546875, + "loss": 0.3297, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3041183948516846, + "rewards/margins": 0.13147282600402832, + "rewards/rejected": 2.1726455688476562, + "step": 7031 + }, + { + "epoch": 1.14, + "learning_rate": 4.1032994625795934e-06, + "logits/chosen": -1.1255793571472168, + "logits/rejected": -1.0849640369415283, + "logps/chosen": -55.54285430908203, + "logps/rejected": -92.35264587402344, + "loss": 1.5631, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6954383850097656, + "rewards/margins": -3.068418025970459, + "rewards/rejected": 4.763856410980225, + "step": 7032 + }, + { + "epoch": 1.14, + "learning_rate": 4.1020065474391375e-06, + "logits/chosen": -1.1838148832321167, + "logits/rejected": -1.5682021379470825, + "logps/chosen": -85.10065460205078, + "logps/rejected": -54.29506301879883, + "loss": 0.3175, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.75968337059021, + "rewards/margins": 1.4172123670578003, + "rewards/rejected": 1.3424710035324097, + "step": 7033 + }, + { + "epoch": 1.14, + "learning_rate": 4.1007136943415325e-06, + "logits/chosen": -1.0112104415893555, + "logits/rejected": -0.9859048128128052, + "logps/chosen": -72.55386352539062, + "logps/rejected": -54.77945327758789, + "loss": 0.1577, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.143353223800659, + "rewards/margins": 1.003076434135437, + "rewards/rejected": 1.1402767896652222, + "step": 7034 + }, + { + "epoch": 1.14, + "learning_rate": 4.0994209033761015e-06, + "logits/chosen": -1.3533521890640259, + "logits/rejected": -1.2206872701644897, + "logps/chosen": -153.9826202392578, + "logps/rejected": -157.86073303222656, + "loss": 1.8486, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.9548797607421875, + "rewards/margins": -3.297245979309082, + "rewards/rejected": 9.25212574005127, + "step": 7035 + }, + { + "epoch": 1.14, + "learning_rate": 4.098128174632164e-06, + "logits/chosen": -1.164610505104065, + "logits/rejected": -1.2657767534255981, + "logps/chosen": -47.37642288208008, + "logps/rejected": -114.02627563476562, + "loss": 1.6971, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.947305679321289, + "rewards/margins": -3.012890338897705, + "rewards/rejected": 4.960196018218994, + "step": 7036 + }, + { + "epoch": 1.14, + "learning_rate": 4.0968355081990374e-06, + "logits/chosen": -1.350527048110962, + "logits/rejected": -1.3445122241973877, + "logps/chosen": -71.57215881347656, + "logps/rejected": -97.66726684570312, + "loss": 0.3307, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5944557189941406, + "rewards/margins": 0.1015479564666748, + "rewards/rejected": 1.4929077625274658, + "step": 7037 + }, + { + "epoch": 1.14, + "learning_rate": 4.095542904166031e-06, + "logits/chosen": -1.1744896173477173, + "logits/rejected": -1.144808292388916, + "logps/chosen": -80.68948364257812, + "logps/rejected": -115.0456314086914, + "loss": 0.2124, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0039849281311035, + "rewards/margins": 1.223962426185608, + "rewards/rejected": 1.7800225019454956, + "step": 7038 + }, + { + "epoch": 1.14, + "learning_rate": 4.0942503626224515e-06, + "logits/chosen": -1.144008994102478, + "logits/rejected": -1.0009489059448242, + "logps/chosen": -106.63921356201172, + "logps/rejected": -88.57879638671875, + "loss": 0.0997, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.018108367919922, + "rewards/margins": 2.7743332386016846, + "rewards/rejected": 3.2437751293182373, + "step": 7039 + }, + { + "epoch": 1.14, + "learning_rate": 4.092957883657604e-06, + "logits/chosen": -0.7100602984428406, + "logits/rejected": -0.7138444185256958, + "logps/chosen": -1.6690007448196411, + "logps/rejected": -21.40056800842285, + "loss": 0.7367, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.25964006781578064, + "rewards/margins": -0.023511379957199097, + "rewards/rejected": 0.28315144777297974, + "step": 7040 + }, + { + "epoch": 1.14, + "learning_rate": 4.091665467360781e-06, + "logits/chosen": -1.1952780485153198, + "logits/rejected": -1.1202960014343262, + "logps/chosen": -112.54883575439453, + "logps/rejected": -43.60211944580078, + "loss": 0.0536, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.10339879989624, + "rewards/margins": 2.561187744140625, + "rewards/rejected": 1.5422111749649048, + "step": 7041 + }, + { + "epoch": 1.14, + "learning_rate": 4.090373113821281e-06, + "logits/chosen": -1.4741272926330566, + "logits/rejected": -1.3407080173492432, + "logps/chosen": -97.59468841552734, + "logps/rejected": -54.58555603027344, + "loss": 0.0791, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.836609840393066, + "rewards/margins": 3.0468180179595947, + "rewards/rejected": 2.7897918224334717, + "step": 7042 + }, + { + "epoch": 1.14, + "learning_rate": 4.0890808231283915e-06, + "logits/chosen": -1.2069835662841797, + "logits/rejected": -1.219836950302124, + "logps/chosen": -106.74591827392578, + "logps/rejected": -55.08312225341797, + "loss": 0.3586, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.808513641357422, + "rewards/margins": 1.5228302478790283, + "rewards/rejected": 2.2856833934783936, + "step": 7043 + }, + { + "epoch": 1.14, + "learning_rate": 4.087788595371397e-06, + "logits/chosen": -1.221043586730957, + "logits/rejected": -1.2779335975646973, + "logps/chosen": -63.031333923339844, + "logps/rejected": -64.7325439453125, + "loss": 0.4257, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6903159618377686, + "rewards/margins": 0.008840084075927734, + "rewards/rejected": 2.681475877761841, + "step": 7044 + }, + { + "epoch": 1.14, + "learning_rate": 4.086496430639581e-06, + "logits/chosen": -1.2670660018920898, + "logits/rejected": -1.2285698652267456, + "logps/chosen": -154.13949584960938, + "logps/rejected": -82.74267578125, + "loss": 0.7717, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.4431962966918945, + "rewards/margins": 0.513737678527832, + "rewards/rejected": 6.9294586181640625, + "step": 7045 + }, + { + "epoch": 1.14, + "learning_rate": 4.085204329022216e-06, + "logits/chosen": -0.9752064347267151, + "logits/rejected": -0.9732920527458191, + "logps/chosen": -3.1443276405334473, + "logps/rejected": -6.315860748291016, + "loss": 0.3364, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3300468921661377, + "rewards/margins": 0.21455827355384827, + "rewards/rejected": 0.11548862606287003, + "step": 7046 + }, + { + "epoch": 1.14, + "learning_rate": 4.083912290608577e-06, + "logits/chosen": -0.7846000790596008, + "logits/rejected": -0.8026823401451111, + "logps/chosen": -7.3214945793151855, + "logps/rejected": -22.617197036743164, + "loss": 0.9425, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.37821003794670105, + "rewards/margins": -0.8298403024673462, + "rewards/rejected": 1.2080503702163696, + "step": 7047 + }, + { + "epoch": 1.14, + "learning_rate": 4.082620315487931e-06, + "logits/chosen": -1.2713072299957275, + "logits/rejected": -1.2751209735870361, + "logps/chosen": -74.69698333740234, + "logps/rejected": -74.4661636352539, + "loss": 1.5446, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6351478099823, + "rewards/margins": -0.8296701908111572, + "rewards/rejected": 4.464818000793457, + "step": 7048 + }, + { + "epoch": 1.14, + "learning_rate": 4.08132840374954e-06, + "logits/chosen": -1.5020649433135986, + "logits/rejected": -1.471886157989502, + "logps/chosen": -32.28584289550781, + "logps/rejected": -90.30702209472656, + "loss": 0.5869, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6723037958145142, + "rewards/margins": -0.33161771297454834, + "rewards/rejected": 2.0039215087890625, + "step": 7049 + }, + { + "epoch": 1.14, + "learning_rate": 4.080036555482665e-06, + "logits/chosen": -1.3391518592834473, + "logits/rejected": -1.2589508295059204, + "logps/chosen": -100.3930435180664, + "logps/rejected": -65.00703430175781, + "loss": 0.1276, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.955690860748291, + "rewards/margins": 3.1472983360290527, + "rewards/rejected": 1.8083924055099487, + "step": 7050 + }, + { + "epoch": 1.14, + "learning_rate": 4.078744770776558e-06, + "logits/chosen": -1.220554232597351, + "logits/rejected": -1.3050390481948853, + "logps/chosen": -93.6842041015625, + "logps/rejected": -118.94697570800781, + "loss": 0.5175, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6145951747894287, + "rewards/margins": -0.3239316940307617, + "rewards/rejected": 2.9385268688201904, + "step": 7051 + }, + { + "epoch": 1.14, + "learning_rate": 4.077453049720472e-06, + "logits/chosen": -1.2877591848373413, + "logits/rejected": -1.138850212097168, + "logps/chosen": -57.08938217163086, + "logps/rejected": -41.963558197021484, + "loss": 0.1745, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5918736457824707, + "rewards/margins": 1.0782356262207031, + "rewards/rejected": 2.5136380195617676, + "step": 7052 + }, + { + "epoch": 1.14, + "learning_rate": 4.076161392403649e-06, + "logits/chosen": -1.7865616083145142, + "logits/rejected": -1.7497574090957642, + "logps/chosen": -86.6309814453125, + "logps/rejected": -60.777183532714844, + "loss": 0.42, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0079307556152344, + "rewards/margins": -0.253481388092041, + "rewards/rejected": 3.2614121437072754, + "step": 7053 + }, + { + "epoch": 1.14, + "learning_rate": 4.074869798915333e-06, + "logits/chosen": -1.5746828317642212, + "logits/rejected": -1.3896076679229736, + "logps/chosen": -112.64021301269531, + "logps/rejected": -12.48257064819336, + "loss": 0.0299, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.77626371383667, + "rewards/margins": 4.3776140213012695, + "rewards/rejected": 1.3986494541168213, + "step": 7054 + }, + { + "epoch": 1.15, + "learning_rate": 4.07357826934476e-06, + "logits/chosen": -1.162523627281189, + "logits/rejected": -1.1572375297546387, + "logps/chosen": -54.36701965332031, + "logps/rejected": -76.59354400634766, + "loss": 0.1126, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6558175086975098, + "rewards/margins": 1.382533311843872, + "rewards/rejected": 1.2732841968536377, + "step": 7055 + }, + { + "epoch": 1.15, + "learning_rate": 4.072286803781164e-06, + "logits/chosen": -1.1364657878875732, + "logits/rejected": -1.016300082206726, + "logps/chosen": -173.3133544921875, + "logps/rejected": -220.36911010742188, + "loss": 1.369, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.60426664352417, + "rewards/margins": -1.317699909210205, + "rewards/rejected": 6.921966552734375, + "step": 7056 + }, + { + "epoch": 1.15, + "learning_rate": 4.07099540231377e-06, + "logits/chosen": -1.0589561462402344, + "logits/rejected": -1.0661451816558838, + "logps/chosen": -17.883209228515625, + "logps/rejected": -24.70088005065918, + "loss": 0.7705, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7448108792304993, + "rewards/margins": 0.33088037371635437, + "rewards/rejected": 0.4139305055141449, + "step": 7057 + }, + { + "epoch": 1.15, + "learning_rate": 4.069704065031804e-06, + "logits/chosen": -0.6118764877319336, + "logits/rejected": -0.6118764877319336, + "logps/chosen": -44.41313934326172, + "logps/rejected": -44.41313934326172, + "loss": 0.5396, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3533012866973877, + "rewards/margins": 0.0, + "rewards/rejected": 1.3533012866973877, + "step": 7058 + }, + { + "epoch": 1.15, + "learning_rate": 4.068412792024486e-06, + "logits/chosen": -1.4007071256637573, + "logits/rejected": -1.3880865573883057, + "logps/chosen": -83.97450256347656, + "logps/rejected": -55.726112365722656, + "loss": 0.8524, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.867720127105713, + "rewards/margins": 0.07881712913513184, + "rewards/rejected": 2.788902997970581, + "step": 7059 + }, + { + "epoch": 1.15, + "learning_rate": 4.067121583381027e-06, + "logits/chosen": -1.5473946332931519, + "logits/rejected": -1.5112501382827759, + "logps/chosen": -110.05032348632812, + "logps/rejected": -43.50425338745117, + "loss": 0.0346, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.964552879333496, + "rewards/margins": 7.379549503326416, + "rewards/rejected": 1.5850032567977905, + "step": 7060 + }, + { + "epoch": 1.15, + "learning_rate": 4.065830439190641e-06, + "logits/chosen": -1.3383389711380005, + "logits/rejected": -1.3472014665603638, + "logps/chosen": -51.81520080566406, + "logps/rejected": -101.03282165527344, + "loss": 1.5612, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4360519647598267, + "rewards/margins": -1.2599915266036987, + "rewards/rejected": 2.6960434913635254, + "step": 7061 + }, + { + "epoch": 1.15, + "learning_rate": 4.064539359542532e-06, + "logits/chosen": -1.0785565376281738, + "logits/rejected": -1.1057542562484741, + "logps/chosen": -64.54246520996094, + "logps/rejected": -73.96239471435547, + "loss": 1.3749, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8679580688476562, + "rewards/margins": -2.3420281410217285, + "rewards/rejected": 5.209986209869385, + "step": 7062 + }, + { + "epoch": 1.15, + "learning_rate": 4.063248344525903e-06, + "logits/chosen": -1.3956888914108276, + "logits/rejected": -1.3629895448684692, + "logps/chosen": -48.23462677001953, + "logps/rejected": -53.56207275390625, + "loss": 0.3612, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.300581455230713, + "rewards/margins": -0.03045964241027832, + "rewards/rejected": 2.331041097640991, + "step": 7063 + }, + { + "epoch": 1.15, + "learning_rate": 4.061957394229949e-06, + "logits/chosen": -1.251833200454712, + "logits/rejected": -1.2097407579421997, + "logps/chosen": -46.77806854248047, + "logps/rejected": -50.4183349609375, + "loss": 0.6517, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.498642683029175, + "rewards/margins": -0.9848191738128662, + "rewards/rejected": 3.483461856842041, + "step": 7064 + }, + { + "epoch": 1.15, + "learning_rate": 4.060666508743863e-06, + "logits/chosen": -1.1625652313232422, + "logits/rejected": -1.1229872703552246, + "logps/chosen": -54.512840270996094, + "logps/rejected": -119.7962417602539, + "loss": 0.2319, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4315574169158936, + "rewards/margins": 0.5732635259628296, + "rewards/rejected": 1.858293890953064, + "step": 7065 + }, + { + "epoch": 1.15, + "learning_rate": 4.059375688156833e-06, + "logits/chosen": -1.022790789604187, + "logits/rejected": -1.0806005001068115, + "logps/chosen": -42.70428466796875, + "logps/rejected": -72.57906341552734, + "loss": 0.8133, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.510615587234497, + "rewards/margins": -1.0411324501037598, + "rewards/rejected": 2.551748037338257, + "step": 7066 + }, + { + "epoch": 1.15, + "learning_rate": 4.058084932558042e-06, + "logits/chosen": -1.2326256036758423, + "logits/rejected": -1.2819080352783203, + "logps/chosen": -76.79973602294922, + "logps/rejected": -94.46369934082031, + "loss": 0.3283, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1336441040039062, + "rewards/margins": 0.3694305419921875, + "rewards/rejected": 1.7642135620117188, + "step": 7067 + }, + { + "epoch": 1.15, + "learning_rate": 4.056794242036671e-06, + "logits/chosen": -1.2831215858459473, + "logits/rejected": -1.2643145322799683, + "logps/chosen": -41.33189392089844, + "logps/rejected": -68.79176330566406, + "loss": 0.2394, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.292607069015503, + "rewards/margins": 0.5958418846130371, + "rewards/rejected": 2.696765184402466, + "step": 7068 + }, + { + "epoch": 1.15, + "learning_rate": 4.055503616681893e-06, + "logits/chosen": -1.7464110851287842, + "logits/rejected": -1.7195944786071777, + "logps/chosen": -52.94987869262695, + "logps/rejected": -66.90913391113281, + "loss": 0.7226, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2624988555908203, + "rewards/margins": -0.4868342876434326, + "rewards/rejected": 3.749333143234253, + "step": 7069 + }, + { + "epoch": 1.15, + "learning_rate": 4.054213056582877e-06, + "logits/chosen": -1.4746421575546265, + "logits/rejected": -1.4047269821166992, + "logps/chosen": -167.7520294189453, + "logps/rejected": -166.01942443847656, + "loss": 0.2537, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.500041007995605, + "rewards/margins": 0.41880130767822266, + "rewards/rejected": 9.081239700317383, + "step": 7070 + }, + { + "epoch": 1.15, + "learning_rate": 4.052922561828792e-06, + "logits/chosen": -1.3280400037765503, + "logits/rejected": -1.1521154642105103, + "logps/chosen": -84.73406982421875, + "logps/rejected": -24.926895141601562, + "loss": 0.0545, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1785600185394287, + "rewards/margins": 2.23622989654541, + "rewards/rejected": 0.9423301815986633, + "step": 7071 + }, + { + "epoch": 1.15, + "learning_rate": 4.0516321325087945e-06, + "logits/chosen": -1.2752625942230225, + "logits/rejected": -1.3597058057785034, + "logps/chosen": -40.674652099609375, + "logps/rejected": -115.56633758544922, + "loss": 2.1835, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.0467987060546875, + "rewards/margins": -3.778522491455078, + "rewards/rejected": 7.825321197509766, + "step": 7072 + }, + { + "epoch": 1.15, + "learning_rate": 4.050341768712044e-06, + "logits/chosen": -1.5836477279663086, + "logits/rejected": -1.4742287397384644, + "logps/chosen": -75.33816528320312, + "logps/rejected": -111.552490234375, + "loss": 0.0439, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.7282867431640625, + "rewards/margins": 2.7774107456207275, + "rewards/rejected": 2.950875997543335, + "step": 7073 + }, + { + "epoch": 1.15, + "learning_rate": 4.049051470527692e-06, + "logits/chosen": -1.4284002780914307, + "logits/rejected": -1.3884013891220093, + "logps/chosen": -165.60218811035156, + "logps/rejected": -154.3708953857422, + "loss": 0.4301, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.029585361480713, + "rewards/margins": -0.220794677734375, + "rewards/rejected": 5.250380039215088, + "step": 7074 + }, + { + "epoch": 1.15, + "learning_rate": 4.047761238044884e-06, + "logits/chosen": -1.0995969772338867, + "logits/rejected": -1.1249345541000366, + "logps/chosen": -113.38336181640625, + "logps/rejected": -124.81238555908203, + "loss": 0.7097, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.492269992828369, + "rewards/margins": 0.35454487800598145, + "rewards/rejected": 3.1377251148223877, + "step": 7075 + }, + { + "epoch": 1.15, + "learning_rate": 4.0464710713527646e-06, + "logits/chosen": -1.449973464012146, + "logits/rejected": -1.4228105545043945, + "logps/chosen": -44.07145690917969, + "logps/rejected": -74.84687805175781, + "loss": 0.2624, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.187666416168213, + "rewards/margins": 1.1291618347167969, + "rewards/rejected": 4.058504581451416, + "step": 7076 + }, + { + "epoch": 1.15, + "learning_rate": 4.045180970540472e-06, + "logits/chosen": -1.4682494401931763, + "logits/rejected": -1.277836561203003, + "logps/chosen": -192.04315185546875, + "logps/rejected": -55.5047607421875, + "loss": 0.0706, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1010329723358154, + "rewards/margins": 2.5551817417144775, + "rewards/rejected": 0.5458511710166931, + "step": 7077 + }, + { + "epoch": 1.15, + "learning_rate": 4.0438909356971405e-06, + "logits/chosen": -1.1804561614990234, + "logits/rejected": -1.2602643966674805, + "logps/chosen": -221.86105346679688, + "logps/rejected": -80.93485260009766, + "loss": 0.0449, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.850302219390869, + "rewards/margins": 4.441301345825195, + "rewards/rejected": 2.409001111984253, + "step": 7078 + }, + { + "epoch": 1.15, + "learning_rate": 4.042600966911897e-06, + "logits/chosen": -1.0558006763458252, + "logits/rejected": -1.0191693305969238, + "logps/chosen": -44.98345947265625, + "logps/rejected": -38.65721130371094, + "loss": 0.455, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.112107992172241, + "rewards/margins": -0.3926239013671875, + "rewards/rejected": 3.5047318935394287, + "step": 7079 + }, + { + "epoch": 1.15, + "learning_rate": 4.041311064273869e-06, + "logits/chosen": -0.9800359606742859, + "logits/rejected": -0.9800359606742859, + "logps/chosen": -68.4776611328125, + "logps/rejected": -68.4776611328125, + "loss": 0.3475, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.618931531906128, + "rewards/margins": 0.0, + "rewards/rejected": 3.618931531906128, + "step": 7080 + }, + { + "epoch": 1.15, + "learning_rate": 4.040021227872175e-06, + "logits/chosen": -1.3450536727905273, + "logits/rejected": -1.3336520195007324, + "logps/chosen": -96.17303466796875, + "logps/rejected": -52.28898620605469, + "loss": 0.5598, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6637496948242188, + "rewards/margins": -0.6997725963592529, + "rewards/rejected": 3.3635222911834717, + "step": 7081 + }, + { + "epoch": 1.15, + "learning_rate": 4.0387314577959315e-06, + "logits/chosen": -1.4312143325805664, + "logits/rejected": -1.4617366790771484, + "logps/chosen": -39.760780334472656, + "logps/rejected": -43.24684143066406, + "loss": 2.7159, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7825400829315186, + "rewards/margins": -1.8588473796844482, + "rewards/rejected": 4.641387462615967, + "step": 7082 + }, + { + "epoch": 1.15, + "learning_rate": 4.037441754134247e-06, + "logits/chosen": -0.9789579510688782, + "logits/rejected": -0.9764392971992493, + "logps/chosen": -92.4750747680664, + "logps/rejected": -87.30435180664062, + "loss": 0.696, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1886001825332642, + "rewards/margins": 0.22238922119140625, + "rewards/rejected": 0.9662109613418579, + "step": 7083 + }, + { + "epoch": 1.15, + "learning_rate": 4.036152116976231e-06, + "logits/chosen": -1.2245134115219116, + "logits/rejected": -1.0317424535751343, + "logps/chosen": -74.75071716308594, + "logps/rejected": -139.39031982421875, + "loss": 0.025, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.049363613128662, + "rewards/margins": 5.848710536956787, + "rewards/rejected": 0.200653076171875, + "step": 7084 + }, + { + "epoch": 1.15, + "learning_rate": 4.034862546410983e-06, + "logits/chosen": -1.2540934085845947, + "logits/rejected": -1.1953986883163452, + "logps/chosen": -53.61029815673828, + "logps/rejected": -75.32425689697266, + "loss": 0.6489, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9086220264434814, + "rewards/margins": -0.8455147743225098, + "rewards/rejected": 2.754136800765991, + "step": 7085 + }, + { + "epoch": 1.15, + "learning_rate": 4.033573042527601e-06, + "logits/chosen": -1.179931640625, + "logits/rejected": -1.1659144163131714, + "logps/chosen": -44.9771842956543, + "logps/rejected": -120.11680603027344, + "loss": 0.5399, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.731417417526245, + "rewards/margins": 1.6294353008270264, + "rewards/rejected": 1.1019821166992188, + "step": 7086 + }, + { + "epoch": 1.15, + "learning_rate": 4.032283605415177e-06, + "logits/chosen": -1.335659146308899, + "logits/rejected": -1.335659146308899, + "logps/chosen": -36.920509338378906, + "logps/rejected": -36.920509338378906, + "loss": 0.3499, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7772037386894226, + "rewards/margins": 0.0, + "rewards/rejected": 0.7772037386894226, + "step": 7087 + }, + { + "epoch": 1.15, + "learning_rate": 4.0309942351628e-06, + "logits/chosen": -1.0849393606185913, + "logits/rejected": -1.0952606201171875, + "logps/chosen": -1.6467450857162476, + "logps/rejected": -10.34144401550293, + "loss": 0.4734, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.25688037276268005, + "rewards/margins": -0.4517883360385895, + "rewards/rejected": 0.7086687088012695, + "step": 7088 + }, + { + "epoch": 1.15, + "learning_rate": 4.029704931859552e-06, + "logits/chosen": -1.3126472234725952, + "logits/rejected": -1.3518799543380737, + "logps/chosen": -100.252197265625, + "logps/rejected": -103.00098419189453, + "loss": 0.8608, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2533905506134033, + "rewards/margins": -1.112004041671753, + "rewards/rejected": 2.3653945922851562, + "step": 7089 + }, + { + "epoch": 1.15, + "learning_rate": 4.028415695594512e-06, + "logits/chosen": -1.2685506343841553, + "logits/rejected": -1.1964596509933472, + "logps/chosen": -41.111873626708984, + "logps/rejected": -59.71146011352539, + "loss": 0.3864, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5477046966552734, + "rewards/margins": -0.12290501594543457, + "rewards/rejected": 2.670609712600708, + "step": 7090 + }, + { + "epoch": 1.15, + "learning_rate": 4.027126526456755e-06, + "logits/chosen": -0.9237604737281799, + "logits/rejected": -0.8158342838287354, + "logps/chosen": -58.02996826171875, + "logps/rejected": -15.334199905395508, + "loss": 0.408, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9240798950195312, + "rewards/margins": 0.14376789331436157, + "rewards/rejected": 0.7803120017051697, + "step": 7091 + }, + { + "epoch": 1.15, + "learning_rate": 4.025837424535348e-06, + "logits/chosen": -1.4550293684005737, + "logits/rejected": -1.430308222770691, + "logps/chosen": -88.33415222167969, + "logps/rejected": -31.82343101501465, + "loss": 0.8231, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.128544807434082, + "rewards/margins": 1.2212715148925781, + "rewards/rejected": 2.907273292541504, + "step": 7092 + }, + { + "epoch": 1.15, + "learning_rate": 4.02454838991936e-06, + "logits/chosen": -1.0727273225784302, + "logits/rejected": -1.0880078077316284, + "logps/chosen": -31.134654998779297, + "logps/rejected": -38.38385772705078, + "loss": 0.4868, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7442684173583984, + "rewards/margins": -0.3525817394256592, + "rewards/rejected": 2.0968501567840576, + "step": 7093 + }, + { + "epoch": 1.15, + "learning_rate": 4.023259422697846e-06, + "logits/chosen": -1.1246885061264038, + "logits/rejected": -1.1405400037765503, + "logps/chosen": -68.48774719238281, + "logps/rejected": -73.89399719238281, + "loss": 0.576, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.270808458328247, + "rewards/margins": 0.4431755542755127, + "rewards/rejected": 1.8276329040527344, + "step": 7094 + }, + { + "epoch": 1.15, + "learning_rate": 4.021970522959865e-06, + "logits/chosen": -1.564043641090393, + "logits/rejected": -1.6397191286087036, + "logps/chosen": -85.02911376953125, + "logps/rejected": -122.63528442382812, + "loss": 2.8785, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.540791392326355, + "rewards/margins": -4.761044025421143, + "rewards/rejected": 6.301835536956787, + "step": 7095 + }, + { + "epoch": 1.15, + "learning_rate": 4.020681690794467e-06, + "logits/chosen": -1.5128816366195679, + "logits/rejected": -1.5284587144851685, + "logps/chosen": -47.97034454345703, + "logps/rejected": -63.001617431640625, + "loss": 0.1487, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8372247219085693, + "rewards/margins": 1.5080453157424927, + "rewards/rejected": 1.3291794061660767, + "step": 7096 + }, + { + "epoch": 1.15, + "learning_rate": 4.019392926290697e-06, + "logits/chosen": -1.4744099378585815, + "logits/rejected": -1.3865524530410767, + "logps/chosen": -191.7743682861328, + "logps/rejected": -160.15255737304688, + "loss": 1.4907, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.599198818206787, + "rewards/margins": -2.922372341156006, + "rewards/rejected": 9.521571159362793, + "step": 7097 + }, + { + "epoch": 1.15, + "learning_rate": 4.018104229537597e-06, + "logits/chosen": -1.080416202545166, + "logits/rejected": -1.0467332601547241, + "logps/chosen": -82.55572509765625, + "logps/rejected": -128.7731475830078, + "loss": 1.1051, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.808274030685425, + "rewards/margins": 1.5676352977752686, + "rewards/rejected": 1.2406387329101562, + "step": 7098 + }, + { + "epoch": 1.15, + "learning_rate": 4.016815600624204e-06, + "logits/chosen": -1.1024959087371826, + "logits/rejected": -1.1024959087371826, + "logps/chosen": -34.422935485839844, + "logps/rejected": -34.422935485839844, + "loss": 0.356, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7220977544784546, + "rewards/margins": 0.0, + "rewards/rejected": 1.7220977544784546, + "step": 7099 + }, + { + "epoch": 1.15, + "learning_rate": 4.015527039639551e-06, + "logits/chosen": -1.281922459602356, + "logits/rejected": -1.1431814432144165, + "logps/chosen": -102.59321594238281, + "logps/rejected": -52.83769226074219, + "loss": 0.9227, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.55295729637146, + "rewards/margins": 0.2836320400238037, + "rewards/rejected": 2.2693252563476562, + "step": 7100 + }, + { + "epoch": 1.15, + "learning_rate": 4.014238546672664e-06, + "logits/chosen": -1.2594890594482422, + "logits/rejected": -1.1454226970672607, + "logps/chosen": -78.98828887939453, + "logps/rejected": -31.737808227539062, + "loss": 0.307, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6425567865371704, + "rewards/margins": 1.167912244796753, + "rewards/rejected": 0.4746444821357727, + "step": 7101 + }, + { + "epoch": 1.15, + "learning_rate": 4.012950121812566e-06, + "logits/chosen": -1.2212227582931519, + "logits/rejected": -1.137832760810852, + "logps/chosen": -73.76145935058594, + "logps/rejected": -27.684032440185547, + "loss": 0.2021, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5951592922210693, + "rewards/margins": 1.2135647535324097, + "rewards/rejected": 1.3815945386886597, + "step": 7102 + }, + { + "epoch": 1.15, + "learning_rate": 4.011661765148275e-06, + "logits/chosen": -1.0882807970046997, + "logits/rejected": -1.0404856204986572, + "logps/chosen": -47.46345138549805, + "logps/rejected": -23.735027313232422, + "loss": 0.2204, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7759426832199097, + "rewards/margins": 1.6263169050216675, + "rewards/rejected": 0.1496257781982422, + "step": 7103 + }, + { + "epoch": 1.15, + "learning_rate": 4.010373476768803e-06, + "logits/chosen": -1.254304051399231, + "logits/rejected": -1.3033381700515747, + "logps/chosen": -41.8245849609375, + "logps/rejected": -59.54743194580078, + "loss": 1.4516, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5447654724121094, + "rewards/margins": -2.767338752746582, + "rewards/rejected": 4.312104225158691, + "step": 7104 + }, + { + "epoch": 1.15, + "learning_rate": 4.009085256763162e-06, + "logits/chosen": -1.3107869625091553, + "logits/rejected": -1.3107869625091553, + "logps/chosen": -39.98365020751953, + "logps/rejected": -39.98365020751953, + "loss": 0.3472, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.958216905593872, + "rewards/margins": 0.0, + "rewards/rejected": 2.958216905593872, + "step": 7105 + }, + { + "epoch": 1.15, + "learning_rate": 4.007797105220352e-06, + "logits/chosen": -1.2681275606155396, + "logits/rejected": -1.1342686414718628, + "logps/chosen": -66.5087890625, + "logps/rejected": -14.064006805419922, + "loss": 0.022, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.097084999084473, + "rewards/margins": 3.233607292175293, + "rewards/rejected": 0.8634777069091797, + "step": 7106 + }, + { + "epoch": 1.15, + "learning_rate": 4.006509022229374e-06, + "logits/chosen": -1.2684038877487183, + "logits/rejected": -1.2581617832183838, + "logps/chosen": -53.50782012939453, + "logps/rejected": -63.2964973449707, + "loss": 0.2459, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.35649037361145, + "rewards/margins": 0.586147665977478, + "rewards/rejected": 1.7703427076339722, + "step": 7107 + }, + { + "epoch": 1.15, + "learning_rate": 4.0052210078792234e-06, + "logits/chosen": -1.6770012378692627, + "logits/rejected": -1.643280029296875, + "logps/chosen": -60.56669998168945, + "logps/rejected": -70.54608154296875, + "loss": 1.0688, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.372587203979492, + "rewards/margins": 1.6047687530517578, + "rewards/rejected": 2.7678184509277344, + "step": 7108 + }, + { + "epoch": 1.15, + "learning_rate": 4.003933062258887e-06, + "logits/chosen": -1.674172282218933, + "logits/rejected": -1.3141952753067017, + "logps/chosen": -38.16584014892578, + "logps/rejected": -95.62387084960938, + "loss": 1.2325, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.391644239425659, + "rewards/margins": -1.3847534656524658, + "rewards/rejected": 3.776397705078125, + "step": 7109 + }, + { + "epoch": 1.15, + "learning_rate": 4.0026451854573515e-06, + "logits/chosen": -1.4981358051300049, + "logits/rejected": -1.5178189277648926, + "logps/chosen": -35.43840789794922, + "logps/rejected": -60.55071258544922, + "loss": 0.3549, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6240203380584717, + "rewards/margins": 0.09205007553100586, + "rewards/rejected": 3.531970262527466, + "step": 7110 + }, + { + "epoch": 1.15, + "learning_rate": 4.0013573775635965e-06, + "logits/chosen": -1.3277909755706787, + "logits/rejected": -1.1753277778625488, + "logps/chosen": -81.92731475830078, + "logps/rejected": -31.9525146484375, + "loss": 0.4112, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.388523817062378, + "rewards/margins": 2.6275148391723633, + "rewards/rejected": 0.7610088586807251, + "step": 7111 + }, + { + "epoch": 1.15, + "learning_rate": 4.000069638666597e-06, + "logits/chosen": -0.9374929666519165, + "logits/rejected": -0.8649899959564209, + "logps/chosen": -31.177656173706055, + "logps/rejected": -34.72185516357422, + "loss": 0.9491, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5703619718551636, + "rewards/margins": -1.6854180097579956, + "rewards/rejected": 3.255779981613159, + "step": 7112 + }, + { + "epoch": 1.15, + "learning_rate": 3.998781968855325e-06, + "logits/chosen": -1.1395013332366943, + "logits/rejected": -1.1827433109283447, + "logps/chosen": -52.62199783325195, + "logps/rejected": -74.30021667480469, + "loss": 0.7262, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.682080864906311, + "rewards/margins": -1.098900556564331, + "rewards/rejected": 1.780981421470642, + "step": 7113 + }, + { + "epoch": 1.15, + "learning_rate": 3.997494368218745e-06, + "logits/chosen": -1.4110114574432373, + "logits/rejected": -1.4110114574432373, + "logps/chosen": -78.48196411132812, + "logps/rejected": -78.48196411132812, + "loss": 0.7606, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.938914775848389, + "rewards/margins": 0.0, + "rewards/rejected": 4.938914775848389, + "step": 7114 + }, + { + "epoch": 1.15, + "learning_rate": 3.996206836845818e-06, + "logits/chosen": -0.8755646347999573, + "logits/rejected": -0.8965729475021362, + "logps/chosen": -62.32687759399414, + "logps/rejected": -126.32888793945312, + "loss": 1.2126, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0903422832489014, + "rewards/margins": 0.659424901008606, + "rewards/rejected": 1.4309173822402954, + "step": 7115 + }, + { + "epoch": 1.16, + "learning_rate": 3.994919374825501e-06, + "logits/chosen": -1.2831522226333618, + "logits/rejected": -1.2423138618469238, + "logps/chosen": -52.5897216796875, + "logps/rejected": -46.396141052246094, + "loss": 1.1952, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6702880859375, + "rewards/margins": -2.0232925415039062, + "rewards/rejected": 4.693580627441406, + "step": 7116 + }, + { + "epoch": 1.16, + "learning_rate": 3.993631982246745e-06, + "logits/chosen": -0.7747927904129028, + "logits/rejected": -0.7747927904129028, + "logps/chosen": -93.129638671875, + "logps/rejected": -93.129638671875, + "loss": 0.7033, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2650344371795654, + "rewards/margins": 0.0, + "rewards/rejected": 2.2650344371795654, + "step": 7117 + }, + { + "epoch": 1.16, + "learning_rate": 3.992344659198497e-06, + "logits/chosen": -1.3476121425628662, + "logits/rejected": -1.3060566186904907, + "logps/chosen": -44.91364288330078, + "logps/rejected": -63.656585693359375, + "loss": 0.6633, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6289265155792236, + "rewards/margins": -0.21681630611419678, + "rewards/rejected": 1.8457428216934204, + "step": 7118 + }, + { + "epoch": 1.16, + "learning_rate": 3.991057405769698e-06, + "logits/chosen": -1.4431933164596558, + "logits/rejected": -1.4279344081878662, + "logps/chosen": -168.1122283935547, + "logps/rejected": -268.93939208984375, + "loss": 0.3891, + "rewards/accuracies": 0.0, + "rewards/chosen": 8.555989265441895, + "rewards/margins": -0.13151073455810547, + "rewards/rejected": 8.6875, + "step": 7119 + }, + { + "epoch": 1.16, + "learning_rate": 3.989770222049286e-06, + "logits/chosen": -1.4377824068069458, + "logits/rejected": -1.2181178331375122, + "logps/chosen": -156.16995239257812, + "logps/rejected": -39.811187744140625, + "loss": 0.0239, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.371748447418213, + "rewards/margins": 4.569401741027832, + "rewards/rejected": 1.8023468255996704, + "step": 7120 + }, + { + "epoch": 1.16, + "learning_rate": 3.988483108126193e-06, + "logits/chosen": -1.2401107549667358, + "logits/rejected": -1.2011231184005737, + "logps/chosen": -72.67523193359375, + "logps/rejected": -71.16160583496094, + "loss": 0.6196, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.465660810470581, + "rewards/margins": 0.8264014720916748, + "rewards/rejected": 1.6392593383789062, + "step": 7121 + }, + { + "epoch": 1.16, + "learning_rate": 3.987196064089346e-06, + "logits/chosen": -1.273672103881836, + "logits/rejected": -1.196982979774475, + "logps/chosen": -88.71109008789062, + "logps/rejected": -65.96659088134766, + "loss": 0.6306, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.745556831359863, + "rewards/margins": 2.1150259971618652, + "rewards/rejected": 5.630530834197998, + "step": 7122 + }, + { + "epoch": 1.16, + "learning_rate": 3.9859090900276675e-06, + "logits/chosen": -1.0213851928710938, + "logits/rejected": -0.9542393088340759, + "logps/chosen": -54.97098922729492, + "logps/rejected": -29.76279640197754, + "loss": 1.4905, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0788601636886597, + "rewards/margins": -1.5185669660568237, + "rewards/rejected": 2.5974271297454834, + "step": 7123 + }, + { + "epoch": 1.16, + "learning_rate": 3.9846221860300745e-06, + "logits/chosen": -0.9403972625732422, + "logits/rejected": -0.7947718501091003, + "logps/chosen": -33.110328674316406, + "logps/rejected": -11.234911918640137, + "loss": 1.978, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3495171070098877, + "rewards/margins": 1.812713623046875, + "rewards/rejected": 0.5368035435676575, + "step": 7124 + }, + { + "epoch": 1.16, + "learning_rate": 3.983335352185482e-06, + "logits/chosen": -1.184975028038025, + "logits/rejected": -1.161199927330017, + "logps/chosen": -39.99999237060547, + "logps/rejected": -24.79792022705078, + "loss": 0.7096, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.277485728263855, + "rewards/margins": 0.7788294553756714, + "rewards/rejected": 0.4986562728881836, + "step": 7125 + }, + { + "epoch": 1.16, + "learning_rate": 3.9820485885827974e-06, + "logits/chosen": -1.3257213830947876, + "logits/rejected": -1.1603325605392456, + "logps/chosen": -141.09593200683594, + "logps/rejected": -33.950408935546875, + "loss": 0.0395, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.494810581207275, + "rewards/margins": 3.1038269996643066, + "rewards/rejected": 2.3909835815429688, + "step": 7126 + }, + { + "epoch": 1.16, + "learning_rate": 3.9807618953109225e-06, + "logits/chosen": -0.7907062768936157, + "logits/rejected": -0.765535831451416, + "logps/chosen": -51.5130615234375, + "logps/rejected": -47.268577575683594, + "loss": 0.4162, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.883202314376831, + "rewards/margins": -0.22263646125793457, + "rewards/rejected": 3.1058387756347656, + "step": 7127 + }, + { + "epoch": 1.16, + "learning_rate": 3.979475272458757e-06, + "logits/chosen": -1.1316227912902832, + "logits/rejected": -1.1528674364089966, + "logps/chosen": -76.03659057617188, + "logps/rejected": -46.345298767089844, + "loss": 0.5174, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5745537281036377, + "rewards/margins": 1.1817824840545654, + "rewards/rejected": 2.3927712440490723, + "step": 7128 + }, + { + "epoch": 1.16, + "learning_rate": 3.978188720115194e-06, + "logits/chosen": -1.1280522346496582, + "logits/rejected": -1.2536120414733887, + "logps/chosen": -109.41343688964844, + "logps/rejected": -100.67835235595703, + "loss": 2.5676, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2783737182617188, + "rewards/margins": -4.863826751708984, + "rewards/rejected": 7.142200469970703, + "step": 7129 + }, + { + "epoch": 1.16, + "learning_rate": 3.976902238369121e-06, + "logits/chosen": -1.5000957250595093, + "logits/rejected": -1.5476850271224976, + "logps/chosen": -68.41326904296875, + "logps/rejected": -75.71951293945312, + "loss": 1.1728, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.27734375, + "rewards/margins": -2.241424560546875, + "rewards/rejected": 8.518768310546875, + "step": 7130 + }, + { + "epoch": 1.16, + "learning_rate": 3.975615827309423e-06, + "logits/chosen": -1.1300441026687622, + "logits/rejected": -1.0879122018814087, + "logps/chosen": -23.333402633666992, + "logps/rejected": -15.208878517150879, + "loss": 0.2705, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.975197434425354, + "rewards/margins": 1.7047098875045776, + "rewards/rejected": 0.270487517118454, + "step": 7131 + }, + { + "epoch": 1.16, + "learning_rate": 3.974329487024979e-06, + "logits/chosen": -1.2287368774414062, + "logits/rejected": -1.208005428314209, + "logps/chosen": -96.77117919921875, + "logps/rejected": -62.04895782470703, + "loss": 0.3506, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.489111304283142, + "rewards/margins": 0.032372236251831055, + "rewards/rejected": 1.456739068031311, + "step": 7132 + }, + { + "epoch": 1.16, + "learning_rate": 3.973043217604662e-06, + "logits/chosen": -1.5547447204589844, + "logits/rejected": -1.4558351039886475, + "logps/chosen": -139.3519287109375, + "logps/rejected": -17.903549194335938, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.108299255371094, + "rewards/margins": 8.170549392700195, + "rewards/rejected": 0.9377496838569641, + "step": 7133 + }, + { + "epoch": 1.16, + "learning_rate": 3.971757019137342e-06, + "logits/chosen": -1.3875110149383545, + "logits/rejected": -1.5097941160202026, + "logps/chosen": -67.38775634765625, + "logps/rejected": -92.0103759765625, + "loss": 3.8455, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0168893337249756, + "rewards/margins": -6.243788719177246, + "rewards/rejected": 8.2606782913208, + "step": 7134 + }, + { + "epoch": 1.16, + "learning_rate": 3.970470891711882e-06, + "logits/chosen": -1.2767384052276611, + "logits/rejected": -1.2892202138900757, + "logps/chosen": -102.37406921386719, + "logps/rejected": -48.10430908203125, + "loss": 0.3631, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6636033058166504, + "rewards/margins": 1.482452392578125, + "rewards/rejected": 2.1811509132385254, + "step": 7135 + }, + { + "epoch": 1.16, + "learning_rate": 3.969184835417143e-06, + "logits/chosen": -1.2549841403961182, + "logits/rejected": -1.2549841403961182, + "logps/chosen": -6.047453880310059, + "logps/rejected": -6.047453880310059, + "loss": 0.8781, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.698776364326477, + "rewards/margins": 0.0, + "rewards/rejected": 0.698776364326477, + "step": 7136 + }, + { + "epoch": 1.16, + "learning_rate": 3.967898850341977e-06, + "logits/chosen": -0.9209522008895874, + "logits/rejected": -0.9079843759536743, + "logps/chosen": -85.56149291992188, + "logps/rejected": -41.57201385498047, + "loss": 1.5322, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8217926025390625, + "rewards/margins": -1.5369727611541748, + "rewards/rejected": 2.3587653636932373, + "step": 7137 + }, + { + "epoch": 1.16, + "learning_rate": 3.966612936575235e-06, + "logits/chosen": -1.7718334197998047, + "logits/rejected": -1.8082865476608276, + "logps/chosen": -59.20509719848633, + "logps/rejected": -134.2987060546875, + "loss": 2.3331, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.637861967086792, + "rewards/margins": -4.451931953430176, + "rewards/rejected": 7.089793682098389, + "step": 7138 + }, + { + "epoch": 1.16, + "learning_rate": 3.9653270942057614e-06, + "logits/chosen": -1.3558932542800903, + "logits/rejected": -1.3268166780471802, + "logps/chosen": -122.49417877197266, + "logps/rejected": -132.70538330078125, + "loss": 0.5025, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.752449989318848, + "rewards/margins": -0.5402350425720215, + "rewards/rejected": 6.292685031890869, + "step": 7139 + }, + { + "epoch": 1.16, + "learning_rate": 3.964041323322395e-06, + "logits/chosen": -1.2299528121948242, + "logits/rejected": -1.1640394926071167, + "logps/chosen": -73.57350158691406, + "logps/rejected": -57.84211349487305, + "loss": 0.3161, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1030349731445312, + "rewards/margins": 1.1802631616592407, + "rewards/rejected": 1.9227718114852905, + "step": 7140 + }, + { + "epoch": 1.16, + "learning_rate": 3.962755624013971e-06, + "logits/chosen": -1.5351295471191406, + "logits/rejected": -1.4632076025009155, + "logps/chosen": -117.70758056640625, + "logps/rejected": -58.05051803588867, + "loss": 0.0612, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.9405717849731445, + "rewards/margins": 2.488070487976074, + "rewards/rejected": 2.4525012969970703, + "step": 7141 + }, + { + "epoch": 1.16, + "learning_rate": 3.961469996369319e-06, + "logits/chosen": -1.345785140991211, + "logits/rejected": -1.2258630990982056, + "logps/chosen": -76.41424560546875, + "logps/rejected": -26.114883422851562, + "loss": 0.1624, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6030609607696533, + "rewards/margins": 1.1560550928115845, + "rewards/rejected": 1.4470058679580688, + "step": 7142 + }, + { + "epoch": 1.16, + "learning_rate": 3.960184440477264e-06, + "logits/chosen": -1.143021821975708, + "logits/rejected": -1.1501736640930176, + "logps/chosen": -11.694355010986328, + "logps/rejected": -12.995654106140137, + "loss": 0.3501, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3203243017196655, + "rewards/margins": -0.0036317110061645508, + "rewards/rejected": 1.32395601272583, + "step": 7143 + }, + { + "epoch": 1.16, + "learning_rate": 3.958898956426625e-06, + "logits/chosen": -1.4840242862701416, + "logits/rejected": -1.431234359741211, + "logps/chosen": -123.693603515625, + "logps/rejected": -85.53816986083984, + "loss": 0.5775, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.440600872039795, + "rewards/margins": 0.5324883460998535, + "rewards/rejected": 5.908112525939941, + "step": 7144 + }, + { + "epoch": 1.16, + "learning_rate": 3.957613544306216e-06, + "logits/chosen": -1.1000958681106567, + "logits/rejected": -1.142727017402649, + "logps/chosen": -166.38174438476562, + "logps/rejected": -91.84444427490234, + "loss": 0.1237, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.158471584320068, + "rewards/margins": 1.4277820587158203, + "rewards/rejected": 4.730689525604248, + "step": 7145 + }, + { + "epoch": 1.16, + "learning_rate": 3.95632820420485e-06, + "logits/chosen": -1.3214856386184692, + "logits/rejected": -1.298346757888794, + "logps/chosen": -73.09693145751953, + "logps/rejected": -75.4781723022461, + "loss": 0.3089, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0021705627441406, + "rewards/margins": 0.19184422492980957, + "rewards/rejected": 2.810326337814331, + "step": 7146 + }, + { + "epoch": 1.16, + "learning_rate": 3.955042936211329e-06, + "logits/chosen": -1.3804819583892822, + "logits/rejected": -1.182727575302124, + "logps/chosen": -130.65647888183594, + "logps/rejected": -34.67809295654297, + "loss": 0.0192, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.376884460449219, + "rewards/margins": 5.251582145690918, + "rewards/rejected": 1.1253021955490112, + "step": 7147 + }, + { + "epoch": 1.16, + "learning_rate": 3.953757740414453e-06, + "logits/chosen": -1.118237853050232, + "logits/rejected": -1.285278081893921, + "logps/chosen": -27.421579360961914, + "logps/rejected": -105.14541625976562, + "loss": 5.2255, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5905885696411133, + "rewards/margins": -6.5289106369018555, + "rewards/rejected": 9.119499206542969, + "step": 7148 + }, + { + "epoch": 1.16, + "learning_rate": 3.952472616903018e-06, + "logits/chosen": -0.9057798385620117, + "logits/rejected": -0.9421851634979248, + "logps/chosen": -48.56904602050781, + "logps/rejected": -53.21736145019531, + "loss": 0.703, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3727073669433594, + "rewards/margins": -0.8188972473144531, + "rewards/rejected": 1.1916046142578125, + "step": 7149 + }, + { + "epoch": 1.16, + "learning_rate": 3.9511875657658136e-06, + "logits/chosen": -1.111660122871399, + "logits/rejected": -1.111660122871399, + "logps/chosen": -40.90580749511719, + "logps/rejected": -40.90580749511719, + "loss": 0.4472, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3175991773605347, + "rewards/margins": 0.0, + "rewards/rejected": 1.3175991773605347, + "step": 7150 + }, + { + "epoch": 1.16, + "learning_rate": 3.949902587091624e-06, + "logits/chosen": -1.2085638046264648, + "logits/rejected": -1.2117271423339844, + "logps/chosen": -193.84649658203125, + "logps/rejected": -96.9210433959961, + "loss": 0.0575, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.497386455535889, + "rewards/margins": 3.699159622192383, + "rewards/rejected": 0.7982269525527954, + "step": 7151 + }, + { + "epoch": 1.16, + "learning_rate": 3.94861768096923e-06, + "logits/chosen": -1.350705862045288, + "logits/rejected": -1.2186243534088135, + "logps/chosen": -84.86085510253906, + "logps/rejected": -53.29033279418945, + "loss": 0.7623, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.603381633758545, + "rewards/margins": 1.138838768005371, + "rewards/rejected": 4.464542865753174, + "step": 7152 + }, + { + "epoch": 1.16, + "learning_rate": 3.947332847487405e-06, + "logits/chosen": -1.2420328855514526, + "logits/rejected": -1.1118881702423096, + "logps/chosen": -47.12432098388672, + "logps/rejected": -24.984432220458984, + "loss": 0.5757, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.781454563140869, + "rewards/margins": 3.528463363647461, + "rewards/rejected": 0.25299111008644104, + "step": 7153 + }, + { + "epoch": 1.16, + "learning_rate": 3.946048086734921e-06, + "logits/chosen": -0.9812522530555725, + "logits/rejected": -0.9633811116218567, + "logps/chosen": -28.504119873046875, + "logps/rejected": -16.944133758544922, + "loss": 0.5161, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7657955288887024, + "rewards/margins": -0.36049944162368774, + "rewards/rejected": 1.1262949705123901, + "step": 7154 + }, + { + "epoch": 1.16, + "learning_rate": 3.9447633988005405e-06, + "logits/chosen": -1.0763767957687378, + "logits/rejected": -1.0732780694961548, + "logps/chosen": -184.15396118164062, + "logps/rejected": -101.10786437988281, + "loss": 0.5601, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.151593208312988, + "rewards/margins": -0.6780805587768555, + "rewards/rejected": 5.829673767089844, + "step": 7155 + }, + { + "epoch": 1.16, + "learning_rate": 3.9434787837730245e-06, + "logits/chosen": -1.4393450021743774, + "logits/rejected": -1.4369251728057861, + "logps/chosen": -44.19643020629883, + "logps/rejected": -117.65017700195312, + "loss": 0.5658, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.246417760848999, + "rewards/margins": 0.6557393074035645, + "rewards/rejected": 1.5906784534454346, + "step": 7156 + }, + { + "epoch": 1.16, + "learning_rate": 3.942194241741128e-06, + "logits/chosen": -1.0220237970352173, + "logits/rejected": -1.014042615890503, + "logps/chosen": -28.232547760009766, + "logps/rejected": -78.14813232421875, + "loss": 1.0927, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.128223180770874, + "rewards/margins": -0.8109569549560547, + "rewards/rejected": 2.9391801357269287, + "step": 7157 + }, + { + "epoch": 1.16, + "learning_rate": 3.9409097727936005e-06, + "logits/chosen": -1.1176303625106812, + "logits/rejected": -1.1240692138671875, + "logps/chosen": -81.64604187011719, + "logps/rejected": -30.81169891357422, + "loss": 0.6457, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2377190589904785, + "rewards/margins": -0.9667425155639648, + "rewards/rejected": 4.204461574554443, + "step": 7158 + }, + { + "epoch": 1.16, + "learning_rate": 3.939625377019186e-06, + "logits/chosen": -1.0683577060699463, + "logits/rejected": -1.1129448413848877, + "logps/chosen": -91.26202392578125, + "logps/rejected": -71.86769104003906, + "loss": 0.9349, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0716826915740967, + "rewards/margins": -0.42615747451782227, + "rewards/rejected": 2.497840166091919, + "step": 7159 + }, + { + "epoch": 1.16, + "learning_rate": 3.938341054506625e-06, + "logits/chosen": -1.6642498970031738, + "logits/rejected": -1.536776065826416, + "logps/chosen": -193.61474609375, + "logps/rejected": -120.48848724365234, + "loss": 0.1213, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.82464599609375, + "rewards/margins": 1.3080787658691406, + "rewards/rejected": 4.516567230224609, + "step": 7160 + }, + { + "epoch": 1.16, + "learning_rate": 3.937056805344652e-06, + "logits/chosen": -1.3978952169418335, + "logits/rejected": -1.3612720966339111, + "logps/chosen": -50.481082916259766, + "logps/rejected": -47.65974044799805, + "loss": 0.644, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.228667974472046, + "rewards/margins": -0.5649199485778809, + "rewards/rejected": 2.7935879230499268, + "step": 7161 + }, + { + "epoch": 1.16, + "learning_rate": 3.935772629621996e-06, + "logits/chosen": -1.2662895917892456, + "logits/rejected": -1.2110235691070557, + "logps/chosen": -46.27757263183594, + "logps/rejected": -92.8695068359375, + "loss": 1.4635, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2903099060058594, + "rewards/margins": -2.7904458045959473, + "rewards/rejected": 5.080755710601807, + "step": 7162 + }, + { + "epoch": 1.16, + "learning_rate": 3.93448852742738e-06, + "logits/chosen": -1.1024959087371826, + "logits/rejected": -1.115111231803894, + "logps/chosen": -34.415321350097656, + "logps/rejected": -32.855804443359375, + "loss": 2.1297, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7882964611053467, + "rewards/margins": 0.04007673263549805, + "rewards/rejected": 2.7482197284698486, + "step": 7163 + }, + { + "epoch": 1.16, + "learning_rate": 3.9332044988495265e-06, + "logits/chosen": -1.9541196823120117, + "logits/rejected": -1.913766860961914, + "logps/chosen": -119.85382080078125, + "logps/rejected": -95.14441680908203, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.338818550109863, + "rewards/margins": 3.4697656631469727, + "rewards/rejected": 3.8690528869628906, + "step": 7164 + }, + { + "epoch": 1.16, + "learning_rate": 3.9319205439771475e-06, + "logits/chosen": -1.5682579278945923, + "logits/rejected": -1.5694406032562256, + "logps/chosen": -126.97493743896484, + "logps/rejected": -37.95738220214844, + "loss": 0.259, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7417899966239929, + "rewards/margins": 0.6098175048828125, + "rewards/rejected": 0.1319725066423416, + "step": 7165 + }, + { + "epoch": 1.16, + "learning_rate": 3.930636662898952e-06, + "logits/chosen": -1.2465999126434326, + "logits/rejected": -1.2506108283996582, + "logps/chosen": -6.659455299377441, + "logps/rejected": -5.48356819152832, + "loss": 0.3942, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3752295672893524, + "rewards/margins": 0.27967649698257446, + "rewards/rejected": 0.09555306285619736, + "step": 7166 + }, + { + "epoch": 1.16, + "learning_rate": 3.9293528557036445e-06, + "logits/chosen": -1.3447424173355103, + "logits/rejected": -1.38816237449646, + "logps/chosen": -125.07660675048828, + "logps/rejected": -192.95932006835938, + "loss": 1.056, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.327553749084473, + "rewards/margins": -1.9415931701660156, + "rewards/rejected": 7.269146919250488, + "step": 7167 + }, + { + "epoch": 1.16, + "learning_rate": 3.928069122479925e-06, + "logits/chosen": -0.9403899312019348, + "logits/rejected": -0.8946789503097534, + "logps/chosen": -13.26844596862793, + "logps/rejected": -5.563178062438965, + "loss": 0.1758, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9293123483657837, + "rewards/margins": 0.8703398704528809, + "rewards/rejected": 1.0589724779129028, + "step": 7168 + }, + { + "epoch": 1.16, + "learning_rate": 3.926785463316487e-06, + "logits/chosen": -1.51662278175354, + "logits/rejected": -1.5281147956848145, + "logps/chosen": -99.70562744140625, + "logps/rejected": -130.80490112304688, + "loss": 0.7739, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4707412719726562, + "rewards/margins": -1.2301254272460938, + "rewards/rejected": 4.70086669921875, + "step": 7169 + }, + { + "epoch": 1.16, + "learning_rate": 3.925501878302017e-06, + "logits/chosen": -0.7137101888656616, + "logits/rejected": -0.7417188882827759, + "logps/chosen": -3.520777940750122, + "logps/rejected": -71.53192138671875, + "loss": 4.0084, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6011913418769836, + "rewards/margins": -0.06743431091308594, + "rewards/rejected": 0.6686256527900696, + "step": 7170 + }, + { + "epoch": 1.16, + "learning_rate": 3.924218367525202e-06, + "logits/chosen": -1.2149184942245483, + "logits/rejected": -1.325369119644165, + "logps/chosen": -55.583045959472656, + "logps/rejected": -86.3824691772461, + "loss": 1.7735, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2692711353302, + "rewards/margins": -2.450282335281372, + "rewards/rejected": 5.719553470611572, + "step": 7171 + }, + { + "epoch": 1.16, + "learning_rate": 3.9229349310747176e-06, + "logits/chosen": -1.3490959405899048, + "logits/rejected": -1.3673902750015259, + "logps/chosen": -67.89605712890625, + "logps/rejected": -65.75392150878906, + "loss": 0.3775, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4594292640686035, + "rewards/margins": 1.604060411453247, + "rewards/rejected": 1.8553688526153564, + "step": 7172 + }, + { + "epoch": 1.16, + "learning_rate": 3.921651569039238e-06, + "logits/chosen": -1.2496272325515747, + "logits/rejected": -1.053524136543274, + "logps/chosen": -120.506103515625, + "logps/rejected": -37.73994445800781, + "loss": 0.0525, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.7250518798828125, + "rewards/margins": 2.3376517295837402, + "rewards/rejected": 3.3874001502990723, + "step": 7173 + }, + { + "epoch": 1.16, + "learning_rate": 3.920368281507431e-06, + "logits/chosen": -0.6632067561149597, + "logits/rejected": -0.6055102348327637, + "logps/chosen": -74.98917388916016, + "logps/rejected": -18.520505905151367, + "loss": 0.0962, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5568435192108154, + "rewards/margins": 1.7623825073242188, + "rewards/rejected": 0.7944610714912415, + "step": 7174 + }, + { + "epoch": 1.16, + "learning_rate": 3.919085068567962e-06, + "logits/chosen": -1.4115474224090576, + "logits/rejected": -1.4207193851470947, + "logps/chosen": -218.42544555664062, + "logps/rejected": -111.391357421875, + "loss": 0.087, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.337253093719482, + "rewards/margins": 1.6687593460083008, + "rewards/rejected": 5.668493747711182, + "step": 7175 + }, + { + "epoch": 1.16, + "learning_rate": 3.917801930309486e-06, + "logits/chosen": -1.111709713935852, + "logits/rejected": -1.0147596597671509, + "logps/chosen": -55.43555450439453, + "logps/rejected": -46.30876159667969, + "loss": 0.2243, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3516502380371094, + "rewards/margins": 0.623497724533081, + "rewards/rejected": 2.7281525135040283, + "step": 7176 + }, + { + "epoch": 1.16, + "learning_rate": 3.916518866820657e-06, + "logits/chosen": -1.3770238161087036, + "logits/rejected": -1.4349114894866943, + "logps/chosen": -107.34672546386719, + "logps/rejected": -201.00399780273438, + "loss": 2.1845, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.111289978027344, + "rewards/margins": -3.115144729614258, + "rewards/rejected": 8.226434707641602, + "step": 7177 + }, + { + "epoch": 1.17, + "learning_rate": 3.915235878190123e-06, + "logits/chosen": -1.3711199760437012, + "logits/rejected": -1.262511134147644, + "logps/chosen": -58.68367004394531, + "logps/rejected": -18.402572631835938, + "loss": 0.2656, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.56529700756073, + "rewards/margins": 0.37299585342407227, + "rewards/rejected": 1.1923011541366577, + "step": 7178 + }, + { + "epoch": 1.17, + "learning_rate": 3.913952964506524e-06, + "logits/chosen": -0.8633996248245239, + "logits/rejected": -0.8703420758247375, + "logps/chosen": -4.328135967254639, + "logps/rejected": -2.1238880157470703, + "loss": 2.2524, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4732130467891693, + "rewards/margins": 0.12246516346931458, + "rewards/rejected": 0.35074788331985474, + "step": 7179 + }, + { + "epoch": 1.17, + "learning_rate": 3.912670125858501e-06, + "logits/chosen": -1.1461808681488037, + "logits/rejected": -1.1800017356872559, + "logps/chosen": -46.24815368652344, + "logps/rejected": -55.508270263671875, + "loss": 0.4848, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8097190856933594, + "rewards/margins": -0.4905967712402344, + "rewards/rejected": 2.3003158569335938, + "step": 7180 + }, + { + "epoch": 1.17, + "learning_rate": 3.911387362334682e-06, + "logits/chosen": -0.6977865099906921, + "logits/rejected": -0.6250051856040955, + "logps/chosen": -38.720760345458984, + "logps/rejected": -15.54631233215332, + "loss": 0.5885, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8940037488937378, + "rewards/margins": 0.6955320835113525, + "rewards/rejected": 1.1984716653823853, + "step": 7181 + }, + { + "epoch": 1.17, + "learning_rate": 3.9101046740236964e-06, + "logits/chosen": -0.9435229897499084, + "logits/rejected": -0.9677014946937561, + "logps/chosen": -14.488180160522461, + "logps/rejected": -48.63624572753906, + "loss": 1.0923, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.574614942073822, + "rewards/margins": -0.4254392981529236, + "rewards/rejected": 1.0000542402267456, + "step": 7182 + }, + { + "epoch": 1.17, + "learning_rate": 3.908822061014165e-06, + "logits/chosen": -1.2843964099884033, + "logits/rejected": -1.2319210767745972, + "logps/chosen": -90.36814880371094, + "logps/rejected": -11.897672653198242, + "loss": 0.3698, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8261856436729431, + "rewards/margins": 0.06806260347366333, + "rewards/rejected": 0.7581230401992798, + "step": 7183 + }, + { + "epoch": 1.17, + "learning_rate": 3.907539523394704e-06, + "logits/chosen": -1.2981526851654053, + "logits/rejected": -1.3495301008224487, + "logps/chosen": -90.48226165771484, + "logps/rejected": -90.39024353027344, + "loss": 0.7992, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.695815324783325, + "rewards/margins": -0.37015604972839355, + "rewards/rejected": 3.0659713745117188, + "step": 7184 + }, + { + "epoch": 1.17, + "learning_rate": 3.906257061253926e-06, + "logits/chosen": -1.1494367122650146, + "logits/rejected": -1.1425435543060303, + "logps/chosen": -46.480552673339844, + "logps/rejected": -88.02469635009766, + "loss": 0.4052, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4505622386932373, + "rewards/margins": 0.3751387596130371, + "rewards/rejected": 3.0754234790802, + "step": 7185 + }, + { + "epoch": 1.17, + "learning_rate": 3.904974674680436e-06, + "logits/chosen": -1.108008861541748, + "logits/rejected": -1.0989015102386475, + "logps/chosen": -36.421722412109375, + "logps/rejected": -51.17474365234375, + "loss": 1.0044, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.955251455307007, + "rewards/margins": -1.0960776805877686, + "rewards/rejected": 4.051329135894775, + "step": 7186 + }, + { + "epoch": 1.17, + "learning_rate": 3.9036923637628336e-06, + "logits/chosen": -1.1513221263885498, + "logits/rejected": -1.084860920906067, + "logps/chosen": -41.58733367919922, + "logps/rejected": -34.65043640136719, + "loss": 0.2616, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.794052839279175, + "rewards/margins": 0.7733030319213867, + "rewards/rejected": 2.020749807357788, + "step": 7187 + }, + { + "epoch": 1.17, + "learning_rate": 3.902410128589717e-06, + "logits/chosen": -1.0269522666931152, + "logits/rejected": -1.0159876346588135, + "logps/chosen": -68.16545867919922, + "logps/rejected": -25.27764892578125, + "loss": 1.3701, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.753912329673767, + "rewards/margins": -0.14762043952941895, + "rewards/rejected": 1.901532769203186, + "step": 7188 + }, + { + "epoch": 1.17, + "learning_rate": 3.901127969249674e-06, + "logits/chosen": -1.2438337802886963, + "logits/rejected": -1.1182578802108765, + "logps/chosen": -94.79336547851562, + "logps/rejected": -80.28223419189453, + "loss": 0.2761, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.856534004211426, + "rewards/margins": 1.497678279876709, + "rewards/rejected": 4.358855724334717, + "step": 7189 + }, + { + "epoch": 1.17, + "learning_rate": 3.899845885831291e-06, + "logits/chosen": -1.4278074502944946, + "logits/rejected": -1.442115306854248, + "logps/chosen": -71.3567886352539, + "logps/rejected": -48.438480377197266, + "loss": 1.0834, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.008965253829956, + "rewards/margins": -2.0079128742218018, + "rewards/rejected": 5.016878128051758, + "step": 7190 + }, + { + "epoch": 1.17, + "learning_rate": 3.898563878423147e-06, + "logits/chosen": -1.332271933555603, + "logits/rejected": -1.3389276266098022, + "logps/chosen": -78.00323486328125, + "logps/rejected": -95.44122314453125, + "loss": 0.2594, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1316848993301392, + "rewards/margins": 0.5216881036758423, + "rewards/rejected": 0.6099967956542969, + "step": 7191 + }, + { + "epoch": 1.17, + "learning_rate": 3.897281947113817e-06, + "logits/chosen": -1.2294511795043945, + "logits/rejected": -1.2703226804733276, + "logps/chosen": -59.235984802246094, + "logps/rejected": -98.30465698242188, + "loss": 0.3654, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6035735607147217, + "rewards/margins": 0.040030479431152344, + "rewards/rejected": 2.5635430812835693, + "step": 7192 + }, + { + "epoch": 1.17, + "learning_rate": 3.896000091991871e-06, + "logits/chosen": -1.5032967329025269, + "logits/rejected": -1.4744057655334473, + "logps/chosen": -51.97419738769531, + "logps/rejected": -23.963754653930664, + "loss": 0.9722, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4155189990997314, + "rewards/margins": 1.056464433670044, + "rewards/rejected": 0.3590545654296875, + "step": 7193 + }, + { + "epoch": 1.17, + "learning_rate": 3.894718313145873e-06, + "logits/chosen": -1.1470348834991455, + "logits/rejected": -1.1409715414047241, + "logps/chosen": -25.45299530029297, + "logps/rejected": -55.626983642578125, + "loss": 1.5601, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4272480010986328, + "rewards/margins": -2.3345088958740234, + "rewards/rejected": 3.7617568969726562, + "step": 7194 + }, + { + "epoch": 1.17, + "learning_rate": 3.893436610664381e-06, + "logits/chosen": -1.193219780921936, + "logits/rejected": -1.219408392906189, + "logps/chosen": -114.13359069824219, + "logps/rejected": -108.85572814941406, + "loss": 3.4034, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7363266944885254, + "rewards/margins": 0.5069413185119629, + "rewards/rejected": 2.2293853759765625, + "step": 7195 + }, + { + "epoch": 1.17, + "learning_rate": 3.892154984635948e-06, + "logits/chosen": -1.1158931255340576, + "logits/rejected": -0.9962210059165955, + "logps/chosen": -75.9832992553711, + "logps/rejected": -32.5678596496582, + "loss": 0.4927, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.814971923828125, + "rewards/margins": 2.1140263080596924, + "rewards/rejected": 0.7009456753730774, + "step": 7196 + }, + { + "epoch": 1.17, + "learning_rate": 3.890873435149124e-06, + "logits/chosen": -1.31619393825531, + "logits/rejected": -1.3662232160568237, + "logps/chosen": -104.23169708251953, + "logps/rejected": -184.13748168945312, + "loss": 0.1286, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.261455535888672, + "rewards/margins": 1.8072714805603027, + "rewards/rejected": 5.454184055328369, + "step": 7197 + }, + { + "epoch": 1.17, + "learning_rate": 3.88959196229245e-06, + "logits/chosen": -1.3697313070297241, + "logits/rejected": -1.3028361797332764, + "logps/chosen": -99.17716217041016, + "logps/rejected": -52.78028106689453, + "loss": 0.1102, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9898674488067627, + "rewards/margins": 1.9515602588653564, + "rewards/rejected": 2.0383071899414062, + "step": 7198 + }, + { + "epoch": 1.17, + "learning_rate": 3.888310566154465e-06, + "logits/chosen": -0.7423825860023499, + "logits/rejected": -0.6542119383811951, + "logps/chosen": -35.60311508178711, + "logps/rejected": -13.013278007507324, + "loss": 0.111, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1502177715301514, + "rewards/margins": 1.5050082206726074, + "rewards/rejected": 0.6452096104621887, + "step": 7199 + }, + { + "epoch": 1.17, + "learning_rate": 3.887029246823701e-06, + "logits/chosen": -1.375433087348938, + "logits/rejected": -1.005733609199524, + "logps/chosen": -117.80728149414062, + "logps/rejected": -11.052367210388184, + "loss": 0.2472, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.824864387512207, + "rewards/margins": 4.026032447814941, + "rewards/rejected": 0.7988320589065552, + "step": 7200 + }, + { + "epoch": 1.17, + "learning_rate": 3.885748004388686e-06, + "logits/chosen": -1.082412600517273, + "logits/rejected": -1.0108712911605835, + "logps/chosen": -48.52383041381836, + "logps/rejected": -33.74864196777344, + "loss": 0.4682, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1974003314971924, + "rewards/margins": 0.6662994623184204, + "rewards/rejected": 1.531100869178772, + "step": 7201 + }, + { + "epoch": 1.17, + "learning_rate": 3.88446683893794e-06, + "logits/chosen": -1.1269564628601074, + "logits/rejected": -1.1269564628601074, + "logps/chosen": -50.321075439453125, + "logps/rejected": -50.321075439453125, + "loss": 0.3469, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7816871404647827, + "rewards/margins": 0.0, + "rewards/rejected": 1.7816871404647827, + "step": 7202 + }, + { + "epoch": 1.17, + "learning_rate": 3.883185750559978e-06, + "logits/chosen": -0.8810255527496338, + "logits/rejected": -0.8299197554588318, + "logps/chosen": -96.63143920898438, + "logps/rejected": -52.059417724609375, + "loss": 0.1461, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.432232618331909, + "rewards/margins": 1.758825659751892, + "rewards/rejected": 1.673406958580017, + "step": 7203 + }, + { + "epoch": 1.17, + "learning_rate": 3.881904739343316e-06, + "logits/chosen": -1.3091061115264893, + "logits/rejected": -1.1896058320999146, + "logps/chosen": -97.53889465332031, + "logps/rejected": -61.127174377441406, + "loss": 0.0388, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.69512939453125, + "rewards/margins": 2.8766348361968994, + "rewards/rejected": 2.8184945583343506, + "step": 7204 + }, + { + "epoch": 1.17, + "learning_rate": 3.880623805376456e-06, + "logits/chosen": -0.9505972862243652, + "logits/rejected": -0.9094107151031494, + "logps/chosen": -94.69522094726562, + "logps/rejected": -45.01690673828125, + "loss": 0.1359, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5844390392303467, + "rewards/margins": 1.168015956878662, + "rewards/rejected": 2.4164230823516846, + "step": 7205 + }, + { + "epoch": 1.17, + "learning_rate": 3.8793429487479e-06, + "logits/chosen": -1.1743991374969482, + "logits/rejected": -1.2382417917251587, + "logps/chosen": -130.88775634765625, + "logps/rejected": -124.9908676147461, + "loss": 0.1846, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.772090196609497, + "rewards/margins": 1.467374324798584, + "rewards/rejected": 2.304715871810913, + "step": 7206 + }, + { + "epoch": 1.17, + "learning_rate": 3.878062169546142e-06, + "logits/chosen": -1.2831826210021973, + "logits/rejected": -1.2664990425109863, + "logps/chosen": -71.8012466430664, + "logps/rejected": -85.47590637207031, + "loss": 0.3195, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.460376024246216, + "rewards/margins": 0.556060791015625, + "rewards/rejected": 2.904315233230591, + "step": 7207 + }, + { + "epoch": 1.17, + "learning_rate": 3.876781467859673e-06, + "logits/chosen": -1.3726862668991089, + "logits/rejected": -1.3640660047531128, + "logps/chosen": -232.391845703125, + "logps/rejected": -77.7927017211914, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.880365371704102, + "rewards/margins": 6.420917510986328, + "rewards/rejected": 2.4594476222991943, + "step": 7208 + }, + { + "epoch": 1.17, + "learning_rate": 3.875500843776976e-06, + "logits/chosen": -1.0834481716156006, + "logits/rejected": -1.112735629081726, + "logps/chosen": -44.5544319152832, + "logps/rejected": -51.64771270751953, + "loss": 0.6736, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2616984844207764, + "rewards/margins": -0.606348991394043, + "rewards/rejected": 2.8680474758148193, + "step": 7209 + }, + { + "epoch": 1.17, + "learning_rate": 3.87422029738653e-06, + "logits/chosen": -1.4634944200515747, + "logits/rejected": -1.2741258144378662, + "logps/chosen": -102.44490051269531, + "logps/rejected": -14.318710327148438, + "loss": 0.0551, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.371450901031494, + "rewards/margins": 3.5198171138763428, + "rewards/rejected": 0.8516338467597961, + "step": 7210 + }, + { + "epoch": 1.17, + "learning_rate": 3.87293982877681e-06, + "logits/chosen": -1.1080293655395508, + "logits/rejected": -1.1080293655395508, + "logps/chosen": -18.77715492248535, + "logps/rejected": -18.77715492248535, + "loss": 0.3588, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5017187595367432, + "rewards/margins": 0.0, + "rewards/rejected": 1.5017187595367432, + "step": 7211 + }, + { + "epoch": 1.17, + "learning_rate": 3.871659438036283e-06, + "logits/chosen": -1.2960461378097534, + "logits/rejected": -1.2960461378097534, + "logps/chosen": -67.46177673339844, + "logps/rejected": -67.46177673339844, + "loss": 0.3798, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.592530250549316, + "rewards/margins": 0.0, + "rewards/rejected": 4.592530250549316, + "step": 7212 + }, + { + "epoch": 1.17, + "learning_rate": 3.870379125253413e-06, + "logits/chosen": -1.3735525608062744, + "logits/rejected": -1.3711551427841187, + "logps/chosen": -42.80453872680664, + "logps/rejected": -60.68613052368164, + "loss": 0.1807, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.324674606323242, + "rewards/margins": 0.9091796875, + "rewards/rejected": 1.4154949188232422, + "step": 7213 + }, + { + "epoch": 1.17, + "learning_rate": 3.869098890516656e-06, + "logits/chosen": -1.4804222583770752, + "logits/rejected": -1.5572052001953125, + "logps/chosen": -42.81867218017578, + "logps/rejected": -86.4485092163086, + "loss": 0.7399, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.032717227935791, + "rewards/margins": -0.6907868385314941, + "rewards/rejected": 4.723504066467285, + "step": 7214 + }, + { + "epoch": 1.17, + "learning_rate": 3.867818733914467e-06, + "logits/chosen": -1.227222204208374, + "logits/rejected": -1.2653647661209106, + "logps/chosen": -46.82972717285156, + "logps/rejected": -28.788326263427734, + "loss": 0.2615, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.831329345703125, + "rewards/margins": 0.7483730316162109, + "rewards/rejected": 2.082956314086914, + "step": 7215 + }, + { + "epoch": 1.17, + "learning_rate": 3.866538655535288e-06, + "logits/chosen": -1.643523097038269, + "logits/rejected": -1.5458287000656128, + "logps/chosen": -107.86595153808594, + "logps/rejected": -132.6461944580078, + "loss": 2.0712, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.75935697555542, + "rewards/margins": -3.4495558738708496, + "rewards/rejected": 9.20891284942627, + "step": 7216 + }, + { + "epoch": 1.17, + "learning_rate": 3.8652586554675644e-06, + "logits/chosen": -1.3937216997146606, + "logits/rejected": -1.560204029083252, + "logps/chosen": -100.4632568359375, + "logps/rejected": -91.09725952148438, + "loss": 3.0425, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6597747802734375, + "rewards/margins": -6.075660705566406, + "rewards/rejected": 9.735435485839844, + "step": 7217 + }, + { + "epoch": 1.17, + "learning_rate": 3.86397873379973e-06, + "logits/chosen": -1.1141610145568848, + "logits/rejected": -1.1141610145568848, + "logps/chosen": -73.57447052001953, + "logps/rejected": -73.57447052001953, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.806874990463257, + "rewards/margins": 0.0, + "rewards/rejected": 3.806874990463257, + "step": 7218 + }, + { + "epoch": 1.17, + "learning_rate": 3.8626988906202165e-06, + "logits/chosen": -1.173551321029663, + "logits/rejected": -1.1765587329864502, + "logps/chosen": -28.579206466674805, + "logps/rejected": -29.52859878540039, + "loss": 0.5314, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0491151809692383, + "rewards/margins": -0.628185510635376, + "rewards/rejected": 1.6773006916046143, + "step": 7219 + }, + { + "epoch": 1.17, + "learning_rate": 3.861419126017449e-06, + "logits/chosen": -1.5225962400436401, + "logits/rejected": -1.496498942375183, + "logps/chosen": -72.3001480102539, + "logps/rejected": -66.84175109863281, + "loss": 0.1927, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.242544651031494, + "rewards/margins": 1.6326501369476318, + "rewards/rejected": 3.6098945140838623, + "step": 7220 + }, + { + "epoch": 1.17, + "learning_rate": 3.860139440079846e-06, + "logits/chosen": -0.9924204349517822, + "logits/rejected": -1.0780885219573975, + "logps/chosen": -193.66513061523438, + "logps/rejected": -133.94639587402344, + "loss": 0.062, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.721441745758057, + "rewards/margins": 2.1436479091644287, + "rewards/rejected": 3.577793836593628, + "step": 7221 + }, + { + "epoch": 1.17, + "learning_rate": 3.8588598328958225e-06, + "logits/chosen": -1.191864013671875, + "logits/rejected": -1.1576977968215942, + "logps/chosen": -22.30376625061035, + "logps/rejected": -8.705739974975586, + "loss": 0.5073, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0434061288833618, + "rewards/margins": 0.4982876777648926, + "rewards/rejected": 0.5451184511184692, + "step": 7222 + }, + { + "epoch": 1.17, + "learning_rate": 3.857580304553787e-06, + "logits/chosen": -1.0685166120529175, + "logits/rejected": -1.0721580982208252, + "logps/chosen": -113.4776840209961, + "logps/rejected": -103.80711364746094, + "loss": 0.3079, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.186502933502197, + "rewards/margins": 0.8327383995056152, + "rewards/rejected": 4.353764533996582, + "step": 7223 + }, + { + "epoch": 1.17, + "learning_rate": 3.8563008551421415e-06, + "logits/chosen": -0.9778034090995789, + "logits/rejected": -0.9564515948295593, + "logps/chosen": -50.45118713378906, + "logps/rejected": -39.389068603515625, + "loss": 0.8911, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9189742803573608, + "rewards/margins": 1.0182242393493652, + "rewards/rejected": 0.9007499814033508, + "step": 7224 + }, + { + "epoch": 1.17, + "learning_rate": 3.855021484749286e-06, + "logits/chosen": -1.2089108228683472, + "logits/rejected": -1.142262578010559, + "logps/chosen": -33.11795425415039, + "logps/rejected": -64.20686340332031, + "loss": 0.5139, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7158474922180176, + "rewards/margins": 0.9229137897491455, + "rewards/rejected": 1.792933702468872, + "step": 7225 + }, + { + "epoch": 1.17, + "learning_rate": 3.853742193463612e-06, + "logits/chosen": -1.483646035194397, + "logits/rejected": -1.217719554901123, + "logps/chosen": -128.00299072265625, + "logps/rejected": -34.47385025024414, + "loss": 0.0621, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.281383037567139, + "rewards/margins": 4.510850429534912, + "rewards/rejected": 0.7705326080322266, + "step": 7226 + }, + { + "epoch": 1.17, + "learning_rate": 3.852462981373506e-06, + "logits/chosen": -1.2704375982284546, + "logits/rejected": -1.1374818086624146, + "logps/chosen": -49.62367248535156, + "logps/rejected": -26.954580307006836, + "loss": 0.3693, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1273807287216187, + "rewards/margins": 0.16009730100631714, + "rewards/rejected": 0.9672834277153015, + "step": 7227 + }, + { + "epoch": 1.17, + "learning_rate": 3.851183848567351e-06, + "logits/chosen": -1.222165822982788, + "logits/rejected": -1.0351523160934448, + "logps/chosen": -73.58184814453125, + "logps/rejected": -27.265830993652344, + "loss": 1.3427, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.099086046218872, + "rewards/margins": 2.3162529468536377, + "rewards/rejected": 0.7828330993652344, + "step": 7228 + }, + { + "epoch": 1.17, + "learning_rate": 3.849904795133521e-06, + "logits/chosen": -1.1564841270446777, + "logits/rejected": -1.1553032398223877, + "logps/chosen": -1.4371120929718018, + "logps/rejected": -2.7462525367736816, + "loss": 0.7567, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3063318431377411, + "rewards/margins": 0.016899019479751587, + "rewards/rejected": 0.2894328236579895, + "step": 7229 + }, + { + "epoch": 1.17, + "learning_rate": 3.848625821160388e-06, + "logits/chosen": -1.3480623960494995, + "logits/rejected": -1.3600505590438843, + "logps/chosen": -56.65398406982422, + "logps/rejected": -63.51808547973633, + "loss": 0.4396, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6948035955429077, + "rewards/margins": -0.2847752571105957, + "rewards/rejected": 1.9795788526535034, + "step": 7230 + }, + { + "epoch": 1.17, + "learning_rate": 3.847346926736315e-06, + "logits/chosen": -1.2311992645263672, + "logits/rejected": -1.2394835948944092, + "logps/chosen": -44.40211486816406, + "logps/rejected": -35.87888717651367, + "loss": 0.6815, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.525730848312378, + "rewards/margins": -0.49027442932128906, + "rewards/rejected": 3.016005277633667, + "step": 7231 + }, + { + "epoch": 1.17, + "learning_rate": 3.846068111949665e-06, + "logits/chosen": -1.2794734239578247, + "logits/rejected": -1.2794734239578247, + "logps/chosen": -28.540931701660156, + "logps/rejected": -28.540931701660156, + "loss": 1.1798, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6383254528045654, + "rewards/margins": 0.0, + "rewards/rejected": 2.6383254528045654, + "step": 7232 + }, + { + "epoch": 1.17, + "learning_rate": 3.844789376888788e-06, + "logits/chosen": -1.1625933647155762, + "logits/rejected": -1.185702919960022, + "logps/chosen": -97.96456146240234, + "logps/rejected": -66.00732421875, + "loss": 0.35, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6327462196350098, + "rewards/margins": 0.24393081665039062, + "rewards/rejected": 3.388815402984619, + "step": 7233 + }, + { + "epoch": 1.17, + "learning_rate": 3.843510721642036e-06, + "logits/chosen": -1.2173672914505005, + "logits/rejected": -1.082838773727417, + "logps/chosen": -253.10427856445312, + "logps/rejected": -64.003662109375, + "loss": 0.0562, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.294088840484619, + "rewards/margins": 5.251219272613525, + "rewards/rejected": 1.0428695678710938, + "step": 7234 + }, + { + "epoch": 1.17, + "learning_rate": 3.842232146297749e-06, + "logits/chosen": -1.1861603260040283, + "logits/rejected": -1.1074750423431396, + "logps/chosen": -46.73265838623047, + "logps/rejected": -69.3238296508789, + "loss": 0.4626, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4196128845214844, + "rewards/margins": 0.05550980567932129, + "rewards/rejected": 2.364103078842163, + "step": 7235 + }, + { + "epoch": 1.17, + "learning_rate": 3.840953650944266e-06, + "logits/chosen": -1.2108795642852783, + "logits/rejected": -1.1951478719711304, + "logps/chosen": -86.35214233398438, + "logps/rejected": -66.59986114501953, + "loss": 0.1234, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.314573049545288, + "rewards/margins": 1.2767090797424316, + "rewards/rejected": 2.0378639698028564, + "step": 7236 + }, + { + "epoch": 1.17, + "learning_rate": 3.839675235669918e-06, + "logits/chosen": -0.9707356691360474, + "logits/rejected": -0.9803752899169922, + "logps/chosen": -20.04859161376953, + "logps/rejected": -51.56998825073242, + "loss": 0.5118, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9434181451797485, + "rewards/margins": 0.20882266759872437, + "rewards/rejected": 0.7345954775810242, + "step": 7237 + }, + { + "epoch": 1.17, + "learning_rate": 3.838396900563033e-06, + "logits/chosen": -1.1343634128570557, + "logits/rejected": -1.1391264200210571, + "logps/chosen": -50.370933532714844, + "logps/rejected": -85.10106658935547, + "loss": 0.7834, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1251046657562256, + "rewards/margins": -0.30425262451171875, + "rewards/rejected": 2.4293572902679443, + "step": 7238 + }, + { + "epoch": 1.17, + "learning_rate": 3.83711864571193e-06, + "logits/chosen": -1.1207321882247925, + "logits/rejected": -0.9856252074241638, + "logps/chosen": -252.654296875, + "logps/rejected": -79.09700775146484, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.691461086273193, + "rewards/margins": 4.906366348266602, + "rewards/rejected": 1.7850944995880127, + "step": 7239 + }, + { + "epoch": 1.18, + "learning_rate": 3.835840471204927e-06, + "logits/chosen": -1.2782871723175049, + "logits/rejected": -1.2782871723175049, + "logps/chosen": -47.88557434082031, + "logps/rejected": -47.88557434082031, + "loss": 0.4771, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.926284313201904, + "rewards/margins": 0.0, + "rewards/rejected": 4.926284313201904, + "step": 7240 + }, + { + "epoch": 1.18, + "learning_rate": 3.834562377130331e-06, + "logits/chosen": -1.1765691041946411, + "logits/rejected": -1.2426559925079346, + "logps/chosen": -52.98260498046875, + "logps/rejected": -53.724449157714844, + "loss": 0.4272, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4429908990859985, + "rewards/margins": -0.2804253101348877, + "rewards/rejected": 1.7234162092208862, + "step": 7241 + }, + { + "epoch": 1.18, + "learning_rate": 3.833284363576447e-06, + "logits/chosen": -0.9586518406867981, + "logits/rejected": -1.0128952264785767, + "logps/chosen": -54.93193817138672, + "logps/rejected": -100.05876159667969, + "loss": 0.3105, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.218416690826416, + "rewards/margins": 0.3986259698867798, + "rewards/rejected": 1.8197907209396362, + "step": 7242 + }, + { + "epoch": 1.18, + "learning_rate": 3.8320064306315754e-06, + "logits/chosen": -1.4281494617462158, + "logits/rejected": -1.3950954675674438, + "logps/chosen": -70.61749267578125, + "logps/rejected": -86.80638122558594, + "loss": 0.8932, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.50003981590271, + "rewards/margins": -1.3679404258728027, + "rewards/rejected": 3.8679802417755127, + "step": 7243 + }, + { + "epoch": 1.18, + "learning_rate": 3.830728578384006e-06, + "logits/chosen": -1.0670030117034912, + "logits/rejected": -1.0785638093948364, + "logps/chosen": -91.60533142089844, + "logps/rejected": -91.35223388671875, + "loss": 0.4498, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.284273624420166, + "rewards/margins": 0.18323898315429688, + "rewards/rejected": 2.101034641265869, + "step": 7244 + }, + { + "epoch": 1.18, + "learning_rate": 3.829450806922029e-06, + "logits/chosen": -0.9376124143600464, + "logits/rejected": -0.9385285377502441, + "logps/chosen": -2.0793285369873047, + "logps/rejected": -4.419944763183594, + "loss": 1.9205, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.22671322524547577, + "rewards/margins": -0.14445482194423676, + "rewards/rejected": 0.3711680471897125, + "step": 7245 + }, + { + "epoch": 1.18, + "learning_rate": 3.828173116333925e-06, + "logits/chosen": -1.463486909866333, + "logits/rejected": -1.3856384754180908, + "logps/chosen": -155.18763732910156, + "logps/rejected": -13.813592910766602, + "loss": 0.0231, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.867398262023926, + "rewards/margins": 8.13446044921875, + "rewards/rejected": 0.7329378128051758, + "step": 7246 + }, + { + "epoch": 1.18, + "learning_rate": 3.8268955067079715e-06, + "logits/chosen": -1.1371101140975952, + "logits/rejected": -1.1758586168289185, + "logps/chosen": -110.73628234863281, + "logps/rejected": -71.75112915039062, + "loss": 1.806, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5295486450195312, + "rewards/margins": -2.863372802734375, + "rewards/rejected": 6.392921447753906, + "step": 7247 + }, + { + "epoch": 1.18, + "learning_rate": 3.825617978132438e-06, + "logits/chosen": -1.4878530502319336, + "logits/rejected": -1.5254861116409302, + "logps/chosen": -39.757545471191406, + "logps/rejected": -102.56344604492188, + "loss": 0.9112, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1841514110565186, + "rewards/margins": -1.6299233436584473, + "rewards/rejected": 3.814074754714966, + "step": 7248 + }, + { + "epoch": 1.18, + "learning_rate": 3.8243405306955905e-06, + "logits/chosen": -1.3796356916427612, + "logits/rejected": -1.4701478481292725, + "logps/chosen": -129.42616271972656, + "logps/rejected": -65.42274475097656, + "loss": 0.4618, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.283285617828369, + "rewards/margins": 1.4798498153686523, + "rewards/rejected": 4.803435802459717, + "step": 7249 + }, + { + "epoch": 1.18, + "learning_rate": 3.8230631644856875e-06, + "logits/chosen": -1.159704327583313, + "logits/rejected": -1.1798875331878662, + "logps/chosen": -73.02682495117188, + "logps/rejected": -69.53868103027344, + "loss": 2.1177, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6523345708847046, + "rewards/margins": -0.08483123779296875, + "rewards/rejected": 1.7371658086776733, + "step": 7250 + }, + { + "epoch": 1.18, + "learning_rate": 3.821785879590984e-06, + "logits/chosen": -1.4197770357131958, + "logits/rejected": -1.1673418283462524, + "logps/chosen": -144.68243408203125, + "logps/rejected": -36.933685302734375, + "loss": 0.4086, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.1289520263671875, + "rewards/margins": 5.148024082183838, + "rewards/rejected": 0.9809280633926392, + "step": 7251 + }, + { + "epoch": 1.18, + "learning_rate": 3.820508676099728e-06, + "logits/chosen": -0.7426106929779053, + "logits/rejected": -0.7869681715965271, + "logps/chosen": -6.844715118408203, + "logps/rejected": -54.88011169433594, + "loss": 0.7267, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.389624685049057, + "rewards/margins": -0.3996518552303314, + "rewards/rejected": 0.7892765402793884, + "step": 7252 + }, + { + "epoch": 1.18, + "learning_rate": 3.819231554100162e-06, + "logits/chosen": -1.4397133588790894, + "logits/rejected": -1.3626139163970947, + "logps/chosen": -30.637054443359375, + "logps/rejected": -48.35053253173828, + "loss": 0.2387, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.864820957183838, + "rewards/margins": 1.3489174842834473, + "rewards/rejected": 1.5159034729003906, + "step": 7253 + }, + { + "epoch": 1.18, + "learning_rate": 3.817954513680524e-06, + "logits/chosen": -1.0175302028656006, + "logits/rejected": -0.9675103425979614, + "logps/chosen": -64.02598571777344, + "logps/rejected": -54.88439178466797, + "loss": 1.3394, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8401397466659546, + "rewards/margins": -0.06703269481658936, + "rewards/rejected": 1.907172441482544, + "step": 7254 + }, + { + "epoch": 1.18, + "learning_rate": 3.8166775549290434e-06, + "logits/chosen": -1.438762903213501, + "logits/rejected": -1.4479365348815918, + "logps/chosen": -132.1555633544922, + "logps/rejected": -97.96078491210938, + "loss": 0.5549, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.598884582519531, + "rewards/margins": -0.6988935470581055, + "rewards/rejected": 8.297778129577637, + "step": 7255 + }, + { + "epoch": 1.18, + "learning_rate": 3.815400677933948e-06, + "logits/chosen": -1.3216139078140259, + "logits/rejected": -1.19892156124115, + "logps/chosen": -115.47593688964844, + "logps/rejected": -79.89108276367188, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.438061714172363, + "rewards/margins": 3.5592050552368164, + "rewards/rejected": 3.878856658935547, + "step": 7256 + }, + { + "epoch": 1.18, + "learning_rate": 3.8141238827834556e-06, + "logits/chosen": -1.286422610282898, + "logits/rejected": -1.247070550918579, + "logps/chosen": -67.27397918701172, + "logps/rejected": -34.436363220214844, + "loss": 0.1994, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.891860246658325, + "rewards/margins": 0.754380464553833, + "rewards/rejected": 2.137479782104492, + "step": 7257 + }, + { + "epoch": 1.18, + "learning_rate": 3.812847169565782e-06, + "logits/chosen": -1.4221540689468384, + "logits/rejected": -1.4477425813674927, + "logps/chosen": -111.64839172363281, + "logps/rejected": -165.75161743164062, + "loss": 0.2246, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.988429546356201, + "rewards/margins": 0.6082992553710938, + "rewards/rejected": 7.380130290985107, + "step": 7258 + }, + { + "epoch": 1.18, + "learning_rate": 3.8115705383691354e-06, + "logits/chosen": -1.3396199941635132, + "logits/rejected": -1.3277705907821655, + "logps/chosen": -49.37703323364258, + "logps/rejected": -106.24032592773438, + "loss": 1.4921, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.997018337249756, + "rewards/margins": -2.868884563446045, + "rewards/rejected": 7.865902900695801, + "step": 7259 + }, + { + "epoch": 1.18, + "learning_rate": 3.8102939892817205e-06, + "logits/chosen": -0.8788261413574219, + "logits/rejected": -0.8741511106491089, + "logps/chosen": -0.6354221105575562, + "logps/rejected": -27.796142578125, + "loss": 1.08, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3287917673587799, + "rewards/margins": -0.010753482580184937, + "rewards/rejected": 0.33954524993896484, + "step": 7260 + }, + { + "epoch": 1.18, + "learning_rate": 3.8090175223917342e-06, + "logits/chosen": -1.31895112991333, + "logits/rejected": -1.2712910175323486, + "logps/chosen": -68.48075866699219, + "logps/rejected": -61.89845275878906, + "loss": 0.3848, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.7847726345062256, + "rewards/margins": -0.11630773544311523, + "rewards/rejected": 3.901080369949341, + "step": 7261 + }, + { + "epoch": 1.18, + "learning_rate": 3.8077411377873675e-06, + "logits/chosen": -1.1142207384109497, + "logits/rejected": -1.2570844888687134, + "logps/chosen": -79.20032501220703, + "logps/rejected": -154.63995361328125, + "loss": 1.1486, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5689475536346436, + "rewards/margins": -1.9702799320220947, + "rewards/rejected": 4.539227485656738, + "step": 7262 + }, + { + "epoch": 1.18, + "learning_rate": 3.8064648355568068e-06, + "logits/chosen": -1.2025845050811768, + "logits/rejected": -1.223085880279541, + "logps/chosen": -48.02259063720703, + "logps/rejected": -45.80888366699219, + "loss": 0.4029, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.181929111480713, + "rewards/margins": 0.5238300561904907, + "rewards/rejected": 1.6580990552902222, + "step": 7263 + }, + { + "epoch": 1.18, + "learning_rate": 3.8051886157882335e-06, + "logits/chosen": -1.376084566116333, + "logits/rejected": -1.3257262706756592, + "logps/chosen": -79.93833923339844, + "logps/rejected": -73.3556137084961, + "loss": 0.0534, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.760931491851807, + "rewards/margins": 3.748018741607666, + "rewards/rejected": 3.0129127502441406, + "step": 7264 + }, + { + "epoch": 1.18, + "learning_rate": 3.8039124785698205e-06, + "logits/chosen": -1.1009891033172607, + "logits/rejected": -1.11002516746521, + "logps/chosen": -53.803592681884766, + "logps/rejected": -32.49773025512695, + "loss": 2.3498, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.788269519805908, + "rewards/margins": 0.6222496032714844, + "rewards/rejected": 2.166019916534424, + "step": 7265 + }, + { + "epoch": 1.18, + "learning_rate": 3.802636423989738e-06, + "logits/chosen": -1.2249677181243896, + "logits/rejected": -1.1569510698318481, + "logps/chosen": -61.722198486328125, + "logps/rejected": -37.62751388549805, + "loss": 0.8994, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8573014736175537, + "rewards/margins": 0.3911018371582031, + "rewards/rejected": 2.4661996364593506, + "step": 7266 + }, + { + "epoch": 1.18, + "learning_rate": 3.801360452136149e-06, + "logits/chosen": -1.4076650142669678, + "logits/rejected": -1.3113913536071777, + "logps/chosen": -114.7465591430664, + "logps/rejected": -206.18115234375, + "loss": 1.9619, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3190057277679443, + "rewards/margins": -3.846851110458374, + "rewards/rejected": 7.165856838226318, + "step": 7267 + }, + { + "epoch": 1.18, + "learning_rate": 3.8000845630972117e-06, + "logits/chosen": -1.3423576354980469, + "logits/rejected": -1.313027024269104, + "logps/chosen": -70.29722595214844, + "logps/rejected": -35.946388244628906, + "loss": 0.3921, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.437772512435913, + "rewards/margins": -0.1618654727935791, + "rewards/rejected": 2.599637985229492, + "step": 7268 + }, + { + "epoch": 1.18, + "learning_rate": 3.7988087569610766e-06, + "logits/chosen": -1.220282793045044, + "logits/rejected": -1.3454747200012207, + "logps/chosen": -223.55816650390625, + "logps/rejected": -242.14376831054688, + "loss": 1.2965, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.8427276611328125, + "rewards/margins": -2.465914726257324, + "rewards/rejected": 9.308642387390137, + "step": 7269 + }, + { + "epoch": 1.18, + "learning_rate": 3.7975330338158913e-06, + "logits/chosen": -0.7649865746498108, + "logits/rejected": -0.7635972499847412, + "logps/chosen": -9.490655899047852, + "logps/rejected": -2.3305909633636475, + "loss": 0.6439, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4304618835449219, + "rewards/margins": 0.20778462290763855, + "rewards/rejected": 0.22267726063728333, + "step": 7270 + }, + { + "epoch": 1.18, + "learning_rate": 3.7962573937497947e-06, + "logits/chosen": -1.4176762104034424, + "logits/rejected": -1.562212347984314, + "logps/chosen": -81.94677734375, + "logps/rejected": -111.49458312988281, + "loss": 4.0617, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9644532203674316, + "rewards/margins": -4.9047532081604, + "rewards/rejected": 7.869206428527832, + "step": 7271 + }, + { + "epoch": 1.18, + "learning_rate": 3.794981836850923e-06, + "logits/chosen": -1.514190673828125, + "logits/rejected": -1.4913218021392822, + "logps/chosen": -36.227989196777344, + "logps/rejected": -55.32963943481445, + "loss": 0.3102, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9818557500839233, + "rewards/margins": 0.158156156539917, + "rewards/rejected": 1.8236995935440063, + "step": 7272 + }, + { + "epoch": 1.18, + "learning_rate": 3.7937063632074037e-06, + "logits/chosen": -1.3063055276870728, + "logits/rejected": -1.2096675634384155, + "logps/chosen": -154.68063354492188, + "logps/rejected": -196.97317504882812, + "loss": 0.4628, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.349440097808838, + "rewards/margins": 1.6306809186935425, + "rewards/rejected": 1.7187591791152954, + "step": 7273 + }, + { + "epoch": 1.18, + "learning_rate": 3.7924309729073616e-06, + "logits/chosen": -1.0925664901733398, + "logits/rejected": -1.0034135580062866, + "logps/chosen": -71.47716522216797, + "logps/rejected": -40.364845275878906, + "loss": 0.2339, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5762765407562256, + "rewards/margins": 0.7905553579330444, + "rewards/rejected": 1.7857211828231812, + "step": 7274 + }, + { + "epoch": 1.18, + "learning_rate": 3.7911556660389124e-06, + "logits/chosen": -1.2002383470535278, + "logits/rejected": -1.2130374908447266, + "logps/chosen": -62.34684753417969, + "logps/rejected": -48.92441177368164, + "loss": 0.3913, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6502511501312256, + "rewards/margins": -0.13434481620788574, + "rewards/rejected": 3.7845959663391113, + "step": 7275 + }, + { + "epoch": 1.18, + "learning_rate": 3.7898804426901687e-06, + "logits/chosen": -1.5657463073730469, + "logits/rejected": -1.557532548904419, + "logps/chosen": -60.8979606628418, + "logps/rejected": -84.43262481689453, + "loss": 0.5237, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.941893458366394, + "rewards/margins": -0.009431838989257812, + "rewards/rejected": 1.9513252973556519, + "step": 7276 + }, + { + "epoch": 1.18, + "learning_rate": 3.788605302949236e-06, + "logits/chosen": -0.6934731006622314, + "logits/rejected": -0.7476707100868225, + "logps/chosen": -52.510276794433594, + "logps/rejected": -105.60310363769531, + "loss": 0.6571, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8738594055175781, + "rewards/margins": 0.8206466436386108, + "rewards/rejected": 1.0532127618789673, + "step": 7277 + }, + { + "epoch": 1.18, + "learning_rate": 3.787330246904215e-06, + "logits/chosen": -1.0605014562606812, + "logits/rejected": -1.0605014562606812, + "logps/chosen": -33.89186096191406, + "logps/rejected": -33.89186096191406, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5113705396652222, + "rewards/margins": 0.0, + "rewards/rejected": 1.5113705396652222, + "step": 7278 + }, + { + "epoch": 1.18, + "learning_rate": 3.786055274643199e-06, + "logits/chosen": -1.151031494140625, + "logits/rejected": -1.1578915119171143, + "logps/chosen": -71.96448516845703, + "logps/rejected": -116.7510986328125, + "loss": 0.1806, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8317283391952515, + "rewards/margins": 0.9936835765838623, + "rewards/rejected": 0.8380447626113892, + "step": 7279 + }, + { + "epoch": 1.18, + "learning_rate": 3.7847803862542776e-06, + "logits/chosen": -1.3779510259628296, + "logits/rejected": -1.347622275352478, + "logps/chosen": -99.95288848876953, + "logps/rejected": -60.99592590332031, + "loss": 0.1822, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4031455516815186, + "rewards/margins": 0.8978996276855469, + "rewards/rejected": 2.5052459239959717, + "step": 7280 + }, + { + "epoch": 1.18, + "learning_rate": 3.7835055818255324e-06, + "logits/chosen": -2.1699066162109375, + "logits/rejected": -2.2430129051208496, + "logps/chosen": -146.78778076171875, + "logps/rejected": -103.11172485351562, + "loss": 0.227, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.9591064453125, + "rewards/margins": 0.6659116744995117, + "rewards/rejected": 6.293194770812988, + "step": 7281 + }, + { + "epoch": 1.18, + "learning_rate": 3.782230861445041e-06, + "logits/chosen": -1.1905484199523926, + "logits/rejected": -1.1971330642700195, + "logps/chosen": -37.483089447021484, + "logps/rejected": -5.356312274932861, + "loss": 0.2648, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8994163870811462, + "rewards/margins": 0.4851304888725281, + "rewards/rejected": 0.41428589820861816, + "step": 7282 + }, + { + "epoch": 1.18, + "learning_rate": 3.7809562252008745e-06, + "logits/chosen": -1.4908232688903809, + "logits/rejected": -1.4023100137710571, + "logps/chosen": -98.772216796875, + "logps/rejected": -72.38162231445312, + "loss": 0.2365, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.9199538230896, + "rewards/margins": 3.7937896251678467, + "rewards/rejected": 3.126164197921753, + "step": 7283 + }, + { + "epoch": 1.18, + "learning_rate": 3.7796816731810985e-06, + "logits/chosen": -1.2590816020965576, + "logits/rejected": -1.2590816020965576, + "logps/chosen": -36.54590606689453, + "logps/rejected": -36.54590606689453, + "loss": 0.3542, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.233144283294678, + "rewards/margins": 0.0, + "rewards/rejected": 4.233144283294678, + "step": 7284 + }, + { + "epoch": 1.18, + "learning_rate": 3.778407205473772e-06, + "logits/chosen": -1.4904447793960571, + "logits/rejected": -1.5372644662857056, + "logps/chosen": -60.42258834838867, + "logps/rejected": -79.43202209472656, + "loss": 1.9143, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3390071392059326, + "rewards/margins": -3.6164252758026123, + "rewards/rejected": 5.955432415008545, + "step": 7285 + }, + { + "epoch": 1.18, + "learning_rate": 3.777132822166949e-06, + "logits/chosen": -1.3418720960617065, + "logits/rejected": -1.1768780946731567, + "logps/chosen": -116.34806823730469, + "logps/rejected": -39.180267333984375, + "loss": 0.5453, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.459974765777588, + "rewards/margins": -0.19670414924621582, + "rewards/rejected": 2.6566789150238037, + "step": 7286 + }, + { + "epoch": 1.18, + "learning_rate": 3.7758585233486766e-06, + "logits/chosen": -1.083288311958313, + "logits/rejected": -0.8380085229873657, + "logps/chosen": -84.27572631835938, + "logps/rejected": -23.028051376342773, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.310662746429443, + "rewards/margins": 4.8903889656066895, + "rewards/rejected": 0.4202737808227539, + "step": 7287 + }, + { + "epoch": 1.18, + "learning_rate": 3.7745843091069976e-06, + "logits/chosen": -1.1641594171524048, + "logits/rejected": -1.1384358406066895, + "logps/chosen": -50.436275482177734, + "logps/rejected": -49.27027130126953, + "loss": 0.6582, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.201977252960205, + "rewards/margins": -0.12165641784667969, + "rewards/rejected": 2.3236336708068848, + "step": 7288 + }, + { + "epoch": 1.18, + "learning_rate": 3.7733101795299476e-06, + "logits/chosen": -1.19243586063385, + "logits/rejected": -1.1702511310577393, + "logps/chosen": -82.24211883544922, + "logps/rejected": -115.9691162109375, + "loss": 1.787, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7963387966156006, + "rewards/margins": -1.8241536617279053, + "rewards/rejected": 4.620492458343506, + "step": 7289 + }, + { + "epoch": 1.18, + "learning_rate": 3.7720361347055573e-06, + "logits/chosen": -0.8125651478767395, + "logits/rejected": -0.813501238822937, + "logps/chosen": -40.31062316894531, + "logps/rejected": -40.25970458984375, + "loss": 0.8334, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5410897731781006, + "rewards/margins": -0.2722175121307373, + "rewards/rejected": 2.813307285308838, + "step": 7290 + }, + { + "epoch": 1.18, + "learning_rate": 3.770762174721851e-06, + "logits/chosen": -1.2259626388549805, + "logits/rejected": -1.2990533113479614, + "logps/chosen": -63.54328536987305, + "logps/rejected": -112.01954650878906, + "loss": 1.5449, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7254574298858643, + "rewards/margins": -2.160829782485962, + "rewards/rejected": 4.886287212371826, + "step": 7291 + }, + { + "epoch": 1.18, + "learning_rate": 3.7694882996668475e-06, + "logits/chosen": -0.8533637523651123, + "logits/rejected": -0.8533637523651123, + "logps/chosen": -0.8742688894271851, + "logps/rejected": -0.8742688894271851, + "loss": 0.3625, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.338449627161026, + "rewards/margins": 0.0, + "rewards/rejected": 0.338449627161026, + "step": 7292 + }, + { + "epoch": 1.18, + "learning_rate": 3.7682145096285588e-06, + "logits/chosen": -1.026662826538086, + "logits/rejected": -0.8588222861289978, + "logps/chosen": -84.66749572753906, + "logps/rejected": -43.64424133300781, + "loss": 0.0451, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.969861030578613, + "rewards/margins": 2.8019731044769287, + "rewards/rejected": 2.1678879261016846, + "step": 7293 + }, + { + "epoch": 1.18, + "learning_rate": 3.766940804694992e-06, + "logits/chosen": -1.6806198358535767, + "logits/rejected": -1.5827746391296387, + "logps/chosen": -103.82207489013672, + "logps/rejected": -104.49303436279297, + "loss": 0.0698, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.283666133880615, + "rewards/margins": 2.6231000423431396, + "rewards/rejected": 3.6605660915374756, + "step": 7294 + }, + { + "epoch": 1.18, + "learning_rate": 3.765667184954148e-06, + "logits/chosen": -1.1371879577636719, + "logits/rejected": -1.1371879577636719, + "logps/chosen": -61.235450744628906, + "logps/rejected": -61.235450744628906, + "loss": 0.4139, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.286479949951172, + "rewards/margins": 0.0, + "rewards/rejected": 4.286479949951172, + "step": 7295 + }, + { + "epoch": 1.18, + "learning_rate": 3.764393650494023e-06, + "logits/chosen": -1.0806530714035034, + "logits/rejected": -1.0825169086456299, + "logps/chosen": -65.95606994628906, + "logps/rejected": -48.86823272705078, + "loss": 0.3601, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7336692810058594, + "rewards/margins": 2.0269782543182373, + "rewards/rejected": 0.7066909670829773, + "step": 7296 + }, + { + "epoch": 1.18, + "learning_rate": 3.7631202014026057e-06, + "logits/chosen": -1.3361976146697998, + "logits/rejected": -1.1772513389587402, + "logps/chosen": -99.04107666015625, + "logps/rejected": -42.87476348876953, + "loss": 0.1329, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.157834053039551, + "rewards/margins": 1.6881976127624512, + "rewards/rejected": 4.4696364402771, + "step": 7297 + }, + { + "epoch": 1.18, + "learning_rate": 3.7618468377678787e-06, + "logits/chosen": -1.0034257173538208, + "logits/rejected": -1.0034257173538208, + "logps/chosen": -25.59497833251953, + "logps/rejected": -25.59497833251953, + "loss": 2.1201, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4808429777622223, + "rewards/margins": 0.0, + "rewards/rejected": 0.4808429777622223, + "step": 7298 + }, + { + "epoch": 1.18, + "learning_rate": 3.7605735596778194e-06, + "logits/chosen": -1.3484736680984497, + "logits/rejected": -1.3242594003677368, + "logps/chosen": -61.85085678100586, + "logps/rejected": -45.44194412231445, + "loss": 0.5716, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3680310249328613, + "rewards/margins": 0.35212182998657227, + "rewards/rejected": 3.015909194946289, + "step": 7299 + }, + { + "epoch": 1.18, + "learning_rate": 3.7593003672204e-06, + "logits/chosen": -1.3342305421829224, + "logits/rejected": -1.3962022066116333, + "logps/chosen": -73.18435668945312, + "logps/rejected": -90.99236297607422, + "loss": 1.703, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.241408586502075, + "rewards/margins": -1.8361799716949463, + "rewards/rejected": 4.0775885581970215, + "step": 7300 + }, + { + "epoch": 1.19, + "learning_rate": 3.7580272604835847e-06, + "logits/chosen": -1.0693808794021606, + "logits/rejected": -0.9745543003082275, + "logps/chosen": -72.64448547363281, + "logps/rejected": -33.21331024169922, + "loss": 0.8181, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5339020490646362, + "rewards/margins": -0.44969630241394043, + "rewards/rejected": 1.9835983514785767, + "step": 7301 + }, + { + "epoch": 1.19, + "learning_rate": 3.7567542395553346e-06, + "logits/chosen": -1.108197569847107, + "logits/rejected": -1.108197569847107, + "logps/chosen": -35.691253662109375, + "logps/rejected": -35.691253662109375, + "loss": 0.6666, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.06636118888855, + "rewards/margins": 0.0, + "rewards/rejected": 2.06636118888855, + "step": 7302 + }, + { + "epoch": 1.19, + "learning_rate": 3.7554813045236034e-06, + "logits/chosen": -1.2732479572296143, + "logits/rejected": -1.197052001953125, + "logps/chosen": -71.34963989257812, + "logps/rejected": -13.519712448120117, + "loss": 1.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5311249494552612, + "rewards/margins": 1.00319242477417, + "rewards/rejected": 0.5279325842857361, + "step": 7303 + }, + { + "epoch": 1.19, + "learning_rate": 3.7542084554763373e-06, + "logits/chosen": -1.5360760688781738, + "logits/rejected": -1.3119701147079468, + "logps/chosen": -85.67577362060547, + "logps/rejected": -40.86747360229492, + "loss": 0.0836, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.268796443939209, + "rewards/margins": 5.17593240737915, + "rewards/rejected": 1.092863917350769, + "step": 7304 + }, + { + "epoch": 1.19, + "learning_rate": 3.7529356925014803e-06, + "logits/chosen": -1.0780175924301147, + "logits/rejected": -1.0948783159255981, + "logps/chosen": -83.70648956298828, + "logps/rejected": -39.255706787109375, + "loss": 0.5151, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2110602855682373, + "rewards/margins": 0.9145621061325073, + "rewards/rejected": 1.29649817943573, + "step": 7305 + }, + { + "epoch": 1.19, + "learning_rate": 3.751663015686966e-06, + "logits/chosen": -1.13043212890625, + "logits/rejected": -0.9602475166320801, + "logps/chosen": -58.82182312011719, + "logps/rejected": -25.23683738708496, + "loss": 0.0734, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6234261989593506, + "rewards/margins": 2.244274377822876, + "rewards/rejected": 0.3791519105434418, + "step": 7306 + }, + { + "epoch": 1.19, + "learning_rate": 3.7503904251207263e-06, + "logits/chosen": -1.2634094953536987, + "logits/rejected": -1.3887252807617188, + "logps/chosen": -30.72184181213379, + "logps/rejected": -180.5355224609375, + "loss": 3.4277, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5523359775543213, + "rewards/margins": -5.962352752685547, + "rewards/rejected": 8.514688491821289, + "step": 7307 + }, + { + "epoch": 1.19, + "learning_rate": 3.7491179208906835e-06, + "logits/chosen": -1.4519187211990356, + "logits/rejected": -1.3547438383102417, + "logps/chosen": -75.27601623535156, + "logps/rejected": -24.886703491210938, + "loss": 0.6662, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.559215545654297, + "rewards/margins": 1.8519847393035889, + "rewards/rejected": 1.707230806350708, + "step": 7308 + }, + { + "epoch": 1.19, + "learning_rate": 3.747845503084757e-06, + "logits/chosen": -1.2474168539047241, + "logits/rejected": -1.1026970148086548, + "logps/chosen": -60.0346794128418, + "logps/rejected": -14.850351333618164, + "loss": 0.0217, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.569974422454834, + "rewards/margins": 3.466310739517212, + "rewards/rejected": 1.103663682937622, + "step": 7309 + }, + { + "epoch": 1.19, + "learning_rate": 3.746573171790858e-06, + "logits/chosen": -1.3268344402313232, + "logits/rejected": -1.364338755607605, + "logps/chosen": -68.91791534423828, + "logps/rejected": -85.87821960449219, + "loss": 0.428, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6830642223358154, + "rewards/margins": 0.7730879783630371, + "rewards/rejected": 1.9099762439727783, + "step": 7310 + }, + { + "epoch": 1.19, + "learning_rate": 3.745300927096893e-06, + "logits/chosen": -1.4674383401870728, + "logits/rejected": -1.6566598415374756, + "logps/chosen": -76.53601837158203, + "logps/rejected": -174.58926391601562, + "loss": 2.6197, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.922094702720642, + "rewards/margins": -4.758094787597656, + "rewards/rejected": 6.680189609527588, + "step": 7311 + }, + { + "epoch": 1.19, + "learning_rate": 3.744028769090762e-06, + "logits/chosen": -1.200995922088623, + "logits/rejected": -1.2401210069656372, + "logps/chosen": -69.1192398071289, + "logps/rejected": -92.09677124023438, + "loss": 0.1209, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4174981117248535, + "rewards/margins": 1.504219889640808, + "rewards/rejected": 0.9132782220840454, + "step": 7312 + }, + { + "epoch": 1.19, + "learning_rate": 3.7427566978603592e-06, + "logits/chosen": -1.387423038482666, + "logits/rejected": -1.360090732574463, + "logps/chosen": -52.533851623535156, + "logps/rejected": -64.81163787841797, + "loss": 0.4114, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.756694793701172, + "rewards/margins": 0.9724150896072388, + "rewards/rejected": 1.784279704093933, + "step": 7313 + }, + { + "epoch": 1.19, + "learning_rate": 3.7414847134935716e-06, + "logits/chosen": -1.468745470046997, + "logits/rejected": -1.5045199394226074, + "logps/chosen": -61.05609893798828, + "logps/rejected": -100.49662780761719, + "loss": 0.7202, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5047775506973267, + "rewards/margins": -0.7362015247344971, + "rewards/rejected": 1.2409790754318237, + "step": 7314 + }, + { + "epoch": 1.19, + "learning_rate": 3.7402128160782825e-06, + "logits/chosen": -1.2244218587875366, + "logits/rejected": -1.2136605978012085, + "logps/chosen": -37.075740814208984, + "logps/rejected": -40.882171630859375, + "loss": 0.7197, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.419445514678955, + "rewards/margins": -0.4189441204071045, + "rewards/rejected": 3.8383896350860596, + "step": 7315 + }, + { + "epoch": 1.19, + "learning_rate": 3.7389410057023672e-06, + "logits/chosen": -1.364656686782837, + "logits/rejected": -1.2548414468765259, + "logps/chosen": -107.70166778564453, + "logps/rejected": -65.76815795898438, + "loss": 0.2062, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.438118934631348, + "rewards/margins": 3.939058780670166, + "rewards/rejected": 2.4990601539611816, + "step": 7316 + }, + { + "epoch": 1.19, + "learning_rate": 3.7376692824536962e-06, + "logits/chosen": -1.2118936777114868, + "logits/rejected": -1.1352548599243164, + "logps/chosen": -78.99190521240234, + "logps/rejected": -32.51367950439453, + "loss": 0.9102, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5390450954437256, + "rewards/margins": 2.3843584060668945, + "rewards/rejected": 0.15468673408031464, + "step": 7317 + }, + { + "epoch": 1.19, + "learning_rate": 3.7363976464201348e-06, + "logits/chosen": -1.4828358888626099, + "logits/rejected": -1.4839829206466675, + "logps/chosen": -78.7359619140625, + "logps/rejected": -75.76335144042969, + "loss": 0.1899, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8310317993164062, + "rewards/margins": 1.1882537603378296, + "rewards/rejected": 1.6427780389785767, + "step": 7318 + }, + { + "epoch": 1.19, + "learning_rate": 3.7351260976895383e-06, + "logits/chosen": -1.3985259532928467, + "logits/rejected": -1.354841947555542, + "logps/chosen": -107.51844787597656, + "logps/rejected": -116.56684875488281, + "loss": 0.485, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.218928813934326, + "rewards/margins": -0.4771881103515625, + "rewards/rejected": 6.696116924285889, + "step": 7319 + }, + { + "epoch": 1.19, + "learning_rate": 3.733854636349761e-06, + "logits/chosen": -1.1305606365203857, + "logits/rejected": -1.167641282081604, + "logps/chosen": -47.93712615966797, + "logps/rejected": -118.33724975585938, + "loss": 0.5368, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8182792663574219, + "rewards/margins": 0.6849418878555298, + "rewards/rejected": 1.133337378501892, + "step": 7320 + }, + { + "epoch": 1.19, + "learning_rate": 3.732583262488647e-06, + "logits/chosen": -0.7312527298927307, + "logits/rejected": -0.7843717932701111, + "logps/chosen": -3.723954916000366, + "logps/rejected": -49.158485412597656, + "loss": 0.5206, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6110483407974243, + "rewards/margins": 0.13309919834136963, + "rewards/rejected": 0.4779491424560547, + "step": 7321 + }, + { + "epoch": 1.19, + "learning_rate": 3.7313119761940375e-06, + "logits/chosen": -0.6709061861038208, + "logits/rejected": -0.6660796403884888, + "logps/chosen": -4.1831889152526855, + "logps/rejected": -19.104393005371094, + "loss": 0.4291, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1513778269290924, + "rewards/margins": -0.17567676305770874, + "rewards/rejected": 0.32705458998680115, + "step": 7322 + }, + { + "epoch": 1.19, + "learning_rate": 3.7300407775537663e-06, + "logits/chosen": -1.3670040369033813, + "logits/rejected": -1.3380786180496216, + "logps/chosen": -236.13226318359375, + "logps/rejected": -87.98113250732422, + "loss": 0.098, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.54274320602417, + "rewards/margins": 1.7938423156738281, + "rewards/rejected": 5.748900890350342, + "step": 7323 + }, + { + "epoch": 1.19, + "learning_rate": 3.7287696666556607e-06, + "logits/chosen": -1.2674272060394287, + "logits/rejected": -1.179551362991333, + "logps/chosen": -175.66326904296875, + "logps/rejected": -99.20234680175781, + "loss": 0.0389, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3314011096954346, + "rewards/margins": 2.52374267578125, + "rewards/rejected": 0.8076583743095398, + "step": 7324 + }, + { + "epoch": 1.19, + "learning_rate": 3.727498643587543e-06, + "logits/chosen": -1.5738232135772705, + "logits/rejected": -1.6363446712493896, + "logps/chosen": -200.64511108398438, + "logps/rejected": -131.3712158203125, + "loss": 0.0718, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.389062404632568, + "rewards/margins": 2.442112684249878, + "rewards/rejected": 2.9469497203826904, + "step": 7325 + }, + { + "epoch": 1.19, + "learning_rate": 3.726227708437228e-06, + "logits/chosen": -1.115965723991394, + "logits/rejected": -1.17217218875885, + "logps/chosen": -49.660640716552734, + "logps/rejected": -72.54051971435547, + "loss": 0.9657, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5309375524520874, + "rewards/margins": -1.6327732801437378, + "rewards/rejected": 3.163710832595825, + "step": 7326 + }, + { + "epoch": 1.19, + "learning_rate": 3.724956861292526e-06, + "logits/chosen": -1.2686365842819214, + "logits/rejected": -1.3112435340881348, + "logps/chosen": -29.2296085357666, + "logps/rejected": -82.0333480834961, + "loss": 1.8516, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0528860092163086, + "rewards/margins": -3.1163973808288574, + "rewards/rejected": 5.169283390045166, + "step": 7327 + }, + { + "epoch": 1.19, + "learning_rate": 3.7236861022412394e-06, + "logits/chosen": -1.409599781036377, + "logits/rejected": -1.289185643196106, + "logps/chosen": -96.08352661132812, + "logps/rejected": -46.13822937011719, + "loss": 0.1574, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.578014373779297, + "rewards/margins": 1.2004814147949219, + "rewards/rejected": 1.377532958984375, + "step": 7328 + }, + { + "epoch": 1.19, + "learning_rate": 3.722415431371168e-06, + "logits/chosen": -0.7335047125816345, + "logits/rejected": -0.7335047125816345, + "logps/chosen": -35.800621032714844, + "logps/rejected": -35.800621032714844, + "loss": 0.5098, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0480011701583862, + "rewards/margins": 0.0, + "rewards/rejected": 1.0480011701583862, + "step": 7329 + }, + { + "epoch": 1.19, + "learning_rate": 3.7211448487701002e-06, + "logits/chosen": -1.3335046768188477, + "logits/rejected": -1.3079725503921509, + "logps/chosen": -82.24603271484375, + "logps/rejected": -68.25593566894531, + "loss": 0.3882, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7364723682403564, + "rewards/margins": 0.9781684875488281, + "rewards/rejected": 1.7583038806915283, + "step": 7330 + }, + { + "epoch": 1.19, + "learning_rate": 3.7198743545258233e-06, + "logits/chosen": -1.1370247602462769, + "logits/rejected": -1.0778619050979614, + "logps/chosen": -59.6495246887207, + "logps/rejected": -86.31037139892578, + "loss": 1.3658, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3092968463897705, + "rewards/margins": -0.12746775150299072, + "rewards/rejected": 1.4367645978927612, + "step": 7331 + }, + { + "epoch": 1.19, + "learning_rate": 3.7186039487261162e-06, + "logits/chosen": -1.2418832778930664, + "logits/rejected": -1.20204496383667, + "logps/chosen": -81.87184143066406, + "logps/rejected": -43.513282775878906, + "loss": 1.6519, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.087099552154541, + "rewards/margins": 0.2053307294845581, + "rewards/rejected": 1.881768822669983, + "step": 7332 + }, + { + "epoch": 1.19, + "learning_rate": 3.7173336314587514e-06, + "logits/chosen": -1.3794010877609253, + "logits/rejected": -0.9745878577232361, + "logps/chosen": -96.81879425048828, + "logps/rejected": -63.3030891418457, + "loss": 0.1858, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.578554630279541, + "rewards/margins": 1.6125898361206055, + "rewards/rejected": 4.9659647941589355, + "step": 7333 + }, + { + "epoch": 1.19, + "learning_rate": 3.716063402811496e-06, + "logits/chosen": -1.2345411777496338, + "logits/rejected": -1.2345411777496338, + "logps/chosen": -47.506744384765625, + "logps/rejected": -47.506744384765625, + "loss": 0.6152, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.301985263824463, + "rewards/margins": 0.0, + "rewards/rejected": 4.301985263824463, + "step": 7334 + }, + { + "epoch": 1.19, + "learning_rate": 3.7147932628721114e-06, + "logits/chosen": -1.2766399383544922, + "logits/rejected": -1.2766399383544922, + "logps/chosen": -72.68265533447266, + "logps/rejected": -72.68265533447266, + "loss": 0.347, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.243326663970947, + "rewards/margins": 0.0, + "rewards/rejected": 4.243326663970947, + "step": 7335 + }, + { + "epoch": 1.19, + "learning_rate": 3.713523211728351e-06, + "logits/chosen": -1.2588108777999878, + "logits/rejected": -1.268214464187622, + "logps/chosen": -122.02499389648438, + "logps/rejected": -124.00215148925781, + "loss": 0.3733, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.471304893493652, + "rewards/margins": 0.41208839416503906, + "rewards/rejected": 8.059216499328613, + "step": 7336 + }, + { + "epoch": 1.19, + "learning_rate": 3.7122532494679643e-06, + "logits/chosen": -1.2103681564331055, + "logits/rejected": -1.118459701538086, + "logps/chosen": -54.05179977416992, + "logps/rejected": -48.20277404785156, + "loss": 0.1057, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.601445436477661, + "rewards/margins": 1.7220356464385986, + "rewards/rejected": 1.8794097900390625, + "step": 7337 + }, + { + "epoch": 1.19, + "learning_rate": 3.710983376178693e-06, + "logits/chosen": -1.091408610343933, + "logits/rejected": -1.0490812063217163, + "logps/chosen": -69.22674560546875, + "logps/rejected": -40.74327850341797, + "loss": 0.5245, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9673317670822144, + "rewards/margins": 0.0375518798828125, + "rewards/rejected": 1.9297798871994019, + "step": 7338 + }, + { + "epoch": 1.19, + "learning_rate": 3.709713591948273e-06, + "logits/chosen": -1.285200834274292, + "logits/rejected": -1.285200834274292, + "logps/chosen": -55.96207809448242, + "logps/rejected": -55.96207809448242, + "loss": 0.6651, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0232722759246826, + "rewards/margins": 0.0, + "rewards/rejected": 3.0232722759246826, + "step": 7339 + }, + { + "epoch": 1.19, + "learning_rate": 3.7084438968644356e-06, + "logits/chosen": -1.5894001722335815, + "logits/rejected": -1.5327492952346802, + "logps/chosen": -100.28604125976562, + "logps/rejected": -95.26303100585938, + "loss": 0.4884, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.838464260101318, + "rewards/margins": 1.3035047054290771, + "rewards/rejected": 3.534959554672241, + "step": 7340 + }, + { + "epoch": 1.19, + "learning_rate": 3.707174291014904e-06, + "logits/chosen": -1.0130139589309692, + "logits/rejected": -1.0130139589309692, + "logps/chosen": -19.425464630126953, + "logps/rejected": -19.425464630126953, + "loss": 0.4681, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.472306489944458, + "rewards/margins": 0.0, + "rewards/rejected": 2.472306489944458, + "step": 7341 + }, + { + "epoch": 1.19, + "learning_rate": 3.705904774487396e-06, + "logits/chosen": -1.1647326946258545, + "logits/rejected": -1.033448338508606, + "logps/chosen": -60.66496658325195, + "logps/rejected": -24.316619873046875, + "loss": 0.4139, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9627255201339722, + "rewards/margins": 1.4647917747497559, + "rewards/rejected": 0.4979337751865387, + "step": 7342 + }, + { + "epoch": 1.19, + "learning_rate": 3.7046353473696232e-06, + "logits/chosen": -1.3057128190994263, + "logits/rejected": -1.1470175981521606, + "logps/chosen": -119.8366928100586, + "logps/rejected": -53.941078186035156, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.544666290283203, + "rewards/margins": 3.715665340423584, + "rewards/rejected": 1.8290008306503296, + "step": 7343 + }, + { + "epoch": 1.19, + "learning_rate": 3.703366009749292e-06, + "logits/chosen": -0.9928404688835144, + "logits/rejected": -1.002435564994812, + "logps/chosen": -74.70989990234375, + "logps/rejected": -30.81484603881836, + "loss": 0.3906, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4964828491210938, + "rewards/margins": -0.1281430721282959, + "rewards/rejected": 2.6246259212493896, + "step": 7344 + }, + { + "epoch": 1.19, + "learning_rate": 3.702096761714099e-06, + "logits/chosen": -1.2065068483352661, + "logits/rejected": -1.1799817085266113, + "logps/chosen": -129.3239288330078, + "logps/rejected": -105.30412292480469, + "loss": 0.617, + "rewards/accuracies": 0.0, + "rewards/chosen": 8.316998481750488, + "rewards/margins": -0.8212566375732422, + "rewards/rejected": 9.13825511932373, + "step": 7345 + }, + { + "epoch": 1.19, + "learning_rate": 3.70082760335174e-06, + "logits/chosen": -1.0943831205368042, + "logits/rejected": -1.1525040864944458, + "logps/chosen": -84.86802673339844, + "logps/rejected": -105.19578552246094, + "loss": 3.0219, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.780160427093506, + "rewards/margins": -3.1440234184265137, + "rewards/rejected": 7.9241838455200195, + "step": 7346 + }, + { + "epoch": 1.19, + "learning_rate": 3.699558534749901e-06, + "logits/chosen": -0.9759090542793274, + "logits/rejected": -0.9035002589225769, + "logps/chosen": -75.20612335205078, + "logps/rejected": -59.19664001464844, + "loss": 0.5098, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4455918073654175, + "rewards/margins": -0.4996504783630371, + "rewards/rejected": 1.9452422857284546, + "step": 7347 + }, + { + "epoch": 1.19, + "learning_rate": 3.6982895559962617e-06, + "logits/chosen": -1.3100955486297607, + "logits/rejected": -1.428972601890564, + "logps/chosen": -159.51394653320312, + "logps/rejected": -114.22665405273438, + "loss": 0.1805, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.458864212036133, + "rewards/margins": 4.1658616065979, + "rewards/rejected": 5.293002605438232, + "step": 7348 + }, + { + "epoch": 1.19, + "learning_rate": 3.697020667178497e-06, + "logits/chosen": -1.2961900234222412, + "logits/rejected": -1.2152576446533203, + "logps/chosen": -92.99345397949219, + "logps/rejected": -89.51100158691406, + "loss": 0.9138, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.519206523895264, + "rewards/margins": 2.3339176177978516, + "rewards/rejected": 4.185288906097412, + "step": 7349 + }, + { + "epoch": 1.19, + "learning_rate": 3.6957518683842753e-06, + "logits/chosen": -1.0466177463531494, + "logits/rejected": -1.070493459701538, + "logps/chosen": -40.19535827636719, + "logps/rejected": -81.65540313720703, + "loss": 0.5404, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6587311029434204, + "rewards/margins": 1.016359806060791, + "rewards/rejected": 0.6423713564872742, + "step": 7350 + }, + { + "epoch": 1.19, + "learning_rate": 3.6944831597012594e-06, + "logits/chosen": -1.231846809387207, + "logits/rejected": -1.207682490348816, + "logps/chosen": -118.09632110595703, + "logps/rejected": -59.67591857910156, + "loss": 1.8534, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4775688648223877, + "rewards/margins": 0.7577835321426392, + "rewards/rejected": 1.7197853326797485, + "step": 7351 + }, + { + "epoch": 1.19, + "learning_rate": 3.693214541217104e-06, + "logits/chosen": -1.0049580335617065, + "logits/rejected": -0.9493845105171204, + "logps/chosen": -60.671173095703125, + "logps/rejected": -73.14319610595703, + "loss": 0.2928, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.980302572250366, + "rewards/margins": 1.9272706508636475, + "rewards/rejected": 1.0530319213867188, + "step": 7352 + }, + { + "epoch": 1.19, + "learning_rate": 3.6919460130194586e-06, + "logits/chosen": -0.9271015524864197, + "logits/rejected": -0.9094080328941345, + "logps/chosen": -27.363492965698242, + "logps/rejected": -23.404939651489258, + "loss": 1.1878, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.43192997574806213, + "rewards/margins": -0.19370177388191223, + "rewards/rejected": 0.6256317496299744, + "step": 7353 + }, + { + "epoch": 1.19, + "learning_rate": 3.6906775751959667e-06, + "logits/chosen": -1.0551681518554688, + "logits/rejected": -1.0551681518554688, + "logps/chosen": -13.096090316772461, + "logps/rejected": -13.096090316772461, + "loss": 0.4163, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7254881858825684, + "rewards/margins": 0.0, + "rewards/rejected": 2.7254881858825684, + "step": 7354 + }, + { + "epoch": 1.19, + "learning_rate": 3.689409227834265e-06, + "logits/chosen": -1.2177858352661133, + "logits/rejected": -1.2014009952545166, + "logps/chosen": -55.85712432861328, + "logps/rejected": -8.157658576965332, + "loss": 0.3781, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.921862781047821, + "rewards/margins": 0.3119945526123047, + "rewards/rejected": 0.6098682284355164, + "step": 7355 + }, + { + "epoch": 1.19, + "learning_rate": 3.688140971021985e-06, + "logits/chosen": -1.0298322439193726, + "logits/rejected": -0.9072534441947937, + "logps/chosen": -30.57918357849121, + "logps/rejected": -13.984238624572754, + "loss": 0.2523, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2699549198150635, + "rewards/margins": 1.6630488634109497, + "rewards/rejected": 0.6069060564041138, + "step": 7356 + }, + { + "epoch": 1.19, + "learning_rate": 3.6868728048467516e-06, + "logits/chosen": -1.29633629322052, + "logits/rejected": -1.1914256811141968, + "logps/chosen": -71.93379211425781, + "logps/rejected": -97.60819244384766, + "loss": 0.0797, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.181935787200928, + "rewards/margins": 2.7051680088043213, + "rewards/rejected": 1.4767677783966064, + "step": 7357 + }, + { + "epoch": 1.19, + "learning_rate": 3.6856047293961823e-06, + "logits/chosen": -0.8926501870155334, + "logits/rejected": -0.8926313519477844, + "logps/chosen": -5.802732467651367, + "logps/rejected": -4.451968193054199, + "loss": 1.1923, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6280586123466492, + "rewards/margins": -0.0877264142036438, + "rewards/rejected": 0.715785026550293, + "step": 7358 + }, + { + "epoch": 1.19, + "learning_rate": 3.6843367447578894e-06, + "logits/chosen": -1.413691759109497, + "logits/rejected": -1.3070136308670044, + "logps/chosen": -82.79850769042969, + "logps/rejected": -28.11457633972168, + "loss": 0.1133, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.016221761703491, + "rewards/margins": 1.4163482189178467, + "rewards/rejected": 0.5998735427856445, + "step": 7359 + }, + { + "epoch": 1.19, + "learning_rate": 3.683068851019479e-06, + "logits/chosen": -1.2992802858352661, + "logits/rejected": -1.446170449256897, + "logps/chosen": -136.45187377929688, + "logps/rejected": -190.10177612304688, + "loss": 0.9321, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.869021892547607, + "rewards/margins": -1.6755280494689941, + "rewards/rejected": 8.544549942016602, + "step": 7360 + }, + { + "epoch": 1.19, + "learning_rate": 3.681801048268549e-06, + "logits/chosen": -1.181276798248291, + "logits/rejected": -1.2443188428878784, + "logps/chosen": -58.27290344238281, + "logps/rejected": -89.85570526123047, + "loss": 2.0675, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6890839338302612, + "rewards/margins": 0.19702231884002686, + "rewards/rejected": 1.4920616149902344, + "step": 7361 + }, + { + "epoch": 1.19, + "learning_rate": 3.6805333365926943e-06, + "logits/chosen": -1.1748522520065308, + "logits/rejected": -1.051272988319397, + "logps/chosen": -56.94994354248047, + "logps/rejected": -30.808250427246094, + "loss": 0.1482, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4964730739593506, + "rewards/margins": 1.8193082809448242, + "rewards/rejected": 0.6771648526191711, + "step": 7362 + }, + { + "epoch": 1.2, + "learning_rate": 3.679265716079501e-06, + "logits/chosen": -1.0388784408569336, + "logits/rejected": -1.0352036952972412, + "logps/chosen": -2.012361764907837, + "logps/rejected": -4.955413818359375, + "loss": 0.6363, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3632065951824188, + "rewards/margins": -0.020890504121780396, + "rewards/rejected": 0.3840970993041992, + "step": 7363 + }, + { + "epoch": 1.2, + "learning_rate": 3.6779981868165493e-06, + "logits/chosen": -1.1217001676559448, + "logits/rejected": -1.10115385055542, + "logps/chosen": -53.47283172607422, + "logps/rejected": -40.93748474121094, + "loss": 0.3284, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8996682167053223, + "rewards/margins": 1.6921833753585815, + "rewards/rejected": 1.2074848413467407, + "step": 7364 + }, + { + "epoch": 1.2, + "learning_rate": 3.6767307488914137e-06, + "logits/chosen": -1.1965317726135254, + "logits/rejected": -1.11521577835083, + "logps/chosen": -57.863128662109375, + "logps/rejected": -46.44988250732422, + "loss": 0.8816, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4812134504318237, + "rewards/margins": -1.5538138151168823, + "rewards/rejected": 3.035027265548706, + "step": 7365 + }, + { + "epoch": 1.2, + "learning_rate": 3.6754634023916636e-06, + "logits/chosen": -1.6042536497116089, + "logits/rejected": -1.5272431373596191, + "logps/chosen": -87.67118835449219, + "logps/rejected": -62.5140380859375, + "loss": 0.0971, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.250794887542725, + "rewards/margins": 1.6967673301696777, + "rewards/rejected": 3.554027557373047, + "step": 7366 + }, + { + "epoch": 1.2, + "learning_rate": 3.674196147404859e-06, + "logits/chosen": -1.4951876401901245, + "logits/rejected": -1.4577105045318604, + "logps/chosen": -107.25962829589844, + "logps/rejected": -87.00462341308594, + "loss": 0.0541, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.8182053565979, + "rewards/margins": 3.453183889389038, + "rewards/rejected": 2.3650214672088623, + "step": 7367 + }, + { + "epoch": 1.2, + "learning_rate": 3.6729289840185557e-06, + "logits/chosen": -1.2209718227386475, + "logits/rejected": -1.145287036895752, + "logps/chosen": -54.04536819458008, + "logps/rejected": -39.810420989990234, + "loss": 1.4903, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.130370855331421, + "rewards/margins": -0.43716955184936523, + "rewards/rejected": 2.567540407180786, + "step": 7368 + }, + { + "epoch": 1.2, + "learning_rate": 3.6716619123203024e-06, + "logits/chosen": -1.1571409702301025, + "logits/rejected": -1.1202526092529297, + "logps/chosen": -86.36329650878906, + "logps/rejected": -60.07720184326172, + "loss": 0.1788, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3376502990722656, + "rewards/margins": 0.8813034296035767, + "rewards/rejected": 1.456346869468689, + "step": 7369 + }, + { + "epoch": 1.2, + "learning_rate": 3.6703949323976425e-06, + "logits/chosen": -1.2791193723678589, + "logits/rejected": -1.2170274257659912, + "logps/chosen": -35.6386833190918, + "logps/rejected": -15.861268043518066, + "loss": 0.7165, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1361370086669922, + "rewards/margins": 0.575747013092041, + "rewards/rejected": 0.5603899955749512, + "step": 7370 + }, + { + "epoch": 1.2, + "learning_rate": 3.6691280443381117e-06, + "logits/chosen": -0.916875422000885, + "logits/rejected": -0.916875422000885, + "logps/chosen": -17.803117752075195, + "logps/rejected": -17.803117752075195, + "loss": 0.5758, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9276922345161438, + "rewards/margins": 0.0, + "rewards/rejected": 0.9276922345161438, + "step": 7371 + }, + { + "epoch": 1.2, + "learning_rate": 3.6678612482292403e-06, + "logits/chosen": -0.7740697860717773, + "logits/rejected": -0.7716763615608215, + "logps/chosen": -2.061281204223633, + "logps/rejected": -3.689708948135376, + "loss": 0.5755, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.24491678178310394, + "rewards/margins": -0.2695923447608948, + "rewards/rejected": 0.5145091414451599, + "step": 7372 + }, + { + "epoch": 1.2, + "learning_rate": 3.666594544158552e-06, + "logits/chosen": -1.1624339818954468, + "logits/rejected": -1.1730546951293945, + "logps/chosen": -74.44727325439453, + "logps/rejected": -133.14068603515625, + "loss": 1.9309, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.585708141326904, + "rewards/margins": -3.364722728729248, + "rewards/rejected": 8.950430870056152, + "step": 7373 + }, + { + "epoch": 1.2, + "learning_rate": 3.6653279322135637e-06, + "logits/chosen": -1.369777798652649, + "logits/rejected": -1.183491826057434, + "logps/chosen": -49.18520736694336, + "logps/rejected": -86.46928405761719, + "loss": 0.5653, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.851869106292725, + "rewards/margins": -0.3458223342895508, + "rewards/rejected": 6.197691440582275, + "step": 7374 + }, + { + "epoch": 1.2, + "learning_rate": 3.6640614124817864e-06, + "logits/chosen": -1.4647201299667358, + "logits/rejected": -1.503361463546753, + "logps/chosen": -49.372215270996094, + "logps/rejected": -58.33399200439453, + "loss": 0.6367, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2456130981445312, + "rewards/margins": -0.539238691329956, + "rewards/rejected": 3.7848517894744873, + "step": 7375 + }, + { + "epoch": 1.2, + "learning_rate": 3.662794985050725e-06, + "logits/chosen": -1.1685707569122314, + "logits/rejected": -1.2389814853668213, + "logps/chosen": -91.29932403564453, + "logps/rejected": -94.11316680908203, + "loss": 0.5606, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.722210645675659, + "rewards/margins": -0.5439698696136475, + "rewards/rejected": 3.2661805152893066, + "step": 7376 + }, + { + "epoch": 1.2, + "learning_rate": 3.6615286500078774e-06, + "logits/chosen": -1.199048638343811, + "logits/rejected": -1.1423438787460327, + "logps/chosen": -107.88493347167969, + "logps/rejected": -64.4410171508789, + "loss": 0.3874, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8122971057891846, + "rewards/margins": 0.9918464422225952, + "rewards/rejected": 1.8204506635665894, + "step": 7377 + }, + { + "epoch": 1.2, + "learning_rate": 3.6602624074407354e-06, + "logits/chosen": -1.4779020547866821, + "logits/rejected": -1.390730857849121, + "logps/chosen": -60.27420425415039, + "logps/rejected": -22.197847366333008, + "loss": 0.42, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.417365550994873, + "rewards/margins": 0.8764688968658447, + "rewards/rejected": 3.5408966541290283, + "step": 7378 + }, + { + "epoch": 1.2, + "learning_rate": 3.6589962574367843e-06, + "logits/chosen": -0.8516911864280701, + "logits/rejected": -0.8863538503646851, + "logps/chosen": -16.13819122314453, + "logps/rejected": -29.23141098022461, + "loss": 0.6482, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3513208627700806, + "rewards/margins": -0.45000362396240234, + "rewards/rejected": 1.801324486732483, + "step": 7379 + }, + { + "epoch": 1.2, + "learning_rate": 3.657730200083503e-06, + "logits/chosen": -1.32794189453125, + "logits/rejected": -1.3415366411209106, + "logps/chosen": -55.22827911376953, + "logps/rejected": -124.70425415039062, + "loss": 0.316, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.034261465072632, + "rewards/margins": 0.13228392601013184, + "rewards/rejected": 1.9019775390625, + "step": 7380 + }, + { + "epoch": 1.2, + "learning_rate": 3.6564642354683644e-06, + "logits/chosen": -1.469436526298523, + "logits/rejected": -1.3579738140106201, + "logps/chosen": -65.50537109375, + "logps/rejected": -14.469304084777832, + "loss": 2.2147, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6606369018554688, + "rewards/margins": 2.476653814315796, + "rewards/rejected": 0.18398313224315643, + "step": 7381 + }, + { + "epoch": 1.2, + "learning_rate": 3.655198363678834e-06, + "logits/chosen": -1.2143219709396362, + "logits/rejected": -1.19109046459198, + "logps/chosen": -101.69500732421875, + "logps/rejected": -125.9711685180664, + "loss": 0.6972, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7420151233673096, + "rewards/margins": 1.4139518737792969, + "rewards/rejected": 1.3280632495880127, + "step": 7382 + }, + { + "epoch": 1.2, + "learning_rate": 3.653932584802372e-06, + "logits/chosen": -1.2246941328048706, + "logits/rejected": -1.209730863571167, + "logps/chosen": -125.27970886230469, + "logps/rejected": -153.00991821289062, + "loss": 0.0374, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.35410475730896, + "rewards/margins": 2.5703492164611816, + "rewards/rejected": 0.7837554812431335, + "step": 7383 + }, + { + "epoch": 1.2, + "learning_rate": 3.6526668989264323e-06, + "logits/chosen": -1.1656043529510498, + "logits/rejected": -1.1504932641983032, + "logps/chosen": -61.88483428955078, + "logps/rejected": -49.27290344238281, + "loss": 0.3245, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.524811744689941, + "rewards/margins": 0.6691200733184814, + "rewards/rejected": 3.85569167137146, + "step": 7384 + }, + { + "epoch": 1.2, + "learning_rate": 3.6514013061384613e-06, + "logits/chosen": -1.0770127773284912, + "logits/rejected": -1.0426485538482666, + "logps/chosen": -63.866943359375, + "logps/rejected": -40.44871520996094, + "loss": 1.184, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4159622192382812, + "rewards/margins": -2.139674663543701, + "rewards/rejected": 4.555636882781982, + "step": 7385 + }, + { + "epoch": 1.2, + "learning_rate": 3.650135806525898e-06, + "logits/chosen": -1.5744097232818604, + "logits/rejected": -1.5468119382858276, + "logps/chosen": -56.220489501953125, + "logps/rejected": -50.74397659301758, + "loss": 1.8577, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.309053897857666, + "rewards/margins": 0.9044033288955688, + "rewards/rejected": 1.4046505689620972, + "step": 7386 + }, + { + "epoch": 1.2, + "learning_rate": 3.648870400176179e-06, + "logits/chosen": -0.8393908143043518, + "logits/rejected": -0.9476068615913391, + "logps/chosen": -48.338706970214844, + "logps/rejected": -99.08191680908203, + "loss": 0.5649, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3017224073410034, + "rewards/margins": -0.7257472276687622, + "rewards/rejected": 2.0274696350097656, + "step": 7387 + }, + { + "epoch": 1.2, + "learning_rate": 3.647605087176731e-06, + "logits/chosen": -1.1493537425994873, + "logits/rejected": -1.198080062866211, + "logps/chosen": -54.51769256591797, + "logps/rejected": -57.0997314453125, + "loss": 0.362, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.835253953933716, + "rewards/margins": 1.2259429693222046, + "rewards/rejected": 1.6093109846115112, + "step": 7388 + }, + { + "epoch": 1.2, + "learning_rate": 3.646339867614975e-06, + "logits/chosen": -1.2643661499023438, + "logits/rejected": -1.1850813627243042, + "logps/chosen": -90.38287353515625, + "logps/rejected": -122.10221099853516, + "loss": 0.2592, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.9380340576171875, + "rewards/margins": 0.38681840896606445, + "rewards/rejected": 5.551215648651123, + "step": 7389 + }, + { + "epoch": 1.2, + "learning_rate": 3.645074741578326e-06, + "logits/chosen": -0.6718387007713318, + "logits/rejected": -0.6004986763000488, + "logps/chosen": -37.16999816894531, + "logps/rejected": -47.6783447265625, + "loss": 0.4222, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4847538471221924, + "rewards/margins": 1.812677025794983, + "rewards/rejected": 0.6720768213272095, + "step": 7390 + }, + { + "epoch": 1.2, + "learning_rate": 3.643809709154192e-06, + "logits/chosen": -1.0766972303390503, + "logits/rejected": -1.0821648836135864, + "logps/chosen": -25.13253402709961, + "logps/rejected": -83.82733154296875, + "loss": 1.1288, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4130862951278687, + "rewards/margins": -0.9621480703353882, + "rewards/rejected": 2.375234365463257, + "step": 7391 + }, + { + "epoch": 1.2, + "learning_rate": 3.6425447704299745e-06, + "logits/chosen": -0.9473898410797119, + "logits/rejected": -0.9583486914634705, + "logps/chosen": -52.811649322509766, + "logps/rejected": -54.93122863769531, + "loss": 0.214, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.022298812866211, + "rewards/margins": 0.6279704570770264, + "rewards/rejected": 0.3943283259868622, + "step": 7392 + }, + { + "epoch": 1.2, + "learning_rate": 3.6412799254930686e-06, + "logits/chosen": -1.0092593431472778, + "logits/rejected": -0.9317359328269958, + "logps/chosen": -38.97923278808594, + "logps/rejected": -29.808849334716797, + "loss": 0.3382, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.253124952316284, + "rewards/margins": 0.11560654640197754, + "rewards/rejected": 2.1375184059143066, + "step": 7393 + }, + { + "epoch": 1.2, + "learning_rate": 3.640015174430864e-06, + "logits/chosen": -1.1934731006622314, + "logits/rejected": -1.1157065629959106, + "logps/chosen": -40.323543548583984, + "logps/rejected": -48.15105056762695, + "loss": 0.4109, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.285597324371338, + "rewards/margins": 0.008429288864135742, + "rewards/rejected": 2.277168035507202, + "step": 7394 + }, + { + "epoch": 1.2, + "learning_rate": 3.6387505173307424e-06, + "logits/chosen": -1.1409729719161987, + "logits/rejected": -1.1478465795516968, + "logps/chosen": -67.22490692138672, + "logps/rejected": -98.3642349243164, + "loss": 0.9951, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.594716787338257, + "rewards/margins": -1.840951681137085, + "rewards/rejected": 5.435668468475342, + "step": 7395 + }, + { + "epoch": 1.2, + "learning_rate": 3.63748595428008e-06, + "logits/chosen": -1.507784128189087, + "logits/rejected": -1.4373936653137207, + "logps/chosen": -144.41651916503906, + "logps/rejected": -146.53501892089844, + "loss": 1.0087, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.108360290527344, + "rewards/margins": -1.8641881942749023, + "rewards/rejected": 8.972548484802246, + "step": 7396 + }, + { + "epoch": 1.2, + "learning_rate": 3.636221485366245e-06, + "logits/chosen": -1.1774399280548096, + "logits/rejected": -1.0383657217025757, + "logps/chosen": -42.93694305419922, + "logps/rejected": -56.12447738647461, + "loss": 0.3128, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4408042430877686, + "rewards/margins": 0.2804415225982666, + "rewards/rejected": 2.160362720489502, + "step": 7397 + }, + { + "epoch": 1.2, + "learning_rate": 3.6349571106766023e-06, + "logits/chosen": -1.2046232223510742, + "logits/rejected": -1.2422261238098145, + "logps/chosen": -38.85841369628906, + "logps/rejected": -73.50300598144531, + "loss": 0.9849, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9447953701019287, + "rewards/margins": -0.50826096534729, + "rewards/rejected": 3.4530563354492188, + "step": 7398 + }, + { + "epoch": 1.2, + "learning_rate": 3.6336928302985063e-06, + "logits/chosen": -1.422851800918579, + "logits/rejected": -1.4797223806381226, + "logps/chosen": -117.76394653320312, + "logps/rejected": -84.48355865478516, + "loss": 0.8693, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.670843601226807, + "rewards/margins": -0.8933768272399902, + "rewards/rejected": 5.564220428466797, + "step": 7399 + }, + { + "epoch": 1.2, + "learning_rate": 3.632428644319308e-06, + "logits/chosen": -1.3600820302963257, + "logits/rejected": -1.3236773014068604, + "logps/chosen": -136.28665161132812, + "logps/rejected": -174.507080078125, + "loss": 0.1853, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.085746765136719, + "rewards/margins": 2.490090847015381, + "rewards/rejected": 3.595655918121338, + "step": 7400 + }, + { + "epoch": 1.2, + "learning_rate": 3.6311645528263507e-06, + "logits/chosen": -0.9581745266914368, + "logits/rejected": -0.9394515752792358, + "logps/chosen": -22.065841674804688, + "logps/rejected": -1.9624950885772705, + "loss": 0.5243, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4015359878540039, + "rewards/margins": 0.05709168314933777, + "rewards/rejected": 0.34444430470466614, + "step": 7401 + }, + { + "epoch": 1.2, + "learning_rate": 3.62990055590697e-06, + "logits/chosen": -0.8941853642463684, + "logits/rejected": -0.8911751508712769, + "logps/chosen": -1.6926630735397339, + "logps/rejected": -2.018951416015625, + "loss": 0.661, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3553219735622406, + "rewards/margins": 0.13300280272960663, + "rewards/rejected": 0.22231917083263397, + "step": 7402 + }, + { + "epoch": 1.2, + "learning_rate": 3.6286366536484975e-06, + "logits/chosen": -1.1933741569519043, + "logits/rejected": -1.1373934745788574, + "logps/chosen": -65.7331314086914, + "logps/rejected": -49.99967956542969, + "loss": 0.5817, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7798835635185242, + "rewards/margins": 0.17494350671768188, + "rewards/rejected": 0.6049400568008423, + "step": 7403 + }, + { + "epoch": 1.2, + "learning_rate": 3.627372846138256e-06, + "logits/chosen": -1.2398954629898071, + "logits/rejected": -1.2052137851715088, + "logps/chosen": -36.89820861816406, + "logps/rejected": -25.772174835205078, + "loss": 0.8029, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5636790990829468, + "rewards/margins": 0.6384398937225342, + "rewards/rejected": 0.9252392053604126, + "step": 7404 + }, + { + "epoch": 1.2, + "learning_rate": 3.6261091334635624e-06, + "logits/chosen": -1.3148472309112549, + "logits/rejected": -1.3394023180007935, + "logps/chosen": -44.20545959472656, + "logps/rejected": -51.1076774597168, + "loss": 0.1061, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.493968963623047, + "rewards/margins": 1.5266177654266357, + "rewards/rejected": 0.9673511385917664, + "step": 7405 + }, + { + "epoch": 1.2, + "learning_rate": 3.6248455157117284e-06, + "logits/chosen": -0.8336889743804932, + "logits/rejected": -0.920449435710907, + "logps/chosen": -63.35879898071289, + "logps/rejected": -104.1507797241211, + "loss": 0.8017, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0900081396102905, + "rewards/margins": -0.7184696197509766, + "rewards/rejected": 1.808477759361267, + "step": 7406 + }, + { + "epoch": 1.2, + "learning_rate": 3.623581992970058e-06, + "logits/chosen": -0.9260603189468384, + "logits/rejected": -0.8412250280380249, + "logps/chosen": -41.954830169677734, + "logps/rejected": -25.252649307250977, + "loss": 0.497, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.704521656036377, + "rewards/margins": 1.6530226469039917, + "rewards/rejected": 1.0514990091323853, + "step": 7407 + }, + { + "epoch": 1.2, + "learning_rate": 3.6223185653258467e-06, + "logits/chosen": -1.1400682926177979, + "logits/rejected": -1.1906906366348267, + "logps/chosen": -87.35692596435547, + "logps/rejected": -112.068603515625, + "loss": 0.2271, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5828118324279785, + "rewards/margins": 0.640723466873169, + "rewards/rejected": 1.9420883655548096, + "step": 7408 + }, + { + "epoch": 1.2, + "learning_rate": 3.6210552328663874e-06, + "logits/chosen": -0.7987571358680725, + "logits/rejected": -0.7987571358680725, + "logps/chosen": -3.0019795894622803, + "logps/rejected": -3.0019795894622803, + "loss": 0.3649, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.10857298225164413, + "rewards/margins": 0.0, + "rewards/rejected": 0.10857298225164413, + "step": 7409 + }, + { + "epoch": 1.2, + "learning_rate": 3.619791995678964e-06, + "logits/chosen": -1.3000614643096924, + "logits/rejected": -1.3615020513534546, + "logps/chosen": -66.25590515136719, + "logps/rejected": -120.20728302001953, + "loss": 1.017, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2794463634490967, + "rewards/margins": 0.40356969833374023, + "rewards/rejected": 1.8758766651153564, + "step": 7410 + }, + { + "epoch": 1.2, + "learning_rate": 3.6185288538508545e-06, + "logits/chosen": -1.3179004192352295, + "logits/rejected": -1.330678939819336, + "logps/chosen": -23.27486801147461, + "logps/rejected": -47.328392028808594, + "loss": 0.5312, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5476672649383545, + "rewards/margins": 0.16354012489318848, + "rewards/rejected": 3.384127140045166, + "step": 7411 + }, + { + "epoch": 1.2, + "learning_rate": 3.617265807469329e-06, + "logits/chosen": -1.6185280084609985, + "logits/rejected": -1.5966949462890625, + "logps/chosen": -96.09381103515625, + "logps/rejected": -27.919315338134766, + "loss": 0.2676, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3457809686660767, + "rewards/margins": 0.621569812297821, + "rewards/rejected": 0.7242111563682556, + "step": 7412 + }, + { + "epoch": 1.2, + "learning_rate": 3.6160028566216523e-06, + "logits/chosen": -1.2221105098724365, + "logits/rejected": -1.2212212085723877, + "logps/chosen": -29.53183364868164, + "logps/rejected": -33.578643798828125, + "loss": 0.7718, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1250760555267334, + "rewards/margins": 0.24103295803070068, + "rewards/rejected": 1.8840430974960327, + "step": 7413 + }, + { + "epoch": 1.2, + "learning_rate": 3.6147400013950833e-06, + "logits/chosen": -1.3184638023376465, + "logits/rejected": -1.3184638023376465, + "logps/chosen": -58.87958908081055, + "logps/rejected": -58.87958908081055, + "loss": 3.0694, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.486528396606445, + "rewards/margins": 0.0, + "rewards/rejected": 4.486528396606445, + "step": 7414 + }, + { + "epoch": 1.2, + "learning_rate": 3.6134772418768728e-06, + "logits/chosen": -1.0983071327209473, + "logits/rejected": -1.2641406059265137, + "logps/chosen": -52.18288803100586, + "logps/rejected": -40.286075592041016, + "loss": 0.5811, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.571868658065796, + "rewards/margins": -0.7832465171813965, + "rewards/rejected": 3.3551151752471924, + "step": 7415 + }, + { + "epoch": 1.2, + "learning_rate": 3.612214578154265e-06, + "logits/chosen": -1.1997096538543701, + "logits/rejected": -1.399288296699524, + "logps/chosen": -124.31256103515625, + "logps/rejected": -199.10476684570312, + "loss": 2.9337, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.399875164031982, + "rewards/margins": -3.04901123046875, + "rewards/rejected": 7.448886394500732, + "step": 7416 + }, + { + "epoch": 1.2, + "learning_rate": 3.610952010314499e-06, + "logits/chosen": -0.8307949900627136, + "logits/rejected": -0.8957027792930603, + "logps/chosen": -68.49919891357422, + "logps/rejected": -85.56211853027344, + "loss": 0.4232, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9898041486740112, + "rewards/margins": -0.26944267749786377, + "rewards/rejected": 2.259246826171875, + "step": 7417 + }, + { + "epoch": 1.2, + "learning_rate": 3.6096895384448057e-06, + "logits/chosen": -1.3369120359420776, + "logits/rejected": -1.2548770904541016, + "logps/chosen": -81.44607543945312, + "logps/rejected": -98.79898071289062, + "loss": 0.029, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.535406589508057, + "rewards/margins": 2.9683685302734375, + "rewards/rejected": 2.567038059234619, + "step": 7418 + }, + { + "epoch": 1.2, + "learning_rate": 3.6084271626324097e-06, + "logits/chosen": -1.529508352279663, + "logits/rejected": -1.42092764377594, + "logps/chosen": -62.58349609375, + "logps/rejected": -28.773456573486328, + "loss": 0.0616, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.601320743560791, + "rewards/margins": 2.1043701171875, + "rewards/rejected": 0.49695053696632385, + "step": 7419 + }, + { + "epoch": 1.2, + "learning_rate": 3.6071648829645302e-06, + "logits/chosen": -1.0764765739440918, + "logits/rejected": -1.0411863327026367, + "logps/chosen": -66.35791778564453, + "logps/rejected": -58.471832275390625, + "loss": 0.1515, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7558915615081787, + "rewards/margins": 1.1351350545883179, + "rewards/rejected": 1.6207565069198608, + "step": 7420 + }, + { + "epoch": 1.2, + "learning_rate": 3.6059026995283764e-06, + "logits/chosen": -1.3064525127410889, + "logits/rejected": -1.4911926984786987, + "logps/chosen": -50.473846435546875, + "logps/rejected": -131.56895446777344, + "loss": 2.1818, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.420461416244507, + "rewards/margins": -4.347250938415527, + "rewards/rejected": 6.767712593078613, + "step": 7421 + }, + { + "epoch": 1.2, + "learning_rate": 3.604640612411156e-06, + "logits/chosen": -1.4326086044311523, + "logits/rejected": -1.3609135150909424, + "logps/chosen": -38.58854675292969, + "logps/rejected": -20.48802375793457, + "loss": 0.1512, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.391714096069336, + "rewards/margins": 1.2338807582855225, + "rewards/rejected": 0.1578332930803299, + "step": 7422 + }, + { + "epoch": 1.2, + "learning_rate": 3.6033786217000667e-06, + "logits/chosen": -1.5035332441329956, + "logits/rejected": -1.4545546770095825, + "logps/chosen": -84.45762634277344, + "logps/rejected": -82.36672973632812, + "loss": 0.2932, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7463111877441406, + "rewards/margins": 0.8574744462966919, + "rewards/rejected": 1.8888367414474487, + "step": 7423 + }, + { + "epoch": 1.2, + "learning_rate": 3.6021167274822997e-06, + "logits/chosen": -1.3944227695465088, + "logits/rejected": -1.4414819478988647, + "logps/chosen": -101.57564544677734, + "logps/rejected": -80.4422607421875, + "loss": 1.368, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.045609474182129, + "rewards/margins": -2.6624550819396973, + "rewards/rejected": 6.708064556121826, + "step": 7424 + }, + { + "epoch": 1.21, + "learning_rate": 3.6008549298450403e-06, + "logits/chosen": -0.632361650466919, + "logits/rejected": -0.6114710569381714, + "logps/chosen": -32.24285125732422, + "logps/rejected": -52.073097229003906, + "loss": 0.9979, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.505976915359497, + "rewards/margins": 1.5603256225585938, + "rewards/rejected": -0.05434875562787056, + "step": 7425 + }, + { + "epoch": 1.21, + "learning_rate": 3.5995932288754655e-06, + "logits/chosen": -0.9883347749710083, + "logits/rejected": -1.041715383529663, + "logps/chosen": -64.81388854980469, + "logps/rejected": -49.524017333984375, + "loss": 1.1036, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5814841985702515, + "rewards/margins": -1.8810981512069702, + "rewards/rejected": 3.4625823497772217, + "step": 7426 + }, + { + "epoch": 1.21, + "learning_rate": 3.5983316246607482e-06, + "logits/chosen": -1.076802372932434, + "logits/rejected": -1.1503829956054688, + "logps/chosen": -53.033748626708984, + "logps/rejected": -99.1806411743164, + "loss": 0.468, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.756540298461914, + "rewards/margins": -0.4344959259033203, + "rewards/rejected": 3.1910362243652344, + "step": 7427 + }, + { + "epoch": 1.21, + "learning_rate": 3.5970701172880528e-06, + "logits/chosen": -1.501610517501831, + "logits/rejected": -1.4493329524993896, + "logps/chosen": -47.137516021728516, + "logps/rejected": -39.63368225097656, + "loss": 2.0839, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4723966121673584, + "rewards/margins": 1.487690806388855, + "rewards/rejected": 1.9847058057785034, + "step": 7428 + }, + { + "epoch": 1.21, + "learning_rate": 3.595808706844538e-06, + "logits/chosen": -1.11272394657135, + "logits/rejected": -1.114629864692688, + "logps/chosen": -62.63982009887695, + "logps/rejected": -52.122100830078125, + "loss": 0.1969, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4692630767822266, + "rewards/margins": 2.0857341289520264, + "rewards/rejected": 0.3835289180278778, + "step": 7429 + }, + { + "epoch": 1.21, + "learning_rate": 3.5945473934173547e-06, + "logits/chosen": -0.978693962097168, + "logits/rejected": -0.9999846816062927, + "logps/chosen": -55.019134521484375, + "logps/rejected": -56.178062438964844, + "loss": 0.741, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.8567795753479004, + "rewards/margins": -0.43979740142822266, + "rewards/rejected": 4.296576976776123, + "step": 7430 + }, + { + "epoch": 1.21, + "learning_rate": 3.5932861770936485e-06, + "logits/chosen": -1.1400762796401978, + "logits/rejected": -1.1400762796401978, + "logps/chosen": -31.344993591308594, + "logps/rejected": -31.344993591308594, + "loss": 0.394, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1312973499298096, + "rewards/margins": 0.0, + "rewards/rejected": 3.1312973499298096, + "step": 7431 + }, + { + "epoch": 1.21, + "learning_rate": 3.5920250579605564e-06, + "logits/chosen": -1.1327556371688843, + "logits/rejected": -1.1408371925354004, + "logps/chosen": -58.80878448486328, + "logps/rejected": -62.30649948120117, + "loss": 0.7083, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7193787097930908, + "rewards/margins": 0.20909619331359863, + "rewards/rejected": 1.5102825164794922, + "step": 7432 + }, + { + "epoch": 1.21, + "learning_rate": 3.5907640361052108e-06, + "logits/chosen": -1.049257755279541, + "logits/rejected": -1.0531460046768188, + "logps/chosen": -41.951934814453125, + "logps/rejected": -63.592281341552734, + "loss": 0.8159, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.141949415206909, + "rewards/margins": -1.361276626586914, + "rewards/rejected": 3.5032260417938232, + "step": 7433 + }, + { + "epoch": 1.21, + "learning_rate": 3.5895031116147355e-06, + "logits/chosen": -1.3128031492233276, + "logits/rejected": -1.3465075492858887, + "logps/chosen": -203.046142578125, + "logps/rejected": -87.46261596679688, + "loss": 0.4847, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.744259834289551, + "rewards/margins": -0.3547377586364746, + "rewards/rejected": 7.098997592926025, + "step": 7434 + }, + { + "epoch": 1.21, + "learning_rate": 3.5882422845762495e-06, + "logits/chosen": -1.6161856651306152, + "logits/rejected": -1.6427268981933594, + "logps/chosen": -92.22460174560547, + "logps/rejected": -109.12112426757812, + "loss": 0.7403, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.130225419998169, + "rewards/margins": -1.1325981616973877, + "rewards/rejected": 3.2628235816955566, + "step": 7435 + }, + { + "epoch": 1.21, + "learning_rate": 3.5869815550768626e-06, + "logits/chosen": -1.4201271533966064, + "logits/rejected": -1.4676978588104248, + "logps/chosen": -92.16917419433594, + "logps/rejected": -103.9178466796875, + "loss": 0.7876, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.247264385223389, + "rewards/margins": -1.1384243965148926, + "rewards/rejected": 8.385688781738281, + "step": 7436 + }, + { + "epoch": 1.21, + "learning_rate": 3.585720923203682e-06, + "logits/chosen": -1.1538335084915161, + "logits/rejected": -1.1415902376174927, + "logps/chosen": -101.18804931640625, + "logps/rejected": -162.43202209472656, + "loss": 1.6691, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.861422538757324, + "rewards/margins": 1.1556406021118164, + "rewards/rejected": 8.705781936645508, + "step": 7437 + }, + { + "epoch": 1.21, + "learning_rate": 3.5844603890438013e-06, + "logits/chosen": -1.248957633972168, + "logits/rejected": -1.248957633972168, + "logps/chosen": -51.489097595214844, + "logps/rejected": -51.489097595214844, + "loss": 0.3474, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8146041631698608, + "rewards/margins": 0.0, + "rewards/rejected": 1.8146041631698608, + "step": 7438 + }, + { + "epoch": 1.21, + "learning_rate": 3.5831999526843155e-06, + "logits/chosen": -1.682236671447754, + "logits/rejected": -1.6696339845657349, + "logps/chosen": -70.2319564819336, + "logps/rejected": -157.52896118164062, + "loss": 0.5744, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.239823341369629, + "rewards/margins": -0.5651326179504395, + "rewards/rejected": 7.804955959320068, + "step": 7439 + }, + { + "epoch": 1.21, + "learning_rate": 3.5819396142123066e-06, + "logits/chosen": -1.2557437419891357, + "logits/rejected": -1.1765599250793457, + "logps/chosen": -99.48722839355469, + "logps/rejected": -52.730648040771484, + "loss": 0.3789, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4642350673675537, + "rewards/margins": 0.38631343841552734, + "rewards/rejected": 3.0779216289520264, + "step": 7440 + }, + { + "epoch": 1.21, + "learning_rate": 3.5806793737148517e-06, + "logits/chosen": -1.009955644607544, + "logits/rejected": -0.7755861282348633, + "logps/chosen": -47.81840515136719, + "logps/rejected": -120.65369415283203, + "loss": 1.0174, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0115983486175537, + "rewards/margins": -1.2837927341461182, + "rewards/rejected": 3.295391082763672, + "step": 7441 + }, + { + "epoch": 1.21, + "learning_rate": 3.579419231279023e-06, + "logits/chosen": -1.5739598274230957, + "logits/rejected": -1.6057453155517578, + "logps/chosen": -92.1982421875, + "logps/rejected": -35.149070739746094, + "loss": 0.5733, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.795971632003784, + "rewards/margins": 0.27963733673095703, + "rewards/rejected": 2.516334295272827, + "step": 7442 + }, + { + "epoch": 1.21, + "learning_rate": 3.5781591869918843e-06, + "logits/chosen": -1.4136806726455688, + "logits/rejected": -1.5302752256393433, + "logps/chosen": -114.60502624511719, + "logps/rejected": -201.44085693359375, + "loss": 1.4908, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.204963684082031, + "rewards/margins": 0.3243088722229004, + "rewards/rejected": 7.880654811859131, + "step": 7443 + }, + { + "epoch": 1.21, + "learning_rate": 3.5768992409404914e-06, + "logits/chosen": -1.2067368030548096, + "logits/rejected": -1.2066807746887207, + "logps/chosen": -68.9541015625, + "logps/rejected": -95.10295867919922, + "loss": 0.8126, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2019448280334473, + "rewards/margins": -1.078974962234497, + "rewards/rejected": 3.2809197902679443, + "step": 7444 + }, + { + "epoch": 1.21, + "learning_rate": 3.5756393932118956e-06, + "logits/chosen": -0.8958643078804016, + "logits/rejected": -0.8942221403121948, + "logps/chosen": -1.4222850799560547, + "logps/rejected": -1.446111798286438, + "loss": 0.4607, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4471755027770996, + "rewards/margins": 0.018959134817123413, + "rewards/rejected": 0.4282163679599762, + "step": 7445 + }, + { + "epoch": 1.21, + "learning_rate": 3.57437964389314e-06, + "logits/chosen": -1.0883203744888306, + "logits/rejected": -1.135382056236267, + "logps/chosen": -2.1809353828430176, + "logps/rejected": -26.95846939086914, + "loss": 0.8493, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5086603164672852, + "rewards/margins": -0.6959478855133057, + "rewards/rejected": 1.2046082019805908, + "step": 7446 + }, + { + "epoch": 1.21, + "learning_rate": 3.5731199930712623e-06, + "logits/chosen": -1.2470368146896362, + "logits/rejected": -1.2583194971084595, + "logps/chosen": -55.71192169189453, + "logps/rejected": -46.9253044128418, + "loss": 0.6624, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.235565185546875, + "rewards/margins": 0.6498775482177734, + "rewards/rejected": 1.5856876373291016, + "step": 7447 + }, + { + "epoch": 1.21, + "learning_rate": 3.5718604408332912e-06, + "logits/chosen": -1.2146024703979492, + "logits/rejected": -1.2146024703979492, + "logps/chosen": -43.374969482421875, + "logps/rejected": -43.374969482421875, + "loss": 0.3516, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1356232166290283, + "rewards/margins": 0.0, + "rewards/rejected": 2.1356232166290283, + "step": 7448 + }, + { + "epoch": 1.21, + "learning_rate": 3.5706009872662505e-06, + "logits/chosen": -1.068790316581726, + "logits/rejected": -1.035827398300171, + "logps/chosen": -21.419105529785156, + "logps/rejected": -5.029581069946289, + "loss": 1.2391, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7966314554214478, + "rewards/margins": 0.2877480387687683, + "rewards/rejected": 0.5088834166526794, + "step": 7449 + }, + { + "epoch": 1.21, + "learning_rate": 3.5693416324571573e-06, + "logits/chosen": -1.1464800834655762, + "logits/rejected": -1.085754632949829, + "logps/chosen": -79.09266662597656, + "logps/rejected": -39.19721984863281, + "loss": 0.7253, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5370864868164062, + "rewards/margins": -0.45551300048828125, + "rewards/rejected": 1.9925994873046875, + "step": 7450 + }, + { + "epoch": 1.21, + "learning_rate": 3.5680823764930196e-06, + "logits/chosen": -1.2102657556533813, + "logits/rejected": -1.136974811553955, + "logps/chosen": -84.4148178100586, + "logps/rejected": -16.064851760864258, + "loss": 0.1754, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7159889936447144, + "rewards/margins": 0.9166530966758728, + "rewards/rejected": 0.7993358969688416, + "step": 7451 + }, + { + "epoch": 1.21, + "learning_rate": 3.566823219460841e-06, + "logits/chosen": -1.1137984991073608, + "logits/rejected": -1.1354649066925049, + "logps/chosen": -46.39942169189453, + "logps/rejected": -45.04574203491211, + "loss": 0.0846, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0288429260253906, + "rewards/margins": 1.95185124874115, + "rewards/rejected": 0.07699165493249893, + "step": 7452 + }, + { + "epoch": 1.21, + "learning_rate": 3.5655641614476172e-06, + "logits/chosen": -1.0836811065673828, + "logits/rejected": -1.2096731662750244, + "logps/chosen": -83.64213562011719, + "logps/rejected": -111.1237564086914, + "loss": 1.622, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4675254821777344, + "rewards/margins": -2.208423137664795, + "rewards/rejected": 5.675948619842529, + "step": 7453 + }, + { + "epoch": 1.21, + "learning_rate": 3.5643052025403366e-06, + "logits/chosen": -1.3232197761535645, + "logits/rejected": -1.2560527324676514, + "logps/chosen": -52.66563415527344, + "logps/rejected": -35.22643280029297, + "loss": 0.4184, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1334304809570312, + "rewards/margins": 0.20773804187774658, + "rewards/rejected": 1.9256924390792847, + "step": 7454 + }, + { + "epoch": 1.21, + "learning_rate": 3.563046342825982e-06, + "logits/chosen": -0.9221603870391846, + "logits/rejected": -0.9231242537498474, + "logps/chosen": -3.439493417739868, + "logps/rejected": -8.982001304626465, + "loss": 0.8599, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4074023365974426, + "rewards/margins": -0.12230998277664185, + "rewards/rejected": 0.5297123193740845, + "step": 7455 + }, + { + "epoch": 1.21, + "learning_rate": 3.5617875823915295e-06, + "logits/chosen": -0.8938481211662292, + "logits/rejected": -0.9284549355506897, + "logps/chosen": -12.232789039611816, + "logps/rejected": -48.2284049987793, + "loss": 1.1234, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.10208997875452042, + "rewards/margins": -1.9367395639419556, + "rewards/rejected": 2.0388295650482178, + "step": 7456 + }, + { + "epoch": 1.21, + "learning_rate": 3.560528921323944e-06, + "logits/chosen": -1.8604048490524292, + "logits/rejected": -1.7443853616714478, + "logps/chosen": -79.744384765625, + "logps/rejected": -77.021484375, + "loss": 0.0713, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.120841979980469, + "rewards/margins": 1.8828086853027344, + "rewards/rejected": 4.238033294677734, + "step": 7457 + }, + { + "epoch": 1.21, + "learning_rate": 3.559270359710192e-06, + "logits/chosen": -1.3011395931243896, + "logits/rejected": -1.2156825065612793, + "logps/chosen": -62.73093032836914, + "logps/rejected": -29.677139282226562, + "loss": 0.9066, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.086108922958374, + "rewards/margins": -1.3282318115234375, + "rewards/rejected": 3.4143407344818115, + "step": 7458 + }, + { + "epoch": 1.21, + "learning_rate": 3.5580118976372245e-06, + "logits/chosen": -1.1946978569030762, + "logits/rejected": -1.1903328895568848, + "logps/chosen": -9.896952629089355, + "logps/rejected": -5.898958206176758, + "loss": 0.7077, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7492424845695496, + "rewards/margins": 0.3330988585948944, + "rewards/rejected": 0.41614362597465515, + "step": 7459 + }, + { + "epoch": 1.21, + "learning_rate": 3.556753535191991e-06, + "logits/chosen": -1.096648931503296, + "logits/rejected": -1.0283418893814087, + "logps/chosen": -55.85740661621094, + "logps/rejected": -51.1075439453125, + "loss": 0.3453, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2966049909591675, + "rewards/margins": 0.009843826293945312, + "rewards/rejected": 1.2867611646652222, + "step": 7460 + }, + { + "epoch": 1.21, + "learning_rate": 3.5554952724614334e-06, + "logits/chosen": -1.0746917724609375, + "logits/rejected": -1.000886082649231, + "logps/chosen": -105.73871612548828, + "logps/rejected": -73.26818084716797, + "loss": 0.1131, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.190558910369873, + "rewards/margins": 1.3717782497406006, + "rewards/rejected": 3.8187806606292725, + "step": 7461 + }, + { + "epoch": 1.21, + "learning_rate": 3.5542371095324835e-06, + "logits/chosen": -1.1909946203231812, + "logits/rejected": -1.172327995300293, + "logps/chosen": -49.364986419677734, + "logps/rejected": -31.690200805664062, + "loss": 0.7366, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0525600910186768, + "rewards/margins": -0.635369062423706, + "rewards/rejected": 2.687929153442383, + "step": 7462 + }, + { + "epoch": 1.21, + "learning_rate": 3.5529790464920687e-06, + "logits/chosen": -1.5906167030334473, + "logits/rejected": -1.5087332725524902, + "logps/chosen": -100.35137939453125, + "logps/rejected": -150.82174682617188, + "loss": 0.3736, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.449462890625, + "rewards/margins": -0.10367274284362793, + "rewards/rejected": 3.553135633468628, + "step": 7463 + }, + { + "epoch": 1.21, + "learning_rate": 3.551721083427111e-06, + "logits/chosen": -1.42664635181427, + "logits/rejected": -1.100113868713379, + "logps/chosen": -111.03346252441406, + "logps/rejected": -90.11288452148438, + "loss": 0.5009, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.719547986984253, + "rewards/margins": 1.0190246105194092, + "rewards/rejected": 2.7005233764648438, + "step": 7464 + }, + { + "epoch": 1.21, + "learning_rate": 3.5504632204245225e-06, + "logits/chosen": -1.1778374910354614, + "logits/rejected": -1.1460087299346924, + "logps/chosen": -61.144248962402344, + "logps/rejected": -70.50234985351562, + "loss": 0.8431, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6727463006973267, + "rewards/margins": -0.5521622896194458, + "rewards/rejected": 2.2249085903167725, + "step": 7465 + }, + { + "epoch": 1.21, + "learning_rate": 3.5492054575712094e-06, + "logits/chosen": -1.369314193725586, + "logits/rejected": -1.2291728258132935, + "logps/chosen": -93.31578826904297, + "logps/rejected": -82.17035675048828, + "loss": 0.0804, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.361908912658691, + "rewards/margins": 1.7482638359069824, + "rewards/rejected": 5.613645076751709, + "step": 7466 + }, + { + "epoch": 1.21, + "learning_rate": 3.547947794954074e-06, + "logits/chosen": -0.9507116079330444, + "logits/rejected": -0.9443817138671875, + "logps/chosen": -1.6676592826843262, + "logps/rejected": -1.6139097213745117, + "loss": 1.2517, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2771589457988739, + "rewards/margins": 0.12211231887340546, + "rewards/rejected": 0.15504662692546844, + "step": 7467 + }, + { + "epoch": 1.21, + "learning_rate": 3.5466902326600043e-06, + "logits/chosen": -0.8063529133796692, + "logits/rejected": -0.8119392395019531, + "logps/chosen": -7.116447925567627, + "logps/rejected": -3.468670606613159, + "loss": 0.7549, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.16709952056407928, + "rewards/margins": -0.23330055177211761, + "rewards/rejected": 0.4004000723361969, + "step": 7468 + }, + { + "epoch": 1.21, + "learning_rate": 3.5454327707758886e-06, + "logits/chosen": -1.2595267295837402, + "logits/rejected": -1.2213592529296875, + "logps/chosen": -80.67008972167969, + "logps/rejected": -76.2438735961914, + "loss": 0.7034, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3310836553573608, + "rewards/margins": -1.0223993062973022, + "rewards/rejected": 2.353482961654663, + "step": 7469 + }, + { + "epoch": 1.21, + "learning_rate": 3.544175409388605e-06, + "logits/chosen": -1.1868236064910889, + "logits/rejected": -1.2108670473098755, + "logps/chosen": -94.49409484863281, + "logps/rejected": -79.93266296386719, + "loss": 0.3524, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8689935207366943, + "rewards/margins": 0.042395830154418945, + "rewards/rejected": 2.8265976905822754, + "step": 7470 + }, + { + "epoch": 1.21, + "learning_rate": 3.542918148585025e-06, + "logits/chosen": -1.2514632940292358, + "logits/rejected": -1.2708368301391602, + "logps/chosen": -53.78104782104492, + "logps/rejected": -55.27215576171875, + "loss": 1.3768, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.468300223350525, + "rewards/margins": -1.5986112356185913, + "rewards/rejected": 3.066911458969116, + "step": 7471 + }, + { + "epoch": 1.21, + "learning_rate": 3.5416609884520143e-06, + "logits/chosen": -1.3736977577209473, + "logits/rejected": -1.3096401691436768, + "logps/chosen": -48.72441864013672, + "logps/rejected": -54.501869201660156, + "loss": 0.5057, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2921624183654785, + "rewards/margins": -0.20013189315795898, + "rewards/rejected": 2.4922943115234375, + "step": 7472 + }, + { + "epoch": 1.21, + "learning_rate": 3.54040392907643e-06, + "logits/chosen": -1.0470877885818481, + "logits/rejected": -1.0470877885818481, + "logps/chosen": -48.92411804199219, + "logps/rejected": -48.92411804199219, + "loss": 0.5761, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.002497911453247, + "rewards/margins": 0.0, + "rewards/rejected": 3.002497911453247, + "step": 7473 + }, + { + "epoch": 1.21, + "learning_rate": 3.539146970545124e-06, + "logits/chosen": -1.36347496509552, + "logits/rejected": -1.3311636447906494, + "logps/chosen": -66.76998138427734, + "logps/rejected": -79.20960998535156, + "loss": 0.1335, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9222869873046875, + "rewards/margins": 1.1845946311950684, + "rewards/rejected": 0.7376922965049744, + "step": 7474 + }, + { + "epoch": 1.21, + "learning_rate": 3.5378901129449395e-06, + "logits/chosen": -1.2929683923721313, + "logits/rejected": -1.2614150047302246, + "logps/chosen": -85.21372985839844, + "logps/rejected": -59.0250244140625, + "loss": 0.7286, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.407914876937866, + "rewards/margins": 2.3988282680511475, + "rewards/rejected": 1.0090866088867188, + "step": 7475 + }, + { + "epoch": 1.21, + "learning_rate": 3.536633356362713e-06, + "logits/chosen": -0.9608421325683594, + "logits/rejected": -0.9608421325683594, + "logps/chosen": -1.9674255847930908, + "logps/rejected": -1.9674255847930908, + "loss": 0.5691, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.36406877636909485, + "rewards/margins": 0.0, + "rewards/rejected": 0.36406877636909485, + "step": 7476 + }, + { + "epoch": 1.21, + "learning_rate": 3.5353767008852746e-06, + "logits/chosen": -1.0847859382629395, + "logits/rejected": -0.9928660988807678, + "logps/chosen": -155.56712341308594, + "logps/rejected": -22.811250686645508, + "loss": 0.502, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.912593364715576, + "rewards/margins": 4.733013153076172, + "rewards/rejected": 0.17958031594753265, + "step": 7477 + }, + { + "epoch": 1.21, + "learning_rate": 3.5341201465994484e-06, + "logits/chosen": -1.1705429553985596, + "logits/rejected": -1.1866028308868408, + "logps/chosen": -53.56272888183594, + "logps/rejected": -78.12197875976562, + "loss": 0.3332, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2379074096679688, + "rewards/margins": 0.1751112937927246, + "rewards/rejected": 2.062796115875244, + "step": 7478 + }, + { + "epoch": 1.21, + "learning_rate": 3.53286369359205e-06, + "logits/chosen": -1.4934526681900024, + "logits/rejected": -1.274945855140686, + "logps/chosen": -165.74908447265625, + "logps/rejected": -47.550899505615234, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.825189113616943, + "rewards/margins": 4.695846080780029, + "rewards/rejected": 2.129343032836914, + "step": 7479 + }, + { + "epoch": 1.21, + "learning_rate": 3.5316073419498886e-06, + "logits/chosen": -0.7946586608886719, + "logits/rejected": -0.767666757106781, + "logps/chosen": -66.34202575683594, + "logps/rejected": -48.612022399902344, + "loss": 0.4168, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5465316772460938, + "rewards/margins": 2.2201294898986816, + "rewards/rejected": 1.3264023065567017, + "step": 7480 + }, + { + "epoch": 1.21, + "learning_rate": 3.530351091759765e-06, + "logits/chosen": -1.4646148681640625, + "logits/rejected": -1.4574638605117798, + "logps/chosen": -63.12346267700195, + "logps/rejected": -39.9974250793457, + "loss": 0.3924, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1720547676086426, + "rewards/margins": 0.6471009254455566, + "rewards/rejected": 1.524953842163086, + "step": 7481 + }, + { + "epoch": 1.21, + "learning_rate": 3.5290949431084755e-06, + "logits/chosen": -0.584856390953064, + "logits/rejected": -0.5781014561653137, + "logps/chosen": -15.974942207336426, + "logps/rejected": -42.87466812133789, + "loss": 0.8051, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2755213677883148, + "rewards/margins": -0.9863221645355225, + "rewards/rejected": 1.2618435621261597, + "step": 7482 + }, + { + "epoch": 1.21, + "learning_rate": 3.527838896082808e-06, + "logits/chosen": -0.9317939281463623, + "logits/rejected": -0.9739459156990051, + "logps/chosen": -39.22065353393555, + "logps/rejected": -102.42861938476562, + "loss": 1.6751, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.354186773300171, + "rewards/margins": -3.172966241836548, + "rewards/rejected": 5.527153015136719, + "step": 7483 + }, + { + "epoch": 1.21, + "learning_rate": 3.5265829507695426e-06, + "logits/chosen": -1.269643783569336, + "logits/rejected": -0.9365056753158569, + "logps/chosen": -122.09552001953125, + "logps/rejected": -28.148597717285156, + "loss": 0.0424, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.9199113845825195, + "rewards/margins": 5.513078212738037, + "rewards/rejected": 1.406833291053772, + "step": 7484 + }, + { + "epoch": 1.21, + "learning_rate": 3.5253271072554534e-06, + "logits/chosen": -1.2952855825424194, + "logits/rejected": -1.237334132194519, + "logps/chosen": -35.37035369873047, + "logps/rejected": -30.405874252319336, + "loss": 0.7339, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3445675373077393, + "rewards/margins": 0.5341348648071289, + "rewards/rejected": 1.8104326725006104, + "step": 7485 + }, + { + "epoch": 1.22, + "learning_rate": 3.524071365627308e-06, + "logits/chosen": -1.0814613103866577, + "logits/rejected": -1.0806138515472412, + "logps/chosen": -17.50733184814453, + "logps/rejected": -34.20408248901367, + "loss": 0.3436, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5011274218559265, + "rewards/margins": 0.48514193296432495, + "rewards/rejected": 0.015985488891601562, + "step": 7486 + }, + { + "epoch": 1.22, + "learning_rate": 3.5228157259718654e-06, + "logits/chosen": -1.3759677410125732, + "logits/rejected": -1.2725783586502075, + "logps/chosen": -86.392578125, + "logps/rejected": -23.737462997436523, + "loss": 0.1116, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.820787191390991, + "rewards/margins": 1.5535963773727417, + "rewards/rejected": 1.2671908140182495, + "step": 7487 + }, + { + "epoch": 1.22, + "learning_rate": 3.52156018837588e-06, + "logits/chosen": -1.2945724725723267, + "logits/rejected": -1.3161404132843018, + "logps/chosen": -62.694183349609375, + "logps/rejected": -66.49273681640625, + "loss": 0.4405, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.063332557678223, + "rewards/margins": 1.9277307987213135, + "rewards/rejected": 2.135601758956909, + "step": 7488 + }, + { + "epoch": 1.22, + "learning_rate": 3.5203047529260953e-06, + "logits/chosen": -1.2565768957138062, + "logits/rejected": -1.231033444404602, + "logps/chosen": -90.2315673828125, + "logps/rejected": -65.73453521728516, + "loss": 2.8976, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0421531200408936, + "rewards/margins": 1.5732321739196777, + "rewards/rejected": 1.4689209461212158, + "step": 7489 + }, + { + "epoch": 1.22, + "learning_rate": 3.519049419709252e-06, + "logits/chosen": -0.9096065163612366, + "logits/rejected": -0.865386962890625, + "logps/chosen": -46.1162109375, + "logps/rejected": -14.284357070922852, + "loss": 0.4646, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5807060599327087, + "rewards/margins": -0.4263682961463928, + "rewards/rejected": 1.0070743560791016, + "step": 7490 + }, + { + "epoch": 1.22, + "learning_rate": 3.5177941888120802e-06, + "logits/chosen": -1.2376223802566528, + "logits/rejected": -1.0749249458312988, + "logps/chosen": -73.1670913696289, + "logps/rejected": -84.37957763671875, + "loss": 0.8487, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5756967067718506, + "rewards/margins": -0.8929846286773682, + "rewards/rejected": 3.4686813354492188, + "step": 7491 + }, + { + "epoch": 1.22, + "learning_rate": 3.516539060321306e-06, + "logits/chosen": -1.1059314012527466, + "logits/rejected": -1.0564255714416504, + "logps/chosen": -52.9683837890625, + "logps/rejected": -44.283958435058594, + "loss": 0.4039, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6446059942245483, + "rewards/margins": 0.3638312816619873, + "rewards/rejected": 1.280774712562561, + "step": 7492 + }, + { + "epoch": 1.22, + "learning_rate": 3.515284034323645e-06, + "logits/chosen": -1.0648106336593628, + "logits/rejected": -1.0950087308883667, + "logps/chosen": -51.351837158203125, + "logps/rejected": -91.36566162109375, + "loss": 0.4064, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.190708875656128, + "rewards/margins": -0.21485137939453125, + "rewards/rejected": 2.405560255050659, + "step": 7493 + }, + { + "epoch": 1.22, + "learning_rate": 3.514029110905809e-06, + "logits/chosen": -1.059362769126892, + "logits/rejected": -1.0538135766983032, + "logps/chosen": -41.95856475830078, + "logps/rejected": -62.805381774902344, + "loss": 1.5404, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.785538911819458, + "rewards/margins": -0.5192379951477051, + "rewards/rejected": 2.304776906967163, + "step": 7494 + }, + { + "epoch": 1.22, + "learning_rate": 3.5127742901545015e-06, + "logits/chosen": -0.6191580295562744, + "logits/rejected": -0.6191580295562744, + "logps/chosen": -45.18833541870117, + "logps/rejected": -45.18833541870117, + "loss": 0.4907, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1355549097061157, + "rewards/margins": 0.0, + "rewards/rejected": 1.1355549097061157, + "step": 7495 + }, + { + "epoch": 1.22, + "learning_rate": 3.511519572156418e-06, + "logits/chosen": -1.333808422088623, + "logits/rejected": -1.444196343421936, + "logps/chosen": -119.65592956542969, + "logps/rejected": -89.52281188964844, + "loss": 2.0044, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4870927333831787, + "rewards/margins": -3.811267137527466, + "rewards/rejected": 7.2983598709106445, + "step": 7496 + }, + { + "epoch": 1.22, + "learning_rate": 3.5102649569982483e-06, + "logits/chosen": -1.041312575340271, + "logits/rejected": -1.041312575340271, + "logps/chosen": -8.300339698791504, + "logps/rejected": -8.300339698791504, + "loss": 0.5621, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9726641774177551, + "rewards/margins": 0.0, + "rewards/rejected": 0.9726641774177551, + "step": 7497 + }, + { + "epoch": 1.22, + "learning_rate": 3.509010444766674e-06, + "logits/chosen": -1.7495301961898804, + "logits/rejected": -1.749436616897583, + "logps/chosen": -94.39524841308594, + "logps/rejected": -89.26106262207031, + "loss": 1.0981, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.2457594871521, + "rewards/margins": -1.4559249877929688, + "rewards/rejected": 7.701684474945068, + "step": 7498 + }, + { + "epoch": 1.22, + "learning_rate": 3.50775603554837e-06, + "logits/chosen": -1.199394941329956, + "logits/rejected": -1.2781802415847778, + "logps/chosen": -43.32706832885742, + "logps/rejected": -72.74267578125, + "loss": 1.5565, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8395602703094482, + "rewards/margins": -2.973271131515503, + "rewards/rejected": 5.812831401824951, + "step": 7499 + }, + { + "epoch": 1.22, + "learning_rate": 3.506501729430004e-06, + "logits/chosen": -1.2859458923339844, + "logits/rejected": -1.257780909538269, + "logps/chosen": -68.8764877319336, + "logps/rejected": -43.16022872924805, + "loss": 0.2833, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7220925092697144, + "rewards/margins": 0.48631179332733154, + "rewards/rejected": 1.2357807159423828, + "step": 7500 + }, + { + "epoch": 1.22, + "learning_rate": 3.505247526498237e-06, + "logits/chosen": -1.2368860244750977, + "logits/rejected": -1.040059208869934, + "logps/chosen": -78.93861389160156, + "logps/rejected": -49.10551834106445, + "loss": 0.0579, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.752143859863281, + "rewards/margins": 6.031066417694092, + "rewards/rejected": 0.7210773825645447, + "step": 7501 + }, + { + "epoch": 1.22, + "learning_rate": 3.5039934268397225e-06, + "logits/chosen": -0.6772788763046265, + "logits/rejected": -0.6745952367782593, + "logps/chosen": -2.27772855758667, + "logps/rejected": -6.202658653259277, + "loss": 0.698, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4184597432613373, + "rewards/margins": -0.019475996494293213, + "rewards/rejected": 0.4379357397556305, + "step": 7502 + }, + { + "epoch": 1.22, + "learning_rate": 3.5027394305411067e-06, + "logits/chosen": -1.0934154987335205, + "logits/rejected": -1.0208832025527954, + "logps/chosen": -25.121536254882812, + "logps/rejected": -5.5925421714782715, + "loss": 0.1794, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2504104375839233, + "rewards/margins": 0.8608918190002441, + "rewards/rejected": 0.3895185887813568, + "step": 7503 + }, + { + "epoch": 1.22, + "learning_rate": 3.501485537689029e-06, + "logits/chosen": -1.2650014162063599, + "logits/rejected": -1.1716439723968506, + "logps/chosen": -90.79412841796875, + "logps/rejected": -36.37503433227539, + "loss": 0.1904, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1510169506073, + "rewards/margins": 3.219874858856201, + "rewards/rejected": -0.06885796040296555, + "step": 7504 + }, + { + "epoch": 1.22, + "learning_rate": 3.500231748370122e-06, + "logits/chosen": -1.4210790395736694, + "logits/rejected": -1.4628962278366089, + "logps/chosen": -84.99613952636719, + "logps/rejected": -75.93596649169922, + "loss": 0.362, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.2578935623168945, + "rewards/margins": -0.03553056716918945, + "rewards/rejected": 5.293424129486084, + "step": 7505 + }, + { + "epoch": 1.22, + "learning_rate": 3.4989780626710103e-06, + "logits/chosen": -1.1341413259506226, + "logits/rejected": -1.0817922353744507, + "logps/chosen": -114.92550659179688, + "logps/rejected": -50.936466217041016, + "loss": 0.2146, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3821747303009033, + "rewards/margins": 0.7489845752716064, + "rewards/rejected": 0.6331901550292969, + "step": 7506 + }, + { + "epoch": 1.22, + "learning_rate": 3.4977244806783106e-06, + "logits/chosen": -1.4371888637542725, + "logits/rejected": -1.028390645980835, + "logps/chosen": -125.70524597167969, + "logps/rejected": -34.127010345458984, + "loss": 0.2992, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.330806255340576, + "rewards/margins": 5.199280261993408, + "rewards/rejected": 0.1315261870622635, + "step": 7507 + }, + { + "epoch": 1.22, + "learning_rate": 3.4964710024786354e-06, + "logits/chosen": -1.5451644659042358, + "logits/rejected": -1.5122029781341553, + "logps/chosen": -50.75121307373047, + "logps/rejected": -70.39913177490234, + "loss": 0.3377, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2757034301757812, + "rewards/margins": 0.05486750602722168, + "rewards/rejected": 3.2208359241485596, + "step": 7508 + }, + { + "epoch": 1.22, + "learning_rate": 3.4952176281585874e-06, + "logits/chosen": -1.2397319078445435, + "logits/rejected": -1.1947466135025024, + "logps/chosen": -73.94317626953125, + "logps/rejected": -96.11617279052734, + "loss": 0.068, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2596702575683594, + "rewards/margins": 2.1132254600524902, + "rewards/rejected": 1.1464446783065796, + "step": 7509 + }, + { + "epoch": 1.22, + "learning_rate": 3.493964357804763e-06, + "logits/chosen": -1.3520596027374268, + "logits/rejected": -1.1722124814987183, + "logps/chosen": -101.0517807006836, + "logps/rejected": -48.15989303588867, + "loss": 0.237, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.356705665588379, + "rewards/margins": 0.9606161117553711, + "rewards/rejected": 3.396089553833008, + "step": 7510 + }, + { + "epoch": 1.22, + "learning_rate": 3.4927111915037513e-06, + "logits/chosen": -1.6595553159713745, + "logits/rejected": -1.607477068901062, + "logps/chosen": -101.1016616821289, + "logps/rejected": -25.2313175201416, + "loss": 1.353, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8485374450683594, + "rewards/margins": 1.071232795715332, + "rewards/rejected": 1.7773046493530273, + "step": 7511 + }, + { + "epoch": 1.22, + "learning_rate": 3.4914581293421335e-06, + "logits/chosen": -1.727838158607483, + "logits/rejected": -1.7373955249786377, + "logps/chosen": -68.18041229248047, + "logps/rejected": -84.92085266113281, + "loss": 0.2093, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.082118272781372, + "rewards/margins": 0.759636640548706, + "rewards/rejected": 2.322481632232666, + "step": 7512 + }, + { + "epoch": 1.22, + "learning_rate": 3.490205171406484e-06, + "logits/chosen": -1.3959298133850098, + "logits/rejected": -1.3438032865524292, + "logps/chosen": -50.94679260253906, + "logps/rejected": -19.773012161254883, + "loss": 0.0774, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9105132818222046, + "rewards/margins": 1.935535192489624, + "rewards/rejected": -0.025021934881806374, + "step": 7513 + }, + { + "epoch": 1.22, + "learning_rate": 3.488952317783374e-06, + "logits/chosen": -0.9378186464309692, + "logits/rejected": -0.8285288214683533, + "logps/chosen": -210.78848266601562, + "logps/rejected": -44.70823669433594, + "loss": 0.0228, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.166354656219482, + "rewards/margins": 4.168278694152832, + "rewards/rejected": 1.9980758428573608, + "step": 7514 + }, + { + "epoch": 1.22, + "learning_rate": 3.4876995685593596e-06, + "logits/chosen": -1.0569928884506226, + "logits/rejected": -1.0223952531814575, + "logps/chosen": -87.29707336425781, + "logps/rejected": -75.83663177490234, + "loss": 0.1977, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2088706493377686, + "rewards/margins": 2.087170362472534, + "rewards/rejected": 1.1217002868652344, + "step": 7515 + }, + { + "epoch": 1.22, + "learning_rate": 3.486446923820996e-06, + "logits/chosen": -1.4908497333526611, + "logits/rejected": -1.2551524639129639, + "logps/chosen": -104.66099548339844, + "logps/rejected": -69.88040924072266, + "loss": 0.0279, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.799066066741943, + "rewards/margins": 4.225644588470459, + "rewards/rejected": 2.5734214782714844, + "step": 7516 + }, + { + "epoch": 1.22, + "learning_rate": 3.4851943836548286e-06, + "logits/chosen": -1.2899125814437866, + "logits/rejected": -1.426153302192688, + "logps/chosen": -83.4981689453125, + "logps/rejected": -107.86634826660156, + "loss": 2.5666, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.55033802986145, + "rewards/margins": -4.49575138092041, + "rewards/rejected": 8.046089172363281, + "step": 7517 + }, + { + "epoch": 1.22, + "learning_rate": 3.483941948147396e-06, + "logits/chosen": -1.6975070238113403, + "logits/rejected": -1.6636053323745728, + "logps/chosen": -90.7238540649414, + "logps/rejected": -98.50559997558594, + "loss": 0.5045, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8123862743377686, + "rewards/margins": 0.7543699741363525, + "rewards/rejected": 2.058016300201416, + "step": 7518 + }, + { + "epoch": 1.22, + "learning_rate": 3.48268961738523e-06, + "logits/chosen": -1.2855265140533447, + "logits/rejected": -1.1851929426193237, + "logps/chosen": -65.47870635986328, + "logps/rejected": -75.05550384521484, + "loss": 0.2588, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.4516825675964355, + "rewards/margins": 0.40407752990722656, + "rewards/rejected": 4.047605037689209, + "step": 7519 + }, + { + "epoch": 1.22, + "learning_rate": 3.4814373914548538e-06, + "logits/chosen": -1.1952263116836548, + "logits/rejected": -1.171594500541687, + "logps/chosen": -64.17364501953125, + "logps/rejected": -71.17871856689453, + "loss": 0.9632, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8038391470909119, + "rewards/margins": -0.438166081905365, + "rewards/rejected": 1.2420052289962769, + "step": 7520 + }, + { + "epoch": 1.22, + "learning_rate": 3.480185270442785e-06, + "logits/chosen": -0.7463510036468506, + "logits/rejected": -0.7463510036468506, + "logps/chosen": -0.9475985765457153, + "logps/rejected": -0.9475985765457153, + "loss": 0.3853, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.44583019614219666, + "rewards/margins": 0.0, + "rewards/rejected": 0.44583019614219666, + "step": 7521 + }, + { + "epoch": 1.22, + "learning_rate": 3.478933254435534e-06, + "logits/chosen": -1.3514900207519531, + "logits/rejected": -1.3635762929916382, + "logps/chosen": -66.62742614746094, + "logps/rejected": -57.35015869140625, + "loss": 0.8948, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9819037914276123, + "rewards/margins": -0.028083086013793945, + "rewards/rejected": 3.0099868774414062, + "step": 7522 + }, + { + "epoch": 1.22, + "learning_rate": 3.4776813435196026e-06, + "logits/chosen": -1.0332671403884888, + "logits/rejected": -1.103402853012085, + "logps/chosen": -50.234710693359375, + "logps/rejected": -98.13717651367188, + "loss": 0.9176, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.280505418777466, + "rewards/margins": -1.6102159023284912, + "rewards/rejected": 4.890721321105957, + "step": 7523 + }, + { + "epoch": 1.22, + "learning_rate": 3.476429537781486e-06, + "logits/chosen": -1.0236148834228516, + "logits/rejected": -1.0419222116470337, + "logps/chosen": -105.91726684570312, + "logps/rejected": -49.32395553588867, + "loss": 0.3095, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1718690395355225, + "rewards/margins": 0.2884800434112549, + "rewards/rejected": 2.8833889961242676, + "step": 7524 + }, + { + "epoch": 1.22, + "learning_rate": 3.4751778373076718e-06, + "logits/chosen": -1.3161234855651855, + "logits/rejected": -1.3811873197555542, + "logps/chosen": -75.27232360839844, + "logps/rejected": -82.3502197265625, + "loss": 1.3409, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9638779163360596, + "rewards/margins": -2.6045000553131104, + "rewards/rejected": 5.56837797164917, + "step": 7525 + }, + { + "epoch": 1.22, + "learning_rate": 3.473926242184642e-06, + "logits/chosen": -1.379875659942627, + "logits/rejected": -1.3526568412780762, + "logps/chosen": -60.12659454345703, + "logps/rejected": -59.74322509765625, + "loss": 0.4307, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4870972633361816, + "rewards/margins": 0.02208566665649414, + "rewards/rejected": 2.4650115966796875, + "step": 7526 + }, + { + "epoch": 1.22, + "learning_rate": 3.472674752498868e-06, + "logits/chosen": -1.0378605127334595, + "logits/rejected": -1.0379999876022339, + "logps/chosen": -2.639254570007324, + "logps/rejected": -32.86514663696289, + "loss": 0.3758, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5737126469612122, + "rewards/margins": 0.65730220079422, + "rewards/rejected": -0.08358955383300781, + "step": 7527 + }, + { + "epoch": 1.22, + "learning_rate": 3.471423368336817e-06, + "logits/chosen": -0.9056696891784668, + "logits/rejected": -0.9091281294822693, + "logps/chosen": -24.589412689208984, + "logps/rejected": -17.339088439941406, + "loss": 0.9412, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.16011352837085724, + "rewards/margins": -0.5145473480224609, + "rewards/rejected": 0.674660861492157, + "step": 7528 + }, + { + "epoch": 1.22, + "learning_rate": 3.4701720897849485e-06, + "logits/chosen": -1.165887713432312, + "logits/rejected": -1.0819851160049438, + "logps/chosen": -150.56942749023438, + "logps/rejected": -116.7586669921875, + "loss": 0.1628, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8034943342208862, + "rewards/margins": 1.0643532276153564, + "rewards/rejected": 0.7391411066055298, + "step": 7529 + }, + { + "epoch": 1.22, + "learning_rate": 3.4689209169297123e-06, + "logits/chosen": -1.3728513717651367, + "logits/rejected": -1.410054087638855, + "logps/chosen": -109.66831970214844, + "logps/rejected": -136.22496032714844, + "loss": 2.1198, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.145692348480225, + "rewards/margins": -4.2218756675720215, + "rewards/rejected": 8.367568016052246, + "step": 7530 + }, + { + "epoch": 1.22, + "learning_rate": 3.4676698498575544e-06, + "logits/chosen": -1.5558912754058838, + "logits/rejected": -1.4968736171722412, + "logps/chosen": -78.06295776367188, + "logps/rejected": -32.17522048950195, + "loss": 0.2179, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2218759059906006, + "rewards/margins": 1.007883906364441, + "rewards/rejected": 1.2139919996261597, + "step": 7531 + }, + { + "epoch": 1.22, + "learning_rate": 3.4664188886549105e-06, + "logits/chosen": -1.474941372871399, + "logits/rejected": -1.3366124629974365, + "logps/chosen": -64.2133560180664, + "logps/rejected": -39.898834228515625, + "loss": 0.0454, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8546578884124756, + "rewards/margins": 2.368924617767334, + "rewards/rejected": 1.4857333898544312, + "step": 7532 + }, + { + "epoch": 1.22, + "learning_rate": 3.4651680334082104e-06, + "logits/chosen": -1.2296134233474731, + "logits/rejected": -1.1916264295578003, + "logps/chosen": -88.92587280273438, + "logps/rejected": -57.67310333251953, + "loss": 0.3605, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9788894653320312, + "rewards/margins": 0.2071845531463623, + "rewards/rejected": 1.771704912185669, + "step": 7533 + }, + { + "epoch": 1.22, + "learning_rate": 3.4639172842038766e-06, + "logits/chosen": -1.0309118032455444, + "logits/rejected": -1.0868985652923584, + "logps/chosen": -51.20490646362305, + "logps/rejected": -79.1815414428711, + "loss": 0.4661, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.171008825302124, + "rewards/margins": 0.2215878963470459, + "rewards/rejected": 1.9494209289550781, + "step": 7534 + }, + { + "epoch": 1.22, + "learning_rate": 3.4626666411283237e-06, + "logits/chosen": -1.412879228591919, + "logits/rejected": -1.4287028312683105, + "logps/chosen": -184.4589080810547, + "logps/rejected": -81.26554107666016, + "loss": 0.1448, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.710389614105225, + "rewards/margins": 1.6001362800598145, + "rewards/rejected": 5.11025333404541, + "step": 7535 + }, + { + "epoch": 1.22, + "learning_rate": 3.4614161042679593e-06, + "logits/chosen": -1.3733375072479248, + "logits/rejected": -1.4048655033111572, + "logps/chosen": -52.477577209472656, + "logps/rejected": -78.8718032836914, + "loss": 0.1413, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9316489696502686, + "rewards/margins": 2.194174289703369, + "rewards/rejected": 1.737474799156189, + "step": 7536 + }, + { + "epoch": 1.22, + "learning_rate": 3.460165673709185e-06, + "logits/chosen": -1.5151498317718506, + "logits/rejected": -1.54300856590271, + "logps/chosen": -88.9417724609375, + "logps/rejected": -81.77591705322266, + "loss": 0.6485, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.262997627258301, + "rewards/margins": 0.4290733337402344, + "rewards/rejected": 4.833924293518066, + "step": 7537 + }, + { + "epoch": 1.22, + "learning_rate": 3.4589153495383916e-06, + "logits/chosen": -1.0550097227096558, + "logits/rejected": -1.1342130899429321, + "logps/chosen": -53.22210693359375, + "logps/rejected": -112.54566955566406, + "loss": 0.3144, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3332931995391846, + "rewards/margins": 0.2757568359375, + "rewards/rejected": 2.0575363636016846, + "step": 7538 + }, + { + "epoch": 1.22, + "learning_rate": 3.4576651318419658e-06, + "logits/chosen": -1.251637578010559, + "logits/rejected": -1.2483378648757935, + "logps/chosen": -54.421630859375, + "logps/rejected": -60.07252883911133, + "loss": 0.6975, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7154481410980225, + "rewards/margins": -0.5219385623931885, + "rewards/rejected": 3.237386703491211, + "step": 7539 + }, + { + "epoch": 1.22, + "learning_rate": 3.456415020706285e-06, + "logits/chosen": -1.5818147659301758, + "logits/rejected": -1.66182541847229, + "logps/chosen": -128.21018981933594, + "logps/rejected": -166.06124877929688, + "loss": 0.6374, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.255012512207031, + "rewards/margins": 0.09538125991821289, + "rewards/rejected": 6.159631252288818, + "step": 7540 + }, + { + "epoch": 1.22, + "learning_rate": 3.455165016217722e-06, + "logits/chosen": -1.324812650680542, + "logits/rejected": -1.2312740087509155, + "logps/chosen": -49.601383209228516, + "logps/rejected": -34.50044250488281, + "loss": 0.0217, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.93363618850708, + "rewards/margins": 3.3824398517608643, + "rewards/rejected": 1.5511963367462158, + "step": 7541 + }, + { + "epoch": 1.22, + "learning_rate": 3.4539151184626385e-06, + "logits/chosen": -1.1758604049682617, + "logits/rejected": -1.1824637651443481, + "logps/chosen": -57.375885009765625, + "logps/rejected": -75.45698547363281, + "loss": 1.1532, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1048049926757812, + "rewards/margins": -2.192678928375244, + "rewards/rejected": 5.297483921051025, + "step": 7542 + }, + { + "epoch": 1.22, + "learning_rate": 3.452665327527391e-06, + "logits/chosen": -1.1248173713684082, + "logits/rejected": -1.0364936590194702, + "logps/chosen": -42.68635559082031, + "logps/rejected": -31.778278350830078, + "loss": 0.1412, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0652527809143066, + "rewards/margins": 1.166038990020752, + "rewards/rejected": 1.8992137908935547, + "step": 7543 + }, + { + "epoch": 1.22, + "learning_rate": 3.4514156434983287e-06, + "logits/chosen": -1.2130413055419922, + "logits/rejected": -1.0345162153244019, + "logps/chosen": -102.0086669921875, + "logps/rejected": -23.179847717285156, + "loss": 0.0281, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.432150363922119, + "rewards/margins": 2.892362356185913, + "rewards/rejected": 0.5397880673408508, + "step": 7544 + }, + { + "epoch": 1.22, + "learning_rate": 3.4501660664617926e-06, + "logits/chosen": -1.1745455265045166, + "logits/rejected": -1.1346049308776855, + "logps/chosen": -23.35220718383789, + "logps/rejected": -39.6702766418457, + "loss": 1.7426, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.143504023551941, + "rewards/margins": -0.628785252571106, + "rewards/rejected": 1.7722892761230469, + "step": 7545 + }, + { + "epoch": 1.22, + "learning_rate": 3.448916596504116e-06, + "logits/chosen": -1.6361989974975586, + "logits/rejected": -1.6064026355743408, + "logps/chosen": -117.6334457397461, + "logps/rejected": -79.89613342285156, + "loss": 0.0853, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.389002323150635, + "rewards/margins": 2.794512987136841, + "rewards/rejected": 2.594489336013794, + "step": 7546 + }, + { + "epoch": 1.22, + "learning_rate": 3.4476672337116268e-06, + "logits/chosen": -1.1456400156021118, + "logits/rejected": -1.1724497079849243, + "logps/chosen": -79.94696044921875, + "logps/rejected": -177.09710693359375, + "loss": 0.5921, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.636438012123108, + "rewards/margins": 0.34603428840637207, + "rewards/rejected": 1.2904037237167358, + "step": 7547 + }, + { + "epoch": 1.23, + "learning_rate": 3.446417978170642e-06, + "logits/chosen": -1.1159061193466187, + "logits/rejected": -1.1423466205596924, + "logps/chosen": -48.200016021728516, + "logps/rejected": -49.298587799072266, + "loss": 0.6287, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.86144757270813, + "rewards/margins": 1.9973487854003906, + "rewards/rejected": 1.8640987873077393, + "step": 7548 + }, + { + "epoch": 1.23, + "learning_rate": 3.4451688299674755e-06, + "logits/chosen": -1.261152744293213, + "logits/rejected": -1.1781198978424072, + "logps/chosen": -74.41952514648438, + "logps/rejected": -130.7146759033203, + "loss": 0.2602, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3350846767425537, + "rewards/margins": 1.0302735567092896, + "rewards/rejected": 1.3048111200332642, + "step": 7549 + }, + { + "epoch": 1.23, + "learning_rate": 3.4439197891884317e-06, + "logits/chosen": -1.4089176654815674, + "logits/rejected": -1.4089176654815674, + "logps/chosen": -106.30817413330078, + "logps/rejected": -106.30817413330078, + "loss": 0.728, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4426963329315186, + "rewards/margins": 0.0, + "rewards/rejected": 3.4426963329315186, + "step": 7550 + }, + { + "epoch": 1.23, + "learning_rate": 3.442670855919806e-06, + "logits/chosen": -1.1652841567993164, + "logits/rejected": -1.218424916267395, + "logps/chosen": -56.575035095214844, + "logps/rejected": -77.97600555419922, + "loss": 1.1117, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7930641174316406, + "rewards/margins": -0.23033761978149414, + "rewards/rejected": 3.0234017372131348, + "step": 7551 + }, + { + "epoch": 1.23, + "learning_rate": 3.4414220302478896e-06, + "logits/chosen": -1.0135024785995483, + "logits/rejected": -1.0071007013320923, + "logps/chosen": -35.10609436035156, + "logps/rejected": -43.87586975097656, + "loss": 0.4417, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3175811767578125, + "rewards/margins": 0.20606613159179688, + "rewards/rejected": 2.1115150451660156, + "step": 7552 + }, + { + "epoch": 1.23, + "learning_rate": 3.4401733122589624e-06, + "logits/chosen": -0.9099920392036438, + "logits/rejected": -0.8636870384216309, + "logps/chosen": -45.978797912597656, + "logps/rejected": -40.2718620300293, + "loss": 0.3089, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.67155385017395, + "rewards/margins": 0.6792267560958862, + "rewards/rejected": 1.992327094078064, + "step": 7553 + }, + { + "epoch": 1.23, + "learning_rate": 3.438924702039301e-06, + "logits/chosen": -1.3952041864395142, + "logits/rejected": -1.385936975479126, + "logps/chosen": -178.52633666992188, + "logps/rejected": -70.46891021728516, + "loss": 0.4914, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0543854236602783, + "rewards/margins": -0.4890601634979248, + "rewards/rejected": 1.5434455871582031, + "step": 7554 + }, + { + "epoch": 1.23, + "learning_rate": 3.4376761996751707e-06, + "logits/chosen": -0.821142315864563, + "logits/rejected": -0.821142315864563, + "logps/chosen": -4.835233211517334, + "logps/rejected": -4.835233211517334, + "loss": 0.948, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.39282652735710144, + "rewards/margins": 0.0, + "rewards/rejected": 0.39282652735710144, + "step": 7555 + }, + { + "epoch": 1.23, + "learning_rate": 3.436427805252833e-06, + "logits/chosen": -0.9654059410095215, + "logits/rejected": -0.9880254864692688, + "logps/chosen": -64.79200744628906, + "logps/rejected": -105.20074462890625, + "loss": 0.3304, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7986526489257812, + "rewards/margins": 1.1471961736679077, + "rewards/rejected": 1.6514564752578735, + "step": 7556 + }, + { + "epoch": 1.23, + "learning_rate": 3.4351795188585392e-06, + "logits/chosen": -1.1658891439437866, + "logits/rejected": -1.1695278882980347, + "logps/chosen": -1.271192193031311, + "logps/rejected": -2.9143552780151367, + "loss": 0.3749, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41308578848838806, + "rewards/margins": 0.09328997135162354, + "rewards/rejected": 0.3197958171367645, + "step": 7557 + }, + { + "epoch": 1.23, + "learning_rate": 3.4339313405785337e-06, + "logits/chosen": -0.7977342009544373, + "logits/rejected": -0.7977342009544373, + "logps/chosen": -16.81231689453125, + "logps/rejected": -16.81231689453125, + "loss": 0.4043, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6493688821792603, + "rewards/margins": 0.0, + "rewards/rejected": 0.6493688821792603, + "step": 7558 + }, + { + "epoch": 1.23, + "learning_rate": 3.4326832704990543e-06, + "logits/chosen": -1.4837560653686523, + "logits/rejected": -1.5435014963150024, + "logps/chosen": -42.30615234375, + "logps/rejected": -67.06147003173828, + "loss": 0.5926, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.648078203201294, + "rewards/margins": -0.3152785301208496, + "rewards/rejected": 3.9633567333221436, + "step": 7559 + }, + { + "epoch": 1.23, + "learning_rate": 3.431435308706331e-06, + "logits/chosen": -1.2166826725006104, + "logits/rejected": -1.2227975130081177, + "logps/chosen": -79.78883361816406, + "logps/rejected": -86.45457458496094, + "loss": 0.7072, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.107814073562622, + "rewards/margins": 0.33697354793548584, + "rewards/rejected": 1.7708405256271362, + "step": 7560 + }, + { + "epoch": 1.23, + "learning_rate": 3.430187455286586e-06, + "logits/chosen": -1.2726695537567139, + "logits/rejected": -1.201093316078186, + "logps/chosen": -75.63607788085938, + "logps/rejected": -72.19255065917969, + "loss": 0.2436, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7144362926483154, + "rewards/margins": 2.4964897632598877, + "rewards/rejected": 0.2179466336965561, + "step": 7561 + }, + { + "epoch": 1.23, + "learning_rate": 3.4289397103260346e-06, + "logits/chosen": -0.9007761478424072, + "logits/rejected": -0.9212706089019775, + "logps/chosen": -45.21135330200195, + "logps/rejected": -33.96619415283203, + "loss": 0.3743, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.051062822341919, + "rewards/margins": 0.20298540592193604, + "rewards/rejected": 0.8480774164199829, + "step": 7562 + }, + { + "epoch": 1.23, + "learning_rate": 3.4276920739108833e-06, + "logits/chosen": -1.2956299781799316, + "logits/rejected": -1.1070972681045532, + "logps/chosen": -82.55206298828125, + "logps/rejected": -29.372085571289062, + "loss": 0.0567, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.3826904296875, + "rewards/margins": 2.4057488441467285, + "rewards/rejected": 1.976941704750061, + "step": 7563 + }, + { + "epoch": 1.23, + "learning_rate": 3.4264445461273323e-06, + "logits/chosen": -0.8568709492683411, + "logits/rejected": -0.8355141878128052, + "logps/chosen": -28.283140182495117, + "logps/rejected": -8.047484397888184, + "loss": 0.703, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.09000339359045029, + "rewards/margins": -0.5678138136863708, + "rewards/rejected": 0.6578171849250793, + "step": 7564 + }, + { + "epoch": 1.23, + "learning_rate": 3.4251971270615735e-06, + "logits/chosen": -1.6175001859664917, + "logits/rejected": -1.526154637336731, + "logps/chosen": -57.71453857421875, + "logps/rejected": -58.26457214355469, + "loss": 0.5135, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9531311988830566, + "rewards/margins": -0.17815470695495605, + "rewards/rejected": 3.1312859058380127, + "step": 7565 + }, + { + "epoch": 1.23, + "learning_rate": 3.4239498167997933e-06, + "logits/chosen": -1.19259512424469, + "logits/rejected": -1.1962965726852417, + "logps/chosen": -45.01487350463867, + "logps/rejected": -28.36554718017578, + "loss": 0.6504, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.387698769569397, + "rewards/margins": -0.7472106218338013, + "rewards/rejected": 2.1349093914031982, + "step": 7566 + }, + { + "epoch": 1.23, + "learning_rate": 3.4227026154281672e-06, + "logits/chosen": -1.0350751876831055, + "logits/rejected": -1.0350751876831055, + "logps/chosen": -59.43067169189453, + "logps/rejected": -59.43067169189453, + "loss": 0.7173, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.408815860748291, + "rewards/margins": 0.0, + "rewards/rejected": 5.408815860748291, + "step": 7567 + }, + { + "epoch": 1.23, + "learning_rate": 3.421455523032866e-06, + "logits/chosen": -1.1986377239227295, + "logits/rejected": -1.2158963680267334, + "logps/chosen": -73.91004943847656, + "logps/rejected": -66.04644775390625, + "loss": 0.5755, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7597596645355225, + "rewards/margins": -0.05230093002319336, + "rewards/rejected": 2.812060594558716, + "step": 7568 + }, + { + "epoch": 1.23, + "learning_rate": 3.420208539700053e-06, + "logits/chosen": -1.3264297246932983, + "logits/rejected": -1.2487502098083496, + "logps/chosen": -47.90403747558594, + "logps/rejected": -94.53179931640625, + "loss": 0.3015, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1252517700195312, + "rewards/margins": 0.21858131885528564, + "rewards/rejected": 1.9066704511642456, + "step": 7569 + }, + { + "epoch": 1.23, + "learning_rate": 3.4189616655158803e-06, + "logits/chosen": -0.9989691376686096, + "logits/rejected": -0.9989691376686096, + "logps/chosen": -70.02136993408203, + "logps/rejected": -70.02136993408203, + "loss": 0.4886, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.083743572235107, + "rewards/margins": 0.0, + "rewards/rejected": 4.083743572235107, + "step": 7570 + }, + { + "epoch": 1.23, + "learning_rate": 3.417714900566497e-06, + "logits/chosen": -1.1904258728027344, + "logits/rejected": -1.194862961769104, + "logps/chosen": -62.033843994140625, + "logps/rejected": -106.27549743652344, + "loss": 0.5051, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1769287586212158, + "rewards/margins": -0.12419891357421875, + "rewards/rejected": 1.3011276721954346, + "step": 7571 + }, + { + "epoch": 1.23, + "learning_rate": 3.4164682449380426e-06, + "logits/chosen": -0.8962206244468689, + "logits/rejected": -0.6909928321838379, + "logps/chosen": -42.50404357910156, + "logps/rejected": -12.9647855758667, + "loss": 0.1112, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6957573890686035, + "rewards/margins": 1.4316556453704834, + "rewards/rejected": 1.2641017436981201, + "step": 7572 + }, + { + "epoch": 1.23, + "learning_rate": 3.415221698716649e-06, + "logits/chosen": -1.2307722568511963, + "logits/rejected": -1.2347111701965332, + "logps/chosen": -63.142967224121094, + "logps/rejected": -65.02457427978516, + "loss": 0.1998, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.31669545173645, + "rewards/margins": 0.7653969526290894, + "rewards/rejected": 1.5512984991073608, + "step": 7573 + }, + { + "epoch": 1.23, + "learning_rate": 3.4139752619884415e-06, + "logits/chosen": -0.9766755104064941, + "logits/rejected": -0.9392477869987488, + "logps/chosen": -65.476318359375, + "logps/rejected": -62.88975143432617, + "loss": 0.3296, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8965859413146973, + "rewards/margins": 1.1474506855010986, + "rewards/rejected": 1.7491352558135986, + "step": 7574 + }, + { + "epoch": 1.23, + "learning_rate": 3.4127289348395355e-06, + "logits/chosen": -1.1772207021713257, + "logits/rejected": -1.2740496397018433, + "logps/chosen": -71.80229187011719, + "logps/rejected": -75.28312683105469, + "loss": 1.0753, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3764235973358154, + "rewards/margins": -1.8681352138519287, + "rewards/rejected": 4.244558811187744, + "step": 7575 + }, + { + "epoch": 1.23, + "learning_rate": 3.4114827173560407e-06, + "logits/chosen": -1.3319061994552612, + "logits/rejected": -1.5204299688339233, + "logps/chosen": -49.52720642089844, + "logps/rejected": -49.669715881347656, + "loss": 1.1058, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.450738549232483, + "rewards/margins": 0.5836361050605774, + "rewards/rejected": 0.8671024441719055, + "step": 7576 + }, + { + "epoch": 1.23, + "learning_rate": 3.41023660962406e-06, + "logits/chosen": -0.8555268049240112, + "logits/rejected": -0.8637472987174988, + "logps/chosen": -47.83363342285156, + "logps/rejected": -38.06096267700195, + "loss": 0.4512, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6940994262695312, + "rewards/margins": -0.12470173835754395, + "rewards/rejected": 1.8188011646270752, + "step": 7577 + }, + { + "epoch": 1.23, + "learning_rate": 3.4089906117296865e-06, + "logits/chosen": -1.5490679740905762, + "logits/rejected": -1.4503642320632935, + "logps/chosen": -57.83881759643555, + "logps/rejected": -44.387474060058594, + "loss": 1.2703, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7708537578582764, + "rewards/margins": 2.540402889251709, + "rewards/rejected": 1.2304508686065674, + "step": 7578 + }, + { + "epoch": 1.23, + "learning_rate": 3.4077447237590077e-06, + "logits/chosen": -1.5645970106124878, + "logits/rejected": -1.483237385749817, + "logps/chosen": -79.85311126708984, + "logps/rejected": -73.39954376220703, + "loss": 1.5892, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2983651161193848, + "rewards/margins": -2.5869245529174805, + "rewards/rejected": 5.885289669036865, + "step": 7579 + }, + { + "epoch": 1.23, + "learning_rate": 3.406498945798103e-06, + "logits/chosen": -1.2603589296340942, + "logits/rejected": -1.1870384216308594, + "logps/chosen": -78.71330261230469, + "logps/rejected": -75.88165283203125, + "loss": 0.5884, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9256629943847656, + "rewards/margins": -0.01039886474609375, + "rewards/rejected": 1.9360618591308594, + "step": 7580 + }, + { + "epoch": 1.23, + "learning_rate": 3.405253277933043e-06, + "logits/chosen": -1.0233582258224487, + "logits/rejected": -1.0233582258224487, + "logps/chosen": -57.061431884765625, + "logps/rejected": -57.061431884765625, + "loss": 0.3838, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6286346912384033, + "rewards/margins": 0.0, + "rewards/rejected": 2.6286346912384033, + "step": 7581 + }, + { + "epoch": 1.23, + "learning_rate": 3.4040077202498916e-06, + "logits/chosen": -1.1679906845092773, + "logits/rejected": -1.2663841247558594, + "logps/chosen": -119.6171875, + "logps/rejected": -81.4598159790039, + "loss": 1.8278, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0747894048690796, + "rewards/margins": -2.4976649284362793, + "rewards/rejected": 3.5724542140960693, + "step": 7582 + }, + { + "epoch": 1.23, + "learning_rate": 3.4027622728347054e-06, + "logits/chosen": -1.2510077953338623, + "logits/rejected": -1.2485021352767944, + "logps/chosen": -48.19075393676758, + "logps/rejected": -60.786651611328125, + "loss": 0.893, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.611166000366211, + "rewards/margins": -0.4667789936065674, + "rewards/rejected": 2.0779449939727783, + "step": 7583 + }, + { + "epoch": 1.23, + "learning_rate": 3.4015169357735334e-06, + "logits/chosen": -1.1068000793457031, + "logits/rejected": -1.1149243116378784, + "logps/chosen": -26.217864990234375, + "logps/rejected": -34.16037368774414, + "loss": 0.9689, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0901596546173096, + "rewards/margins": -0.032628655433654785, + "rewards/rejected": 1.1227883100509644, + "step": 7584 + }, + { + "epoch": 1.23, + "learning_rate": 3.400271709152415e-06, + "logits/chosen": -0.6800649166107178, + "logits/rejected": -0.6650151610374451, + "logps/chosen": -0.9949215650558472, + "logps/rejected": -22.616039276123047, + "loss": 0.2502, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38650986552238464, + "rewards/margins": 0.507625937461853, + "rewards/rejected": -0.12111606448888779, + "step": 7585 + }, + { + "epoch": 1.23, + "learning_rate": 3.3990265930573863e-06, + "logits/chosen": -1.1282018423080444, + "logits/rejected": -1.126876950263977, + "logps/chosen": -52.6835823059082, + "logps/rejected": -104.34639739990234, + "loss": 1.1943, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4979313611984253, + "rewards/margins": -0.6934612989425659, + "rewards/rejected": 2.191392660140991, + "step": 7586 + }, + { + "epoch": 1.23, + "learning_rate": 3.39778158757447e-06, + "logits/chosen": -1.2888867855072021, + "logits/rejected": -1.3119244575500488, + "logps/chosen": -51.12056350708008, + "logps/rejected": -70.61683654785156, + "loss": 0.1521, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.631775379180908, + "rewards/margins": 1.362510323524475, + "rewards/rejected": 1.269265055656433, + "step": 7587 + }, + { + "epoch": 1.23, + "learning_rate": 3.3965366927896864e-06, + "logits/chosen": -1.6614190340042114, + "logits/rejected": -1.7316360473632812, + "logps/chosen": -133.8099365234375, + "logps/rejected": -86.92047119140625, + "loss": 0.0573, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.456096172332764, + "rewards/margins": 3.5041072368621826, + "rewards/rejected": 2.951988935470581, + "step": 7588 + }, + { + "epoch": 1.23, + "learning_rate": 3.3952919087890453e-06, + "logits/chosen": -1.045013189315796, + "logits/rejected": -1.052112340927124, + "logps/chosen": -80.01575469970703, + "logps/rejected": -49.31066131591797, + "loss": 1.6581, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5705711245536804, + "rewards/margins": -0.413504421710968, + "rewards/rejected": 0.9840755462646484, + "step": 7589 + }, + { + "epoch": 1.23, + "learning_rate": 3.394047235658549e-06, + "logits/chosen": -1.2451133728027344, + "logits/rejected": -1.238727331161499, + "logps/chosen": -170.5477752685547, + "logps/rejected": -146.3071746826172, + "loss": 1.0712, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.015382289886475, + "rewards/margins": -1.6853728294372559, + "rewards/rejected": 8.70075511932373, + "step": 7590 + }, + { + "epoch": 1.23, + "learning_rate": 3.3928026734841935e-06, + "logits/chosen": -1.3224080801010132, + "logits/rejected": -1.3493374586105347, + "logps/chosen": -43.79530334472656, + "logps/rejected": -103.38284301757812, + "loss": 1.3376, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2254891395568848, + "rewards/margins": -1.9605917930603027, + "rewards/rejected": 5.1860809326171875, + "step": 7591 + }, + { + "epoch": 1.23, + "learning_rate": 3.3915582223519656e-06, + "logits/chosen": -1.2179851531982422, + "logits/rejected": -1.0800561904907227, + "logps/chosen": -67.62351989746094, + "logps/rejected": -30.91735076904297, + "loss": 0.0562, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8656303882598877, + "rewards/margins": 2.6052603721618652, + "rewards/rejected": -0.7396299242973328, + "step": 7592 + }, + { + "epoch": 1.23, + "learning_rate": 3.3903138823478452e-06, + "logits/chosen": -1.3591945171356201, + "logits/rejected": -1.3756248950958252, + "logps/chosen": -50.63090515136719, + "logps/rejected": -47.53224182128906, + "loss": 1.3324, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.675152540206909, + "rewards/margins": 0.9063537120819092, + "rewards/rejected": 1.768798828125, + "step": 7593 + }, + { + "epoch": 1.23, + "learning_rate": 3.389069653557805e-06, + "logits/chosen": -1.1585253477096558, + "logits/rejected": -1.2351473569869995, + "logps/chosen": -5.454122543334961, + "logps/rejected": -51.65171813964844, + "loss": 1.5294, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5209418535232544, + "rewards/margins": -2.893862247467041, + "rewards/rejected": 3.414804220199585, + "step": 7594 + }, + { + "epoch": 1.23, + "learning_rate": 3.3878255360678082e-06, + "logits/chosen": -1.3727529048919678, + "logits/rejected": -1.33969247341156, + "logps/chosen": -86.82516479492188, + "logps/rejected": -79.10490417480469, + "loss": 0.839, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7325546741485596, + "rewards/margins": -1.4710099697113037, + "rewards/rejected": 4.203564643859863, + "step": 7595 + }, + { + "epoch": 1.23, + "learning_rate": 3.3865815299638127e-06, + "logits/chosen": -1.380028247833252, + "logits/rejected": -1.3824951648712158, + "logps/chosen": -59.786155700683594, + "logps/rejected": -58.724090576171875, + "loss": 1.0412, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.661324381828308, + "rewards/margins": -1.8224142789840698, + "rewards/rejected": 3.483738660812378, + "step": 7596 + }, + { + "epoch": 1.23, + "learning_rate": 3.3853376353317674e-06, + "logits/chosen": -1.0256704092025757, + "logits/rejected": -1.0673490762710571, + "logps/chosen": -87.42699432373047, + "logps/rejected": -132.3562774658203, + "loss": 0.9856, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.7545695304870605, + "rewards/margins": -1.7241382598876953, + "rewards/rejected": 7.478707790374756, + "step": 7597 + }, + { + "epoch": 1.23, + "learning_rate": 3.3840938522576133e-06, + "logits/chosen": -1.0182411670684814, + "logits/rejected": -0.9942680597305298, + "logps/chosen": -51.18534469604492, + "logps/rejected": -2.2140486240386963, + "loss": 0.5444, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7859981656074524, + "rewards/margins": 0.23654669523239136, + "rewards/rejected": 0.549451470375061, + "step": 7598 + }, + { + "epoch": 1.23, + "learning_rate": 3.382850180827284e-06, + "logits/chosen": -1.2997889518737793, + "logits/rejected": -1.2997889518737793, + "logps/chosen": -50.8714599609375, + "logps/rejected": -50.8714599609375, + "loss": 0.4117, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0948455333709717, + "rewards/margins": 0.0, + "rewards/rejected": 2.0948455333709717, + "step": 7599 + }, + { + "epoch": 1.23, + "learning_rate": 3.3816066211267057e-06, + "logits/chosen": -1.4673292636871338, + "logits/rejected": -1.4361355304718018, + "logps/chosen": -119.94184875488281, + "logps/rejected": -129.01065063476562, + "loss": 0.2216, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.862367630004883, + "rewards/margins": 2.523777961730957, + "rewards/rejected": 6.338589668273926, + "step": 7600 + }, + { + "epoch": 1.23, + "learning_rate": 3.380363173241796e-06, + "logits/chosen": -1.3348195552825928, + "logits/rejected": -1.323989748954773, + "logps/chosen": -53.03315734863281, + "logps/rejected": -80.0818099975586, + "loss": 0.3244, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5050065517425537, + "rewards/margins": 0.8688035011291504, + "rewards/rejected": 2.6362030506134033, + "step": 7601 + }, + { + "epoch": 1.23, + "learning_rate": 3.3791198372584664e-06, + "logits/chosen": -1.3085613250732422, + "logits/rejected": -1.309000849723816, + "logps/chosen": -45.85423278808594, + "logps/rejected": -70.90794372558594, + "loss": 0.6318, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.672389268875122, + "rewards/margins": -0.8834762573242188, + "rewards/rejected": 3.555865526199341, + "step": 7602 + }, + { + "epoch": 1.23, + "learning_rate": 3.3778766132626197e-06, + "logits/chosen": -0.6590495109558105, + "logits/rejected": -0.6590495109558105, + "logps/chosen": -35.279212951660156, + "logps/rejected": -35.279212951660156, + "loss": 0.3585, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1931278705596924, + "rewards/margins": 0.0, + "rewards/rejected": 1.1931278705596924, + "step": 7603 + }, + { + "epoch": 1.23, + "learning_rate": 3.3766335013401484e-06, + "logits/chosen": -1.1818798780441284, + "logits/rejected": -1.357636570930481, + "logps/chosen": -23.905128479003906, + "logps/rejected": -48.70867156982422, + "loss": 1.3059, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4335063695907593, + "rewards/margins": -2.1452555656433105, + "rewards/rejected": 3.5787620544433594, + "step": 7604 + }, + { + "epoch": 1.23, + "learning_rate": 3.3753905015769434e-06, + "logits/chosen": -1.2919673919677734, + "logits/rejected": -1.128602385520935, + "logps/chosen": -92.05635070800781, + "logps/rejected": -52.79685974121094, + "loss": 0.2011, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2120492458343506, + "rewards/margins": 1.2059745788574219, + "rewards/rejected": 2.0060746669769287, + "step": 7605 + }, + { + "epoch": 1.23, + "learning_rate": 3.3741476140588825e-06, + "logits/chosen": -1.3648195266723633, + "logits/rejected": -1.2797813415527344, + "logps/chosen": -50.79310607910156, + "logps/rejected": -26.152250289916992, + "loss": 1.167, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.506542921066284, + "rewards/margins": 2.5228261947631836, + "rewards/rejected": -0.016283227130770683, + "step": 7606 + }, + { + "epoch": 1.23, + "learning_rate": 3.3729048388718365e-06, + "logits/chosen": -1.3133058547973633, + "logits/rejected": -1.2862998247146606, + "logps/chosen": -106.19364929199219, + "logps/rejected": -112.30799865722656, + "loss": 1.312, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9042251110076904, + "rewards/margins": 1.08062744140625, + "rewards/rejected": 2.8235976696014404, + "step": 7607 + }, + { + "epoch": 1.23, + "learning_rate": 3.3716621761016718e-06, + "logits/chosen": -1.5330204963684082, + "logits/rejected": -1.5329066514968872, + "logps/chosen": -48.45011901855469, + "logps/rejected": -62.75605010986328, + "loss": 0.4645, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5070641040802, + "rewards/margins": 0.7834144830703735, + "rewards/rejected": 1.7236496210098267, + "step": 7608 + }, + { + "epoch": 1.24, + "learning_rate": 3.3704196258342426e-06, + "logits/chosen": -1.0539909601211548, + "logits/rejected": -1.091548204421997, + "logps/chosen": -53.30177307128906, + "logps/rejected": -59.59678649902344, + "loss": 0.2351, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.153088331222534, + "rewards/margins": 1.4277586936950684, + "rewards/rejected": 1.7253296375274658, + "step": 7609 + }, + { + "epoch": 1.24, + "learning_rate": 3.369177188155398e-06, + "logits/chosen": -1.458133339881897, + "logits/rejected": -1.5293302536010742, + "logps/chosen": -138.05979919433594, + "logps/rejected": -134.92103576660156, + "loss": 1.068, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.229193210601807, + "rewards/margins": -1.257319450378418, + "rewards/rejected": 5.486512660980225, + "step": 7610 + }, + { + "epoch": 1.24, + "learning_rate": 3.367934863150979e-06, + "logits/chosen": -1.6758568286895752, + "logits/rejected": -1.7227146625518799, + "logps/chosen": -158.44725036621094, + "logps/rejected": -75.53535461425781, + "loss": 0.1581, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.854913234710693, + "rewards/margins": 5.121452331542969, + "rewards/rejected": 1.7334610223770142, + "step": 7611 + }, + { + "epoch": 1.24, + "learning_rate": 3.3666926509068175e-06, + "logits/chosen": -1.422644853591919, + "logits/rejected": -1.2358496189117432, + "logps/chosen": -125.3482437133789, + "logps/rejected": -63.1941032409668, + "loss": 0.3597, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.07471227645874, + "rewards/margins": 1.9849357604980469, + "rewards/rejected": 4.089776515960693, + "step": 7612 + }, + { + "epoch": 1.24, + "learning_rate": 3.36545055150874e-06, + "logits/chosen": -1.2204445600509644, + "logits/rejected": -1.227157711982727, + "logps/chosen": -56.58509826660156, + "logps/rejected": -49.055763244628906, + "loss": 0.5267, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.093796491622925, + "rewards/margins": -0.08652663230895996, + "rewards/rejected": 3.1803231239318848, + "step": 7613 + }, + { + "epoch": 1.24, + "learning_rate": 3.3642085650425625e-06, + "logits/chosen": -1.0046252012252808, + "logits/rejected": -1.1169155836105347, + "logps/chosen": -82.12680053710938, + "logps/rejected": -103.45172119140625, + "loss": 0.7262, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.10280179977417, + "rewards/margins": 0.05336618423461914, + "rewards/rejected": 4.049435615539551, + "step": 7614 + }, + { + "epoch": 1.24, + "learning_rate": 3.362966691594096e-06, + "logits/chosen": -1.3611546754837036, + "logits/rejected": -1.3082282543182373, + "logps/chosen": -57.834190368652344, + "logps/rejected": -47.22373580932617, + "loss": 0.6552, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7243050336837769, + "rewards/margins": -0.3154040575027466, + "rewards/rejected": 2.0397090911865234, + "step": 7615 + }, + { + "epoch": 1.24, + "learning_rate": 3.3617249312491406e-06, + "logits/chosen": -1.3043733835220337, + "logits/rejected": -1.1883798837661743, + "logps/chosen": -57.57126235961914, + "logps/rejected": -19.099477767944336, + "loss": 0.8108, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3578384518623352, + "rewards/margins": -0.4660043716430664, + "rewards/rejected": 0.8238428235054016, + "step": 7616 + }, + { + "epoch": 1.24, + "learning_rate": 3.3604832840934915e-06, + "logits/chosen": -0.9931572675704956, + "logits/rejected": -0.9931572675704956, + "logps/chosen": -89.45329284667969, + "logps/rejected": -89.45329284667969, + "loss": 0.5524, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2300933599472046, + "rewards/margins": 0.0, + "rewards/rejected": 1.2300933599472046, + "step": 7617 + }, + { + "epoch": 1.24, + "learning_rate": 3.359241750212934e-06, + "logits/chosen": -1.450661063194275, + "logits/rejected": -1.450661063194275, + "logps/chosen": -63.24571990966797, + "logps/rejected": -63.24571990966797, + "loss": 0.3487, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.8712382316589355, + "rewards/margins": 0.0, + "rewards/rejected": 5.8712382316589355, + "step": 7618 + }, + { + "epoch": 1.24, + "learning_rate": 3.358000329693246e-06, + "logits/chosen": -0.8785973191261292, + "logits/rejected": -0.8554570078849792, + "logps/chosen": -81.060302734375, + "logps/rejected": -77.02837371826172, + "loss": 0.0899, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3122756481170654, + "rewards/margins": 2.0729124546051025, + "rewards/rejected": 0.23936310410499573, + "step": 7619 + }, + { + "epoch": 1.24, + "learning_rate": 3.3567590226201996e-06, + "logits/chosen": -1.6040210723876953, + "logits/rejected": -1.30709969997406, + "logps/chosen": -158.04039001464844, + "logps/rejected": -40.12356948852539, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.7874436378479, + "rewards/margins": 4.840176105499268, + "rewards/rejected": 1.9472675323486328, + "step": 7620 + }, + { + "epoch": 1.24, + "learning_rate": 3.3555178290795555e-06, + "logits/chosen": -1.5921342372894287, + "logits/rejected": -1.5984522104263306, + "logps/chosen": -84.38639068603516, + "logps/rejected": -83.29412841796875, + "loss": 0.8433, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.111854076385498, + "rewards/margins": -0.4698004722595215, + "rewards/rejected": 6.5816545486450195, + "step": 7621 + }, + { + "epoch": 1.24, + "learning_rate": 3.354276749157069e-06, + "logits/chosen": -1.2246999740600586, + "logits/rejected": -1.2593576908111572, + "logps/chosen": -165.78533935546875, + "logps/rejected": -158.62081909179688, + "loss": 0.3767, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.291589260101318, + "rewards/margins": -0.10480070114135742, + "rewards/rejected": 7.396389961242676, + "step": 7622 + }, + { + "epoch": 1.24, + "learning_rate": 3.353035782938488e-06, + "logits/chosen": -1.3852261304855347, + "logits/rejected": -1.5326398611068726, + "logps/chosen": -56.654930114746094, + "logps/rejected": -98.65228271484375, + "loss": 1.7435, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.213456869125366, + "rewards/margins": -2.7306134700775146, + "rewards/rejected": 5.944070339202881, + "step": 7623 + }, + { + "epoch": 1.24, + "learning_rate": 3.35179493050955e-06, + "logits/chosen": -1.7264312505722046, + "logits/rejected": -1.7186635732650757, + "logps/chosen": -104.73603820800781, + "logps/rejected": -126.4556655883789, + "loss": 0.4168, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.730660915374756, + "rewards/margins": -0.1321253776550293, + "rewards/rejected": 7.862786293029785, + "step": 7624 + }, + { + "epoch": 1.24, + "learning_rate": 3.350554191955987e-06, + "logits/chosen": -1.2857111692428589, + "logits/rejected": -1.2359322309494019, + "logps/chosen": -88.58619689941406, + "logps/rejected": -60.34791564941406, + "loss": 0.3985, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.186293125152588, + "rewards/margins": 1.2807419300079346, + "rewards/rejected": 0.9055511355400085, + "step": 7625 + }, + { + "epoch": 1.24, + "learning_rate": 3.3493135673635224e-06, + "logits/chosen": -1.6281514167785645, + "logits/rejected": -1.6068708896636963, + "logps/chosen": -118.5137939453125, + "logps/rejected": -127.34516143798828, + "loss": 0.1091, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.319972038269043, + "rewards/margins": 2.255441665649414, + "rewards/rejected": 6.064530372619629, + "step": 7626 + }, + { + "epoch": 1.24, + "learning_rate": 3.348073056817871e-06, + "logits/chosen": -1.5781327486038208, + "logits/rejected": -1.5659624338150024, + "logps/chosen": -51.740875244140625, + "logps/rejected": -72.2152328491211, + "loss": 0.6727, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.405070781707764, + "rewards/margins": -0.05880403518676758, + "rewards/rejected": 4.463874816894531, + "step": 7627 + }, + { + "epoch": 1.24, + "learning_rate": 3.346832660404741e-06, + "logits/chosen": -1.307576060295105, + "logits/rejected": -1.257094383239746, + "logps/chosen": -49.725425720214844, + "logps/rejected": -27.894630432128906, + "loss": 0.1595, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.189868927001953, + "rewards/margins": 1.5463976860046387, + "rewards/rejected": 0.6434711813926697, + "step": 7628 + }, + { + "epoch": 1.24, + "learning_rate": 3.3455923782098315e-06, + "logits/chosen": -1.2998305559158325, + "logits/rejected": -1.2176220417022705, + "logps/chosen": -91.53013610839844, + "logps/rejected": -36.594295501708984, + "loss": 0.0729, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.082411289215088, + "rewards/margins": 1.9959683418273926, + "rewards/rejected": 2.0864429473876953, + "step": 7629 + }, + { + "epoch": 1.24, + "learning_rate": 3.344352210318834e-06, + "logits/chosen": -0.9617378115653992, + "logits/rejected": -1.0077024698257446, + "logps/chosen": -128.49652099609375, + "logps/rejected": -142.85845947265625, + "loss": 1.1641, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.503710985183716, + "rewards/margins": -1.6831648349761963, + "rewards/rejected": 4.186875820159912, + "step": 7630 + }, + { + "epoch": 1.24, + "learning_rate": 3.343112156817434e-06, + "logits/chosen": -1.379899501800537, + "logits/rejected": -1.385297417640686, + "logps/chosen": -46.996620178222656, + "logps/rejected": -51.9678955078125, + "loss": 0.5649, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.312042236328125, + "rewards/margins": -0.31415247917175293, + "rewards/rejected": 3.626194715499878, + "step": 7631 + }, + { + "epoch": 1.24, + "learning_rate": 3.3418722177913054e-06, + "logits/chosen": -1.2379963397979736, + "logits/rejected": -1.1141383647918701, + "logps/chosen": -51.47059631347656, + "logps/rejected": -12.338741302490234, + "loss": 0.1244, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.079205274581909, + "rewards/margins": 1.334525227546692, + "rewards/rejected": 0.7446800470352173, + "step": 7632 + }, + { + "epoch": 1.24, + "learning_rate": 3.340632393326118e-06, + "logits/chosen": -1.2348685264587402, + "logits/rejected": -1.2198799848556519, + "logps/chosen": -59.5858039855957, + "logps/rejected": -96.87625122070312, + "loss": 0.4057, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.266735553741455, + "rewards/margins": 0.5517735481262207, + "rewards/rejected": 1.7149620056152344, + "step": 7633 + }, + { + "epoch": 1.24, + "learning_rate": 3.3393926835075307e-06, + "logits/chosen": -1.6749110221862793, + "logits/rejected": -1.7418330907821655, + "logps/chosen": -199.05804443359375, + "logps/rejected": -82.02055358886719, + "loss": 0.0842, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.663098335266113, + "rewards/margins": 3.5790650844573975, + "rewards/rejected": 3.084033250808716, + "step": 7634 + }, + { + "epoch": 1.24, + "learning_rate": 3.3381530884211966e-06, + "logits/chosen": -1.3036798238754272, + "logits/rejected": -1.3328608274459839, + "logps/chosen": -75.51692199707031, + "logps/rejected": -104.5177001953125, + "loss": 0.0871, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.856724500656128, + "rewards/margins": 1.796128749847412, + "rewards/rejected": 1.0605957508087158, + "step": 7635 + }, + { + "epoch": 1.24, + "learning_rate": 3.3369136081527586e-06, + "logits/chosen": -1.0492255687713623, + "logits/rejected": -1.0310006141662598, + "logps/chosen": -15.68756103515625, + "logps/rejected": -2.971027135848999, + "loss": 0.3414, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0585817098617554, + "rewards/margins": 0.6100130081176758, + "rewards/rejected": 0.448568731546402, + "step": 7636 + }, + { + "epoch": 1.24, + "learning_rate": 3.3356742427878546e-06, + "logits/chosen": -0.9494816064834595, + "logits/rejected": -0.9808802604675293, + "logps/chosen": -39.4550895690918, + "logps/rejected": -52.165191650390625, + "loss": 1.0471, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.419810175895691, + "rewards/margins": 0.5510120987892151, + "rewards/rejected": 0.8687980771064758, + "step": 7637 + }, + { + "epoch": 1.24, + "learning_rate": 3.334434992412112e-06, + "logits/chosen": -1.2407971620559692, + "logits/rejected": -1.2407971620559692, + "logps/chosen": -70.26844787597656, + "logps/rejected": -70.26844787597656, + "loss": 1.0563, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.608344316482544, + "rewards/margins": 0.0, + "rewards/rejected": 2.608344316482544, + "step": 7638 + }, + { + "epoch": 1.24, + "learning_rate": 3.333195857111153e-06, + "logits/chosen": -1.1281051635742188, + "logits/rejected": -1.0115171670913696, + "logps/chosen": -76.25228118896484, + "logps/rejected": -31.958072662353516, + "loss": 0.0853, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8816468715667725, + "rewards/margins": 1.9436436891555786, + "rewards/rejected": 1.9380031824111938, + "step": 7639 + }, + { + "epoch": 1.24, + "learning_rate": 3.331956836970587e-06, + "logits/chosen": -1.2888474464416504, + "logits/rejected": -1.2888474464416504, + "logps/chosen": -1.748903751373291, + "logps/rejected": -1.748903751373291, + "loss": 1.0677, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.21049976348876953, + "rewards/margins": 0.0, + "rewards/rejected": 0.21049976348876953, + "step": 7640 + }, + { + "epoch": 1.24, + "learning_rate": 3.3307179320760208e-06, + "logits/chosen": -1.2794499397277832, + "logits/rejected": -1.2719287872314453, + "logps/chosen": -158.77975463867188, + "logps/rejected": -157.09197998046875, + "loss": 1.1711, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.898384094238281, + "rewards/margins": -1.7012548446655273, + "rewards/rejected": 8.599638938903809, + "step": 7641 + }, + { + "epoch": 1.24, + "learning_rate": 3.3294791425130512e-06, + "logits/chosen": -1.312487006187439, + "logits/rejected": -1.1968039274215698, + "logps/chosen": -69.40304565429688, + "logps/rejected": -25.90362548828125, + "loss": 0.1648, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.130551815032959, + "rewards/margins": 4.516061782836914, + "rewards/rejected": -0.3855098783969879, + "step": 7642 + }, + { + "epoch": 1.24, + "learning_rate": 3.3282404683672663e-06, + "logits/chosen": -1.1029212474822998, + "logits/rejected": -1.0084935426712036, + "logps/chosen": -78.05682373046875, + "logps/rejected": -67.78255462646484, + "loss": 0.1609, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6370376348495483, + "rewards/margins": 1.0376105308532715, + "rewards/rejected": 0.5994270443916321, + "step": 7643 + }, + { + "epoch": 1.24, + "learning_rate": 3.3270019097242467e-06, + "logits/chosen": -0.9556409120559692, + "logits/rejected": -0.9601806402206421, + "logps/chosen": -54.22196960449219, + "logps/rejected": -50.213863372802734, + "loss": 1.0545, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2017745971679688, + "rewards/margins": -0.41456878185272217, + "rewards/rejected": 1.616343379020691, + "step": 7644 + }, + { + "epoch": 1.24, + "learning_rate": 3.325763466669565e-06, + "logits/chosen": -0.9349093437194824, + "logits/rejected": -0.8832528591156006, + "logps/chosen": -53.46294021606445, + "logps/rejected": -72.66253662109375, + "loss": 0.2682, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3191096782684326, + "rewards/margins": 0.3588908910751343, + "rewards/rejected": 1.9602187871932983, + "step": 7645 + }, + { + "epoch": 1.24, + "learning_rate": 3.3245251392887856e-06, + "logits/chosen": -1.153369665145874, + "logits/rejected": -0.9998096227645874, + "logps/chosen": -76.66691589355469, + "logps/rejected": -61.98553466796875, + "loss": 0.8165, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.488389492034912, + "rewards/margins": 4.295768737792969, + "rewards/rejected": 3.1926209926605225, + "step": 7646 + }, + { + "epoch": 1.24, + "learning_rate": 3.3232869276674663e-06, + "logits/chosen": -1.1562477350234985, + "logits/rejected": -1.1307536363601685, + "logps/chosen": -13.230998992919922, + "logps/rejected": -4.6557135581970215, + "loss": 0.4202, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.376001238822937, + "rewards/margins": 0.4649220108985901, + "rewards/rejected": 0.9110792279243469, + "step": 7647 + }, + { + "epoch": 1.24, + "learning_rate": 3.3220488318911544e-06, + "logits/chosen": -1.5426158905029297, + "logits/rejected": -1.4213721752166748, + "logps/chosen": -63.39637756347656, + "logps/rejected": -80.3349609375, + "loss": 0.1376, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.504764080047607, + "rewards/margins": 2.174492120742798, + "rewards/rejected": 3.3302719593048096, + "step": 7648 + }, + { + "epoch": 1.24, + "learning_rate": 3.3208108520453912e-06, + "logits/chosen": -1.1671327352523804, + "logits/rejected": -1.1193231344223022, + "logps/chosen": -85.60028076171875, + "logps/rejected": -55.564353942871094, + "loss": 0.0227, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.315657138824463, + "rewards/margins": 3.0828375816345215, + "rewards/rejected": 1.2328194379806519, + "step": 7649 + }, + { + "epoch": 1.24, + "learning_rate": 3.3195729882157103e-06, + "logits/chosen": -1.4508486986160278, + "logits/rejected": -1.4051893949508667, + "logps/chosen": -140.0477752685547, + "logps/rejected": -126.2010726928711, + "loss": 2.1908, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.26338529586792, + "rewards/margins": -0.5697116851806641, + "rewards/rejected": 4.833096981048584, + "step": 7650 + }, + { + "epoch": 1.24, + "learning_rate": 3.3183352404876347e-06, + "logits/chosen": -1.3865902423858643, + "logits/rejected": -1.2405223846435547, + "logps/chosen": -137.10508728027344, + "logps/rejected": -62.007633209228516, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.685564041137695, + "rewards/margins": 8.921345710754395, + "rewards/rejected": 2.76421856880188, + "step": 7651 + }, + { + "epoch": 1.24, + "learning_rate": 3.3170976089466823e-06, + "logits/chosen": -1.1618189811706543, + "logits/rejected": -1.234449028968811, + "logps/chosen": -31.527379989624023, + "logps/rejected": -69.47631072998047, + "loss": 0.6056, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.414469003677368, + "rewards/margins": -0.6469225883483887, + "rewards/rejected": 3.061391592025757, + "step": 7652 + }, + { + "epoch": 1.24, + "learning_rate": 3.3158600936783614e-06, + "logits/chosen": -1.850953221321106, + "logits/rejected": -1.8497982025146484, + "logps/chosen": -49.89860534667969, + "logps/rejected": -83.50870513916016, + "loss": 0.8011, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.230053901672363, + "rewards/margins": -1.2853689193725586, + "rewards/rejected": 6.515422821044922, + "step": 7653 + }, + { + "epoch": 1.24, + "learning_rate": 3.3146226947681724e-06, + "logits/chosen": -1.4064420461654663, + "logits/rejected": -1.2350916862487793, + "logps/chosen": -117.9001235961914, + "logps/rejected": -41.41470718383789, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.375854015350342, + "rewards/margins": 4.199260234832764, + "rewards/rejected": 2.176593780517578, + "step": 7654 + }, + { + "epoch": 1.24, + "learning_rate": 3.3133854123016085e-06, + "logits/chosen": -1.128257155418396, + "logits/rejected": -1.111140251159668, + "logps/chosen": -25.324953079223633, + "logps/rejected": -20.786823272705078, + "loss": 0.5453, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1051667928695679, + "rewards/margins": 0.2540450692176819, + "rewards/rejected": 0.851121723651886, + "step": 7655 + }, + { + "epoch": 1.24, + "learning_rate": 3.3121482463641533e-06, + "logits/chosen": -1.687591552734375, + "logits/rejected": -1.6870341300964355, + "logps/chosen": -99.77691650390625, + "logps/rejected": -95.89022064208984, + "loss": 1.3346, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.554229736328125, + "rewards/margins": -1.3608713150024414, + "rewards/rejected": 8.915101051330566, + "step": 7656 + }, + { + "epoch": 1.24, + "learning_rate": 3.3109111970412837e-06, + "logits/chosen": -1.4997577667236328, + "logits/rejected": -1.4787431955337524, + "logps/chosen": -88.68849182128906, + "logps/rejected": -76.73577880859375, + "loss": 0.2725, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.816165447235107, + "rewards/margins": 0.9597964286804199, + "rewards/rejected": 5.8563690185546875, + "step": 7657 + }, + { + "epoch": 1.24, + "learning_rate": 3.3096742644184682e-06, + "logits/chosen": -1.4545923471450806, + "logits/rejected": -1.561671495437622, + "logps/chosen": -109.362060546875, + "logps/rejected": -124.67311096191406, + "loss": 0.9121, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5915558338165283, + "rewards/margins": -0.8468506336212158, + "rewards/rejected": 4.438406467437744, + "step": 7658 + }, + { + "epoch": 1.24, + "learning_rate": 3.308437448581167e-06, + "logits/chosen": -1.6200542449951172, + "logits/rejected": -1.6377750635147095, + "logps/chosen": -67.32354736328125, + "logps/rejected": -252.04586791992188, + "loss": 2.0318, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8708466291427612, + "rewards/margins": -3.0213775634765625, + "rewards/rejected": 4.892224311828613, + "step": 7659 + }, + { + "epoch": 1.24, + "learning_rate": 3.3072007496148323e-06, + "logits/chosen": -1.1519176959991455, + "logits/rejected": -1.0681369304656982, + "logps/chosen": -67.13973236083984, + "logps/rejected": -12.066773414611816, + "loss": 0.16, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4214775562286377, + "rewards/margins": 1.8073928356170654, + "rewards/rejected": 0.6140847206115723, + "step": 7660 + }, + { + "epoch": 1.24, + "learning_rate": 3.305964167604908e-06, + "logits/chosen": -1.1037063598632812, + "logits/rejected": -0.8002412915229797, + "logps/chosen": -82.29782104492188, + "logps/rejected": -32.68909454345703, + "loss": 0.0673, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.798739910125732, + "rewards/margins": 4.519575119018555, + "rewards/rejected": 2.2791645526885986, + "step": 7661 + }, + { + "epoch": 1.24, + "learning_rate": 3.3047277026368318e-06, + "logits/chosen": -1.1568864583969116, + "logits/rejected": -1.2560055255889893, + "logps/chosen": -29.545494079589844, + "logps/rejected": -44.89651870727539, + "loss": 3.8467, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6967315673828125, + "rewards/margins": -5.662818908691406, + "rewards/rejected": 7.359550476074219, + "step": 7662 + }, + { + "epoch": 1.24, + "learning_rate": 3.3034913547960297e-06, + "logits/chosen": -0.8790771961212158, + "logits/rejected": -0.9287693500518799, + "logps/chosen": -57.754638671875, + "logps/rejected": -52.0903205871582, + "loss": 1.8313, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.590815782546997, + "rewards/margins": 0.4950413703918457, + "rewards/rejected": 2.0957744121551514, + "step": 7663 + }, + { + "epoch": 1.24, + "learning_rate": 3.3022551241679226e-06, + "logits/chosen": -0.7065014243125916, + "logits/rejected": -0.7477826476097107, + "logps/chosen": -77.25520324707031, + "logps/rejected": -61.021751403808594, + "loss": 1.7244, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3855164051055908, + "rewards/margins": -2.186932325363159, + "rewards/rejected": 3.57244873046875, + "step": 7664 + }, + { + "epoch": 1.24, + "learning_rate": 3.3010190108379214e-06, + "logits/chosen": -1.1362528800964355, + "logits/rejected": -1.1306809186935425, + "logps/chosen": -72.30087280273438, + "logps/rejected": -84.56982421875, + "loss": 0.2931, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7084106802940369, + "rewards/margins": 0.30355456471443176, + "rewards/rejected": 0.4048561155796051, + "step": 7665 + }, + { + "epoch": 1.24, + "learning_rate": 3.2997830148914316e-06, + "logits/chosen": -1.2010995149612427, + "logits/rejected": -1.2108432054519653, + "logps/chosen": -50.58424758911133, + "logps/rejected": -107.69955444335938, + "loss": 0.6538, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.080352544784546, + "rewards/margins": 0.9578022956848145, + "rewards/rejected": 1.1225502490997314, + "step": 7666 + }, + { + "epoch": 1.24, + "learning_rate": 3.2985471364138477e-06, + "logits/chosen": -1.326012134552002, + "logits/rejected": -1.2581866979599, + "logps/chosen": -151.30531311035156, + "logps/rejected": -73.77799987792969, + "loss": 0.4289, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.249264717102051, + "rewards/margins": 0.2623748779296875, + "rewards/rejected": 6.986889839172363, + "step": 7667 + }, + { + "epoch": 1.24, + "learning_rate": 3.297311375490557e-06, + "logits/chosen": -0.9540777206420898, + "logits/rejected": -0.9540777206420898, + "logps/chosen": -0.6125218272209167, + "logps/rejected": -0.6125218272209167, + "loss": 0.8073, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.08700893819332123, + "rewards/margins": 0.0, + "rewards/rejected": 0.08700893819332123, + "step": 7668 + }, + { + "epoch": 1.24, + "learning_rate": 3.2960757322069405e-06, + "logits/chosen": -1.4838674068450928, + "logits/rejected": -1.4097880125045776, + "logps/chosen": -112.13258361816406, + "logps/rejected": -45.4330940246582, + "loss": 0.8613, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.455705404281616, + "rewards/margins": -0.4167659282684326, + "rewards/rejected": 2.872471332550049, + "step": 7669 + }, + { + "epoch": 1.24, + "learning_rate": 3.2948402066483666e-06, + "logits/chosen": -1.549813151359558, + "logits/rejected": -1.5556753873825073, + "logps/chosen": -95.64082336425781, + "logps/rejected": -84.80300903320312, + "loss": 1.0908, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.832348585128784, + "rewards/margins": 1.190537929534912, + "rewards/rejected": 2.641810655593872, + "step": 7670 + }, + { + "epoch": 1.25, + "learning_rate": 3.2936047989002007e-06, + "logits/chosen": -1.3617421388626099, + "logits/rejected": -1.3374296426773071, + "logps/chosen": -72.71195983886719, + "logps/rejected": -69.77603149414062, + "loss": 0.3185, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8861572742462158, + "rewards/margins": 1.2429261207580566, + "rewards/rejected": 0.643231213092804, + "step": 7671 + }, + { + "epoch": 1.25, + "learning_rate": 3.2923695090477975e-06, + "logits/chosen": -1.4569982290267944, + "logits/rejected": -1.325409173965454, + "logps/chosen": -105.44384765625, + "logps/rejected": -36.374732971191406, + "loss": 0.0763, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.123532295227051, + "rewards/margins": 4.7139458656311035, + "rewards/rejected": 0.4095863401889801, + "step": 7672 + }, + { + "epoch": 1.25, + "learning_rate": 3.291134337176503e-06, + "logits/chosen": -0.9886152744293213, + "logits/rejected": -0.9227168560028076, + "logps/chosen": -65.84503173828125, + "logps/rejected": -75.520263671875, + "loss": 0.4104, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1964669227600098, + "rewards/margins": -0.21109843254089355, + "rewards/rejected": 2.4075653553009033, + "step": 7673 + }, + { + "epoch": 1.25, + "learning_rate": 3.289899283371657e-06, + "logits/chosen": -1.2400823831558228, + "logits/rejected": -1.2705130577087402, + "logps/chosen": -206.03366088867188, + "logps/rejected": -105.13509368896484, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.786520481109619, + "rewards/margins": 4.687357425689697, + "rewards/rejected": 1.0991630554199219, + "step": 7674 + }, + { + "epoch": 1.25, + "learning_rate": 3.2886643477185875e-06, + "logits/chosen": -1.2973552942276, + "logits/rejected": -1.0783166885375977, + "logps/chosen": -87.97566223144531, + "logps/rejected": -44.53190612792969, + "loss": 0.0313, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.229588508605957, + "rewards/margins": 2.7762582302093506, + "rewards/rejected": 2.4533302783966064, + "step": 7675 + }, + { + "epoch": 1.25, + "learning_rate": 3.2874295303026206e-06, + "logits/chosen": -1.0779163837432861, + "logits/rejected": -0.9332080483436584, + "logps/chosen": -63.58500289916992, + "logps/rejected": -10.062458992004395, + "loss": 1.2123, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.539344310760498, + "rewards/margins": 4.047730445861816, + "rewards/rejected": 0.4916136860847473, + "step": 7676 + }, + { + "epoch": 1.25, + "learning_rate": 3.286194831209068e-06, + "logits/chosen": -1.0465753078460693, + "logits/rejected": -0.8672926425933838, + "logps/chosen": -69.07765197753906, + "logps/rejected": -15.554137229919434, + "loss": 0.1164, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.592552185058594, + "rewards/margins": 4.679091930389404, + "rewards/rejected": 1.9134601354599, + "step": 7677 + }, + { + "epoch": 1.25, + "learning_rate": 3.284960250523237e-06, + "logits/chosen": -1.0816946029663086, + "logits/rejected": -1.0975022315979004, + "logps/chosen": -72.32633209228516, + "logps/rejected": -88.7558822631836, + "loss": 0.4168, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0216667652130127, + "rewards/margins": 1.4890679121017456, + "rewards/rejected": 1.532598853111267, + "step": 7678 + }, + { + "epoch": 1.25, + "learning_rate": 3.283725788330424e-06, + "logits/chosen": -1.089808702468872, + "logits/rejected": -1.166386365890503, + "logps/chosen": -113.54117584228516, + "logps/rejected": -139.0958251953125, + "loss": 0.8059, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.370943546295166, + "rewards/margins": -1.3642187118530273, + "rewards/rejected": 4.735162258148193, + "step": 7679 + }, + { + "epoch": 1.25, + "learning_rate": 3.2824914447159218e-06, + "logits/chosen": -1.5972522497177124, + "logits/rejected": -1.5710418224334717, + "logps/chosen": -76.53132629394531, + "logps/rejected": -56.001686096191406, + "loss": 0.3013, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.656585693359375, + "rewards/margins": 0.5131721496582031, + "rewards/rejected": 3.143413543701172, + "step": 7680 + }, + { + "epoch": 1.25, + "learning_rate": 3.281257219765008e-06, + "logits/chosen": -1.0565186738967896, + "logits/rejected": -1.0693447589874268, + "logps/chosen": -95.10826873779297, + "logps/rejected": -175.5093536376953, + "loss": 1.4875, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.570777177810669, + "rewards/margins": 0.6398705244064331, + "rewards/rejected": 1.9309066534042358, + "step": 7681 + }, + { + "epoch": 1.25, + "learning_rate": 3.280023113562957e-06, + "logits/chosen": -1.4603712558746338, + "logits/rejected": -1.4298409223556519, + "logps/chosen": -86.17385864257812, + "logps/rejected": -45.896697998046875, + "loss": 0.5624, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.339157819747925, + "rewards/margins": 0.18186402320861816, + "rewards/rejected": 3.1572937965393066, + "step": 7682 + }, + { + "epoch": 1.25, + "learning_rate": 3.2787891261950344e-06, + "logits/chosen": -1.1440036296844482, + "logits/rejected": -1.1145485639572144, + "logps/chosen": -57.68408966064453, + "logps/rejected": -74.63520812988281, + "loss": 0.7361, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0582756996154785, + "rewards/margins": -1.205197811126709, + "rewards/rejected": 3.2634735107421875, + "step": 7683 + }, + { + "epoch": 1.25, + "learning_rate": 3.2775552577464973e-06, + "logits/chosen": -1.521096110343933, + "logits/rejected": -1.5784122943878174, + "logps/chosen": -79.74722290039062, + "logps/rejected": -129.45994567871094, + "loss": 0.9997, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.580356121063232, + "rewards/margins": -1.7444806098937988, + "rewards/rejected": 8.324836730957031, + "step": 7684 + }, + { + "epoch": 1.25, + "learning_rate": 3.2763215083025936e-06, + "logits/chosen": -0.9081538319587708, + "logits/rejected": -0.9355801343917847, + "logps/chosen": -70.5652847290039, + "logps/rejected": -77.15864562988281, + "loss": 2.4032, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3160622119903564, + "rewards/margins": 0.8325218558311462, + "rewards/rejected": 0.4835403561592102, + "step": 7685 + }, + { + "epoch": 1.25, + "learning_rate": 3.275087877948564e-06, + "logits/chosen": -1.2221410274505615, + "logits/rejected": -0.8580423593521118, + "logps/chosen": -107.79961395263672, + "logps/rejected": -38.4866943359375, + "loss": 0.0302, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.422577857971191, + "rewards/margins": 4.604917049407959, + "rewards/rejected": 0.817660927772522, + "step": 7686 + }, + { + "epoch": 1.25, + "learning_rate": 3.2738543667696408e-06, + "logits/chosen": -1.0552762746810913, + "logits/rejected": -1.063710331916809, + "logps/chosen": -15.262456893920898, + "logps/rejected": -10.297788619995117, + "loss": 0.2811, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3465280532836914, + "rewards/margins": 0.49009430408477783, + "rewards/rejected": -0.14356623589992523, + "step": 7687 + }, + { + "epoch": 1.25, + "learning_rate": 3.2726209748510477e-06, + "logits/chosen": -1.3890095949172974, + "logits/rejected": -1.4894529581069946, + "logps/chosen": -233.79421997070312, + "logps/rejected": -108.2947769165039, + "loss": 0.4683, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.8473968505859375, + "rewards/margins": -0.3647332191467285, + "rewards/rejected": 6.212130069732666, + "step": 7688 + }, + { + "epoch": 1.25, + "learning_rate": 3.271387702278001e-06, + "logits/chosen": -1.2288429737091064, + "logits/rejected": -1.1145638227462769, + "logps/chosen": -52.302146911621094, + "logps/rejected": -22.095563888549805, + "loss": 0.0746, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7609269618988037, + "rewards/margins": 1.8385974168777466, + "rewards/rejected": 0.9223295450210571, + "step": 7689 + }, + { + "epoch": 1.25, + "learning_rate": 3.270154549135708e-06, + "logits/chosen": -1.2161760330200195, + "logits/rejected": -1.1672366857528687, + "logps/chosen": -75.06253051757812, + "logps/rejected": -40.42936706542969, + "loss": 0.131, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5595695972442627, + "rewards/margins": 1.2780418395996094, + "rewards/rejected": 1.2815277576446533, + "step": 7690 + }, + { + "epoch": 1.25, + "learning_rate": 3.2689215155093675e-06, + "logits/chosen": -1.3242629766464233, + "logits/rejected": -1.3089932203292847, + "logps/chosen": -129.85325622558594, + "logps/rejected": -20.839183807373047, + "loss": 0.1353, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.339019775390625, + "rewards/margins": 5.782156467437744, + "rewards/rejected": 0.5568634271621704, + "step": 7691 + }, + { + "epoch": 1.25, + "learning_rate": 3.26768860148417e-06, + "logits/chosen": -1.2935981750488281, + "logits/rejected": -1.2501837015151978, + "logps/chosen": -76.78889465332031, + "logps/rejected": -56.44407653808594, + "loss": 0.3191, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0600953102111816, + "rewards/margins": 1.640683889389038, + "rewards/rejected": 0.41941148042678833, + "step": 7692 + }, + { + "epoch": 1.25, + "learning_rate": 3.2664558071453e-06, + "logits/chosen": -0.8859999775886536, + "logits/rejected": -0.8781256675720215, + "logps/chosen": -44.337039947509766, + "logps/rejected": -61.496421813964844, + "loss": 0.3069, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.26665997505188, + "rewards/margins": 1.2861912250518799, + "rewards/rejected": 0.98046875, + "step": 7693 + }, + { + "epoch": 1.25, + "learning_rate": 3.26522313257793e-06, + "logits/chosen": -1.3538941144943237, + "logits/rejected": -1.3538941144943237, + "logps/chosen": -45.4794921875, + "logps/rejected": -45.4794921875, + "loss": 0.4222, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.3754167556762695, + "rewards/margins": 0.0, + "rewards/rejected": 5.3754167556762695, + "step": 7694 + }, + { + "epoch": 1.25, + "learning_rate": 3.263990577867227e-06, + "logits/chosen": -1.517203450202942, + "logits/rejected": -1.4653736352920532, + "logps/chosen": -94.77845764160156, + "logps/rejected": -51.22882843017578, + "loss": 0.1097, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7001755237579346, + "rewards/margins": 2.057697296142578, + "rewards/rejected": 1.6424782276153564, + "step": 7695 + }, + { + "epoch": 1.25, + "learning_rate": 3.262758143098348e-06, + "logits/chosen": -1.2646774053573608, + "logits/rejected": -1.3025764226913452, + "logps/chosen": -74.63310241699219, + "logps/rejected": -144.65191650390625, + "loss": 0.6905, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.022714376449585, + "rewards/margins": 0.1675262451171875, + "rewards/rejected": 2.8551881313323975, + "step": 7696 + }, + { + "epoch": 1.25, + "learning_rate": 3.2615258283564443e-06, + "logits/chosen": -1.0741297006607056, + "logits/rejected": -1.1176084280014038, + "logps/chosen": -20.69152069091797, + "logps/rejected": -38.23836135864258, + "loss": 0.7368, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8103755712509155, + "rewards/margins": -0.26892411708831787, + "rewards/rejected": 2.0792996883392334, + "step": 7697 + }, + { + "epoch": 1.25, + "learning_rate": 3.260293633726656e-06, + "logits/chosen": -1.1451666355133057, + "logits/rejected": -1.1116712093353271, + "logps/chosen": -58.410118103027344, + "logps/rejected": -48.6917724609375, + "loss": 0.6733, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9399116635322571, + "rewards/margins": -0.8905006051063538, + "rewards/rejected": 1.8304122686386108, + "step": 7698 + }, + { + "epoch": 1.25, + "learning_rate": 3.259061559294117e-06, + "logits/chosen": -1.4868712425231934, + "logits/rejected": -1.445430040359497, + "logps/chosen": -88.46916198730469, + "logps/rejected": -64.21959686279297, + "loss": 0.1731, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.720046281814575, + "rewards/margins": 0.9702361822128296, + "rewards/rejected": 1.7498100996017456, + "step": 7699 + }, + { + "epoch": 1.25, + "learning_rate": 3.257829605143951e-06, + "logits/chosen": -1.098127007484436, + "logits/rejected": -1.1278676986694336, + "logps/chosen": -79.51074981689453, + "logps/rejected": -90.7225341796875, + "loss": 0.3562, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2574188709259033, + "rewards/margins": 2.0814247131347656, + "rewards/rejected": 1.1759941577911377, + "step": 7700 + }, + { + "epoch": 1.25, + "learning_rate": 3.256597771361274e-06, + "logits/chosen": -1.2983500957489014, + "logits/rejected": -1.2907941341400146, + "logps/chosen": -89.3363265991211, + "logps/rejected": -97.929443359375, + "loss": 0.3028, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4132080078125, + "rewards/margins": 0.18398737907409668, + "rewards/rejected": 1.2292206287384033, + "step": 7701 + }, + { + "epoch": 1.25, + "learning_rate": 3.255366058031196e-06, + "logits/chosen": -0.9578677415847778, + "logits/rejected": -0.9168121814727783, + "logps/chosen": -56.519901275634766, + "logps/rejected": -72.26576232910156, + "loss": 0.8804, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4765194654464722, + "rewards/margins": -0.11736869812011719, + "rewards/rejected": 1.5938881635665894, + "step": 7702 + }, + { + "epoch": 1.25, + "learning_rate": 3.2541344652388153e-06, + "logits/chosen": -1.3733359575271606, + "logits/rejected": -1.3282499313354492, + "logps/chosen": -65.87022399902344, + "logps/rejected": -17.352947235107422, + "loss": 0.1686, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0276811122894287, + "rewards/margins": 1.5149403810501099, + "rewards/rejected": 0.5127407312393188, + "step": 7703 + }, + { + "epoch": 1.25, + "learning_rate": 3.2529029930692225e-06, + "logits/chosen": -1.1280344724655151, + "logits/rejected": -1.228939175605774, + "logps/chosen": -87.41273498535156, + "logps/rejected": -98.25376892089844, + "loss": 0.3717, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9205613136291504, + "rewards/margins": 0.0135040283203125, + "rewards/rejected": 2.907057285308838, + "step": 7704 + }, + { + "epoch": 1.25, + "learning_rate": 3.2516716416075023e-06, + "logits/chosen": -1.1894721984863281, + "logits/rejected": -1.1894721984863281, + "logps/chosen": -15.32568073272705, + "logps/rejected": -15.32568073272705, + "loss": 0.3607, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3288846015930176, + "rewards/margins": 0.0, + "rewards/rejected": 2.3288846015930176, + "step": 7705 + }, + { + "epoch": 1.25, + "learning_rate": 3.250440410938729e-06, + "logits/chosen": -1.2012850046157837, + "logits/rejected": -1.2171331644058228, + "logps/chosen": -167.49705505371094, + "logps/rejected": -104.05279541015625, + "loss": 1.1193, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.236934185028076, + "rewards/margins": 2.6770341396331787, + "rewards/rejected": 3.5599000453948975, + "step": 7706 + }, + { + "epoch": 1.25, + "learning_rate": 3.2492093011479685e-06, + "logits/chosen": -1.413772702217102, + "logits/rejected": -1.5501772165298462, + "logps/chosen": -63.44132614135742, + "logps/rejected": -36.717918395996094, + "loss": 0.7524, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3928120136260986, + "rewards/margins": 3.1424922943115234, + "rewards/rejected": 0.2503196895122528, + "step": 7707 + }, + { + "epoch": 1.25, + "learning_rate": 3.2479783123202794e-06, + "logits/chosen": -1.1773052215576172, + "logits/rejected": -1.1054542064666748, + "logps/chosen": -51.39271926879883, + "logps/rejected": -56.01246643066406, + "loss": 0.1254, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.848479986190796, + "rewards/margins": 1.4014835357666016, + "rewards/rejected": 2.4469964504241943, + "step": 7708 + }, + { + "epoch": 1.25, + "learning_rate": 3.246747444540711e-06, + "logits/chosen": -1.4909588098526, + "logits/rejected": -1.474628210067749, + "logps/chosen": -232.077392578125, + "logps/rejected": -66.821044921875, + "loss": 0.2657, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0740630626678467, + "rewards/margins": 0.3676718473434448, + "rewards/rejected": 1.7063912153244019, + "step": 7709 + }, + { + "epoch": 1.25, + "learning_rate": 3.2455166978943054e-06, + "logits/chosen": -1.3357406854629517, + "logits/rejected": -1.3646515607833862, + "logps/chosen": -50.73433303833008, + "logps/rejected": -36.56642532348633, + "loss": 0.3744, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.678657293319702, + "rewards/margins": -0.0938558578491211, + "rewards/rejected": 2.7725131511688232, + "step": 7710 + }, + { + "epoch": 1.25, + "learning_rate": 3.244286072466094e-06, + "logits/chosen": -1.4289213418960571, + "logits/rejected": -1.3923522233963013, + "logps/chosen": -44.23577880859375, + "logps/rejected": -50.75545120239258, + "loss": 0.2613, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4811837673187256, + "rewards/margins": 0.45035457611083984, + "rewards/rejected": 2.0308291912078857, + "step": 7711 + }, + { + "epoch": 1.25, + "learning_rate": 3.2430555683411024e-06, + "logits/chosen": -1.2287942171096802, + "logits/rejected": -1.2287942171096802, + "logps/chosen": -36.15800476074219, + "logps/rejected": -36.15800476074219, + "loss": 0.8316, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0727975368499756, + "rewards/margins": 0.0, + "rewards/rejected": 2.0727975368499756, + "step": 7712 + }, + { + "epoch": 1.25, + "learning_rate": 3.241825185604347e-06, + "logits/chosen": -0.9266319870948792, + "logits/rejected": -0.8933059573173523, + "logps/chosen": -34.70359802246094, + "logps/rejected": -49.75592803955078, + "loss": 0.1577, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.212519884109497, + "rewards/margins": 1.0957283973693848, + "rewards/rejected": 0.11679153889417648, + "step": 7713 + }, + { + "epoch": 1.25, + "learning_rate": 3.240594924340835e-06, + "logits/chosen": -1.5311719179153442, + "logits/rejected": -1.5133744478225708, + "logps/chosen": -27.589006423950195, + "logps/rejected": -34.05549621582031, + "loss": 0.2962, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.59181809425354, + "rewards/margins": 1.089781403541565, + "rewards/rejected": 0.5020366907119751, + "step": 7714 + }, + { + "epoch": 1.25, + "learning_rate": 3.239364784635567e-06, + "logits/chosen": -1.1671128273010254, + "logits/rejected": -1.0310254096984863, + "logps/chosen": -41.40301513671875, + "logps/rejected": -34.888702392578125, + "loss": 0.3581, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.866485595703125, + "rewards/margins": 0.7198673486709595, + "rewards/rejected": 1.1466182470321655, + "step": 7715 + }, + { + "epoch": 1.25, + "learning_rate": 3.238134766573532e-06, + "logits/chosen": -2.1839966773986816, + "logits/rejected": -2.229914665222168, + "logps/chosen": -247.8953857421875, + "logps/rejected": -126.68684387207031, + "loss": 0.1415, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.393780708312988, + "rewards/margins": 1.1588211059570312, + "rewards/rejected": 6.234959602355957, + "step": 7716 + }, + { + "epoch": 1.25, + "learning_rate": 3.2369048702397145e-06, + "logits/chosen": -1.597021460533142, + "logits/rejected": -1.4671070575714111, + "logps/chosen": -95.91886138916016, + "logps/rejected": -22.68294334411621, + "loss": 0.0389, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1284539699554443, + "rewards/margins": 2.6591124534606934, + "rewards/rejected": 0.4693414866924286, + "step": 7717 + }, + { + "epoch": 1.25, + "learning_rate": 3.2356750957190867e-06, + "logits/chosen": -1.3322654962539673, + "logits/rejected": -1.2977981567382812, + "logps/chosen": -118.10890197753906, + "logps/rejected": -89.0046615600586, + "loss": 0.0947, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1422364711761475, + "rewards/margins": 1.639737844467163, + "rewards/rejected": 1.5024986267089844, + "step": 7718 + }, + { + "epoch": 1.25, + "learning_rate": 3.2344454430966165e-06, + "logits/chosen": -1.1665014028549194, + "logits/rejected": -1.2811397314071655, + "logps/chosen": -47.212127685546875, + "logps/rejected": -75.59933471679688, + "loss": 3.2957, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4031479358673096, + "rewards/margins": -3.9806487560272217, + "rewards/rejected": 7.383796691894531, + "step": 7719 + }, + { + "epoch": 1.25, + "learning_rate": 3.2332159124572605e-06, + "logits/chosen": -1.4934762716293335, + "logits/rejected": -1.43950617313385, + "logps/chosen": -55.336570739746094, + "logps/rejected": -51.54020690917969, + "loss": 0.6558, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.944857120513916, + "rewards/margins": 0.1555030345916748, + "rewards/rejected": 2.789354085922241, + "step": 7720 + }, + { + "epoch": 1.25, + "learning_rate": 3.2319865038859664e-06, + "logits/chosen": -1.2778306007385254, + "logits/rejected": -1.1921498775482178, + "logps/chosen": -41.380760192871094, + "logps/rejected": -66.82899475097656, + "loss": 0.3179, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.134629011154175, + "rewards/margins": 0.18747782707214355, + "rewards/rejected": 2.9471511840820312, + "step": 7721 + }, + { + "epoch": 1.25, + "learning_rate": 3.230757217467677e-06, + "logits/chosen": -1.491678237915039, + "logits/rejected": -1.4822443723678589, + "logps/chosen": -82.16094970703125, + "logps/rejected": -67.77783203125, + "loss": 0.2818, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.005549669265747, + "rewards/margins": 0.48506927490234375, + "rewards/rejected": 2.5204803943634033, + "step": 7722 + }, + { + "epoch": 1.25, + "learning_rate": 3.229528053287323e-06, + "logits/chosen": -1.506422758102417, + "logits/rejected": -1.4300512075424194, + "logps/chosen": -45.738853454589844, + "logps/rejected": -59.10126495361328, + "loss": 0.2011, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.103450059890747, + "rewards/margins": 1.2007217407226562, + "rewards/rejected": 0.902728259563446, + "step": 7723 + }, + { + "epoch": 1.25, + "learning_rate": 3.2282990114298273e-06, + "logits/chosen": -1.1229605674743652, + "logits/rejected": -1.1229605674743652, + "logps/chosen": -79.81063842773438, + "logps/rejected": -79.81063842773438, + "loss": 0.9077, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3569748401641846, + "rewards/margins": 0.0, + "rewards/rejected": 2.3569748401641846, + "step": 7724 + }, + { + "epoch": 1.25, + "learning_rate": 3.227070091980107e-06, + "logits/chosen": -0.7344436645507812, + "logits/rejected": -0.7355678677558899, + "logps/chosen": -2.5962414741516113, + "logps/rejected": -2.4117469787597656, + "loss": 0.4277, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.17084288597106934, + "rewards/margins": -0.013144642114639282, + "rewards/rejected": 0.18398752808570862, + "step": 7725 + }, + { + "epoch": 1.25, + "learning_rate": 3.2258412950230665e-06, + "logits/chosen": -1.1657956838607788, + "logits/rejected": -1.1657956838607788, + "logps/chosen": -25.935264587402344, + "logps/rejected": -25.935264587402344, + "loss": 0.8952, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1799607276916504, + "rewards/margins": 0.0, + "rewards/rejected": 3.1799607276916504, + "step": 7726 + }, + { + "epoch": 1.25, + "learning_rate": 3.2246126206436066e-06, + "logits/chosen": -1.134351134300232, + "logits/rejected": -1.2054238319396973, + "logps/chosen": -66.10938262939453, + "logps/rejected": -85.18126678466797, + "loss": 1.3703, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1685197353363037, + "rewards/margins": -2.521425485610962, + "rewards/rejected": 5.689945220947266, + "step": 7727 + }, + { + "epoch": 1.25, + "learning_rate": 3.223384068926615e-06, + "logits/chosen": -1.2538222074508667, + "logits/rejected": -1.1781963109970093, + "logps/chosen": -89.13629150390625, + "logps/rejected": -56.05562210083008, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.930652141571045, + "rewards/margins": 5.678731441497803, + "rewards/rejected": 2.251920700073242, + "step": 7728 + }, + { + "epoch": 1.25, + "learning_rate": 3.2221556399569744e-06, + "logits/chosen": -1.4730969667434692, + "logits/rejected": -1.380595326423645, + "logps/chosen": -66.96421813964844, + "logps/rejected": -31.010150909423828, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.073502540588379, + "rewards/margins": 3.4608850479125977, + "rewards/rejected": 2.6126174926757812, + "step": 7729 + }, + { + "epoch": 1.25, + "learning_rate": 3.2209273338195555e-06, + "logits/chosen": -1.5478878021240234, + "logits/rejected": -1.4860186576843262, + "logps/chosen": -99.17134857177734, + "logps/rejected": -88.33175659179688, + "loss": 0.2267, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.581583499908447, + "rewards/margins": 0.663642168045044, + "rewards/rejected": 3.9179413318634033, + "step": 7730 + }, + { + "epoch": 1.25, + "learning_rate": 3.219699150599224e-06, + "logits/chosen": -1.4268643856048584, + "logits/rejected": -1.4069578647613525, + "logps/chosen": -49.7752685546875, + "logps/rejected": -734.7334594726562, + "loss": 58.8896, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.844468593597412, + "rewards/margins": -117.77194213867188, + "rewards/rejected": 122.61640930175781, + "step": 7731 + }, + { + "epoch": 1.25, + "learning_rate": 3.2184710903808367e-06, + "logits/chosen": -1.3325756788253784, + "logits/rejected": -1.3537887334823608, + "logps/chosen": -118.7423095703125, + "logps/rejected": -88.008056640625, + "loss": 0.1253, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.471452236175537, + "rewards/margins": 1.29520845413208, + "rewards/rejected": 5.176243782043457, + "step": 7732 + }, + { + "epoch": 1.26, + "learning_rate": 3.217243153249241e-06, + "logits/chosen": -1.1391710042953491, + "logits/rejected": -1.2404766082763672, + "logps/chosen": -63.67228698730469, + "logps/rejected": -115.57954406738281, + "loss": 1.4121, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5182290077209473, + "rewards/margins": -2.3959174156188965, + "rewards/rejected": 4.914146423339844, + "step": 7733 + }, + { + "epoch": 1.26, + "learning_rate": 3.2160153392892737e-06, + "logits/chosen": -1.2292394638061523, + "logits/rejected": -1.307453989982605, + "logps/chosen": -39.06103515625, + "logps/rejected": -128.92868041992188, + "loss": 0.8525, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.848402500152588, + "rewards/margins": -1.5009551048278809, + "rewards/rejected": 6.349357604980469, + "step": 7734 + }, + { + "epoch": 1.26, + "learning_rate": 3.2147876485857667e-06, + "logits/chosen": -1.029685616493225, + "logits/rejected": -0.9198436141014099, + "logps/chosen": -62.75032043457031, + "logps/rejected": -65.9666976928711, + "loss": 0.1682, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0519180297851562, + "rewards/margins": 1.7418166399002075, + "rewards/rejected": 1.3101013898849487, + "step": 7735 + }, + { + "epoch": 1.26, + "learning_rate": 3.2135600812235413e-06, + "logits/chosen": -1.1616631746292114, + "logits/rejected": -1.1616631746292114, + "logps/chosen": -36.4751091003418, + "logps/rejected": -36.4751091003418, + "loss": 0.9822, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.80127215385437, + "rewards/margins": 0.0, + "rewards/rejected": 2.80127215385437, + "step": 7736 + }, + { + "epoch": 1.26, + "learning_rate": 3.2123326372874104e-06, + "logits/chosen": -1.0425060987472534, + "logits/rejected": -0.9774593710899353, + "logps/chosen": -46.925079345703125, + "logps/rejected": -61.197967529296875, + "loss": 0.4765, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8930892944335938, + "rewards/margins": -0.4629502296447754, + "rewards/rejected": 2.356039524078369, + "step": 7737 + }, + { + "epoch": 1.26, + "learning_rate": 3.21110531686218e-06, + "logits/chosen": -1.2996642589569092, + "logits/rejected": -1.3955204486846924, + "logps/chosen": -40.907508850097656, + "logps/rejected": -96.39568328857422, + "loss": 1.1163, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5806548595428467, + "rewards/margins": -1.2979745864868164, + "rewards/rejected": 3.878629446029663, + "step": 7738 + }, + { + "epoch": 1.26, + "learning_rate": 3.2098781200326444e-06, + "logits/chosen": -0.8792919516563416, + "logits/rejected": -0.8792919516563416, + "logps/chosen": -41.12961959838867, + "logps/rejected": -41.12961959838867, + "loss": 0.3489, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.540098190307617, + "rewards/margins": 0.0, + "rewards/rejected": 2.540098190307617, + "step": 7739 + }, + { + "epoch": 1.26, + "learning_rate": 3.208651046883593e-06, + "logits/chosen": -1.1180552244186401, + "logits/rejected": -1.1180552244186401, + "logps/chosen": -29.058258056640625, + "logps/rejected": -29.058258056640625, + "loss": 0.4035, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9057640433311462, + "rewards/margins": 0.0, + "rewards/rejected": 0.9057640433311462, + "step": 7740 + }, + { + "epoch": 1.26, + "learning_rate": 3.207424097499805e-06, + "logits/chosen": -1.1533645391464233, + "logits/rejected": -1.1253037452697754, + "logps/chosen": -54.549591064453125, + "logps/rejected": -50.52836608886719, + "loss": 0.9622, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7185882329940796, + "rewards/margins": -1.7550424337387085, + "rewards/rejected": 3.473630666732788, + "step": 7741 + }, + { + "epoch": 1.26, + "learning_rate": 3.2061972719660494e-06, + "logits/chosen": -1.6910382509231567, + "logits/rejected": -1.6672849655151367, + "logps/chosen": -111.37077331542969, + "logps/rejected": -80.16011810302734, + "loss": 0.0619, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.706437826156616, + "rewards/margins": 2.0290870666503906, + "rewards/rejected": 0.677350640296936, + "step": 7742 + }, + { + "epoch": 1.26, + "learning_rate": 3.2049705703670897e-06, + "logits/chosen": -1.1845186948776245, + "logits/rejected": -1.1695903539657593, + "logps/chosen": -37.495765686035156, + "logps/rejected": -59.74168395996094, + "loss": 1.0084, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.137303113937378, + "rewards/margins": -0.5935616493225098, + "rewards/rejected": 2.7308647632598877, + "step": 7743 + }, + { + "epoch": 1.26, + "learning_rate": 3.203743992787679e-06, + "logits/chosen": -1.4096519947052002, + "logits/rejected": -1.4488884210586548, + "logps/chosen": -136.36041259765625, + "logps/rejected": -166.505126953125, + "loss": 0.2427, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.257016181945801, + "rewards/margins": 0.5241866111755371, + "rewards/rejected": 5.732829570770264, + "step": 7744 + }, + { + "epoch": 1.26, + "learning_rate": 3.202517539312561e-06, + "logits/chosen": -1.146920084953308, + "logits/rejected": -1.176432490348816, + "logps/chosen": -213.12057495117188, + "logps/rejected": -183.53497314453125, + "loss": 0.0203, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.418252468109131, + "rewards/margins": 3.5706939697265625, + "rewards/rejected": 1.847558617591858, + "step": 7745 + }, + { + "epoch": 1.26, + "learning_rate": 3.2012912100264743e-06, + "logits/chosen": -1.382633924484253, + "logits/rejected": -1.3562257289886475, + "logps/chosen": -32.57220458984375, + "logps/rejected": -18.602127075195312, + "loss": 0.4743, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3209900856018066, + "rewards/margins": -0.1674938201904297, + "rewards/rejected": 2.4884839057922363, + "step": 7746 + }, + { + "epoch": 1.26, + "learning_rate": 3.2000650050141447e-06, + "logits/chosen": -1.069056510925293, + "logits/rejected": -1.0602327585220337, + "logps/chosen": -6.566004753112793, + "logps/rejected": -4.883980751037598, + "loss": 1.9953, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.05378217622637749, + "rewards/margins": -0.33482497930526733, + "rewards/rejected": 0.28104281425476074, + "step": 7747 + }, + { + "epoch": 1.26, + "learning_rate": 3.1988389243602926e-06, + "logits/chosen": -1.3247483968734741, + "logits/rejected": -1.297686219215393, + "logps/chosen": -76.83384704589844, + "logps/rejected": -66.49423217773438, + "loss": 0.2349, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8509368896484375, + "rewards/margins": 1.3544098138809204, + "rewards/rejected": 1.496527075767517, + "step": 7748 + }, + { + "epoch": 1.26, + "learning_rate": 3.197612968149628e-06, + "logits/chosen": -1.5703643560409546, + "logits/rejected": -1.5703643560409546, + "logps/chosen": -72.3812484741211, + "logps/rejected": -72.3812484741211, + "loss": 0.3549, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.691547393798828, + "rewards/margins": 0.0, + "rewards/rejected": 4.691547393798828, + "step": 7749 + }, + { + "epoch": 1.26, + "learning_rate": 3.196387136466853e-06, + "logits/chosen": -1.7685737609863281, + "logits/rejected": -1.7292590141296387, + "logps/chosen": -75.76382446289062, + "logps/rejected": -10.4534273147583, + "loss": 0.1692, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2647018432617188, + "rewards/margins": 1.9048879146575928, + "rewards/rejected": 0.35981398820877075, + "step": 7750 + }, + { + "epoch": 1.26, + "learning_rate": 3.1951614293966605e-06, + "logits/chosen": -0.9456852674484253, + "logits/rejected": -1.000213861465454, + "logps/chosen": -93.37980651855469, + "logps/rejected": -55.398658752441406, + "loss": 0.9569, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8117141723632812, + "rewards/margins": -0.8711221218109131, + "rewards/rejected": 2.6828362941741943, + "step": 7751 + }, + { + "epoch": 1.26, + "learning_rate": 3.1939358470237366e-06, + "logits/chosen": -1.3713159561157227, + "logits/rejected": -1.1863434314727783, + "logps/chosen": -106.37846374511719, + "logps/rejected": -44.753990173339844, + "loss": 0.4087, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.7762649059295654, + "rewards/margins": -0.21782779693603516, + "rewards/rejected": 3.9940927028656006, + "step": 7752 + }, + { + "epoch": 1.26, + "learning_rate": 3.1927103894327553e-06, + "logits/chosen": -1.2976831197738647, + "logits/rejected": -1.255115032196045, + "logps/chosen": -56.62015151977539, + "logps/rejected": -7.988577365875244, + "loss": 1.7083, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.10716591030359268, + "rewards/margins": -0.7347936630249023, + "rewards/rejected": 0.8419595956802368, + "step": 7753 + }, + { + "epoch": 1.26, + "learning_rate": 3.1914850567083866e-06, + "logits/chosen": -1.599631428718567, + "logits/rejected": -1.6074422597885132, + "logps/chosen": -41.785194396972656, + "logps/rejected": -81.66304779052734, + "loss": 0.1969, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.923264265060425, + "rewards/margins": 0.7337050437927246, + "rewards/rejected": 2.1895592212677, + "step": 7754 + }, + { + "epoch": 1.26, + "learning_rate": 3.1902598489352877e-06, + "logits/chosen": -1.095402717590332, + "logits/rejected": -1.036537528038025, + "logps/chosen": -51.18312454223633, + "logps/rejected": -29.2708740234375, + "loss": 0.2287, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7737209796905518, + "rewards/margins": 0.732771635055542, + "rewards/rejected": 2.0409493446350098, + "step": 7755 + }, + { + "epoch": 1.26, + "learning_rate": 3.1890347661981087e-06, + "logits/chosen": -1.2524160146713257, + "logits/rejected": -0.9568770527839661, + "logps/chosen": -62.765865325927734, + "logps/rejected": -32.247215270996094, + "loss": 0.9828, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.215193748474121, + "rewards/margins": 2.9887189865112305, + "rewards/rejected": 3.2264747619628906, + "step": 7756 + }, + { + "epoch": 1.26, + "learning_rate": 3.1878098085814926e-06, + "logits/chosen": -1.1822141408920288, + "logits/rejected": -1.1667158603668213, + "logps/chosen": -51.27202224731445, + "logps/rejected": -64.91414642333984, + "loss": 0.4701, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.875040054321289, + "rewards/margins": -0.4091947078704834, + "rewards/rejected": 3.2842347621917725, + "step": 7757 + }, + { + "epoch": 1.26, + "learning_rate": 3.1865849761700705e-06, + "logits/chosen": -1.3693479299545288, + "logits/rejected": -1.2967753410339355, + "logps/chosen": -30.950231552124023, + "logps/rejected": -36.32600784301758, + "loss": 1.055, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9352738857269287, + "rewards/margins": -1.979625940322876, + "rewards/rejected": 4.914899826049805, + "step": 7758 + }, + { + "epoch": 1.26, + "learning_rate": 3.185360269048469e-06, + "logits/chosen": -1.3710930347442627, + "logits/rejected": -1.4389973878860474, + "logps/chosen": -65.20771026611328, + "logps/rejected": -111.98936462402344, + "loss": 1.5198, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.091713786125183, + "rewards/margins": -2.2599782943725586, + "rewards/rejected": 3.3516921997070312, + "step": 7759 + }, + { + "epoch": 1.26, + "learning_rate": 3.1841356873013024e-06, + "logits/chosen": -1.3332538604736328, + "logits/rejected": -1.3332538604736328, + "logps/chosen": -78.00877380371094, + "logps/rejected": -78.00877380371094, + "loss": 0.8659, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.782798767089844, + "rewards/margins": 0.0, + "rewards/rejected": 5.782798767089844, + "step": 7760 + }, + { + "epoch": 1.26, + "learning_rate": 3.1829112310131784e-06, + "logits/chosen": -1.2014832496643066, + "logits/rejected": -1.2226682901382446, + "logps/chosen": -112.37454986572266, + "logps/rejected": -136.078857421875, + "loss": 0.5838, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.925016164779663, + "rewards/margins": -0.7849280834197998, + "rewards/rejected": 3.709944248199463, + "step": 7761 + }, + { + "epoch": 1.26, + "learning_rate": 3.181686900268694e-06, + "logits/chosen": -0.9384608864784241, + "logits/rejected": -0.9442363977432251, + "logps/chosen": -41.96270751953125, + "logps/rejected": -68.9463882446289, + "loss": 2.2882, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5669670104980469, + "rewards/margins": -0.3613418936729431, + "rewards/rejected": 0.92830890417099, + "step": 7762 + }, + { + "epoch": 1.26, + "learning_rate": 3.18046269515244e-06, + "logits/chosen": -1.6006702184677124, + "logits/rejected": -1.510145902633667, + "logps/chosen": -32.969703674316406, + "logps/rejected": -14.652948379516602, + "loss": 0.0728, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.120616912841797, + "rewards/margins": 2.16998028755188, + "rewards/rejected": 0.9506366848945618, + "step": 7763 + }, + { + "epoch": 1.26, + "learning_rate": 3.1792386157489973e-06, + "logits/chosen": -1.1154823303222656, + "logits/rejected": -1.1556639671325684, + "logps/chosen": -34.200706481933594, + "logps/rejected": -95.90692138671875, + "loss": 0.1272, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.754407525062561, + "rewards/margins": 1.712897539138794, + "rewards/rejected": 0.04151001200079918, + "step": 7764 + }, + { + "epoch": 1.26, + "learning_rate": 3.1780146621429376e-06, + "logits/chosen": -1.1400648355484009, + "logits/rejected": -0.9574183225631714, + "logps/chosen": -67.60868072509766, + "logps/rejected": -26.31779670715332, + "loss": 0.1676, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1059060096740723, + "rewards/margins": 1.8039991855621338, + "rewards/rejected": 0.3019067943096161, + "step": 7765 + }, + { + "epoch": 1.26, + "learning_rate": 3.176790834418826e-06, + "logits/chosen": -1.2923448085784912, + "logits/rejected": -1.2088990211486816, + "logps/chosen": -44.90126037597656, + "logps/rejected": -50.058372497558594, + "loss": 0.5269, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7457398176193237, + "rewards/margins": -0.13186562061309814, + "rewards/rejected": 1.8776054382324219, + "step": 7766 + }, + { + "epoch": 1.26, + "learning_rate": 3.175567132661214e-06, + "logits/chosen": -1.338239312171936, + "logits/rejected": -1.338239312171936, + "logps/chosen": -34.744293212890625, + "logps/rejected": -34.744293212890625, + "loss": 0.3925, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.425941824913025, + "rewards/margins": 0.0, + "rewards/rejected": 1.425941824913025, + "step": 7767 + }, + { + "epoch": 1.26, + "learning_rate": 3.174343556954652e-06, + "logits/chosen": -1.2258918285369873, + "logits/rejected": -1.46913480758667, + "logps/chosen": -131.94346618652344, + "logps/rejected": -35.751258850097656, + "loss": 0.0888, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.472224473953247, + "rewards/margins": 2.142906904220581, + "rewards/rejected": 0.32931748032569885, + "step": 7768 + }, + { + "epoch": 1.26, + "learning_rate": 3.173120107383676e-06, + "logits/chosen": -1.1855937242507935, + "logits/rejected": -1.1940466165542603, + "logps/chosen": -59.134681701660156, + "logps/rejected": -72.91667938232422, + "loss": 0.9476, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.582473039627075, + "rewards/margins": 0.6414841413497925, + "rewards/rejected": 1.9409888982772827, + "step": 7769 + }, + { + "epoch": 1.26, + "learning_rate": 3.171896784032814e-06, + "logits/chosen": -1.2950224876403809, + "logits/rejected": -1.2843576669692993, + "logps/chosen": -75.82410430908203, + "logps/rejected": -58.04404830932617, + "loss": 0.154, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7169175148010254, + "rewards/margins": 1.4358761310577393, + "rewards/rejected": 1.2810413837432861, + "step": 7770 + }, + { + "epoch": 1.26, + "learning_rate": 3.170673586986587e-06, + "logits/chosen": -0.9856554269790649, + "logits/rejected": -0.9909073710441589, + "logps/chosen": -7.892391681671143, + "logps/rejected": -19.370018005371094, + "loss": 0.7453, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.04744572564959526, + "rewards/margins": -0.2598785161972046, + "rewards/rejected": 0.30732423067092896, + "step": 7771 + }, + { + "epoch": 1.26, + "learning_rate": 3.169450516329505e-06, + "logits/chosen": -0.9624087810516357, + "logits/rejected": -0.943791925907135, + "logps/chosen": -47.30230712890625, + "logps/rejected": -44.20479965209961, + "loss": 1.3523, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6888145804405212, + "rewards/margins": -1.9947834014892578, + "rewards/rejected": 2.683598041534424, + "step": 7772 + }, + { + "epoch": 1.26, + "learning_rate": 3.168227572146072e-06, + "logits/chosen": -1.5969483852386475, + "logits/rejected": -1.451132893562317, + "logps/chosen": -107.2066879272461, + "logps/rejected": -14.219970703125, + "loss": 0.027, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.53420352935791, + "rewards/margins": 4.520077228546143, + "rewards/rejected": 1.0141264200210571, + "step": 7773 + }, + { + "epoch": 1.26, + "learning_rate": 3.1670047545207817e-06, + "logits/chosen": -1.2853842973709106, + "logits/rejected": -1.204354166984558, + "logps/chosen": -145.8351287841797, + "logps/rejected": -124.74832153320312, + "loss": 0.5533, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.675637722015381, + "rewards/margins": 0.9778823852539062, + "rewards/rejected": 6.697755336761475, + "step": 7774 + }, + { + "epoch": 1.26, + "learning_rate": 3.165782063538119e-06, + "logits/chosen": -1.4193611145019531, + "logits/rejected": -1.204594612121582, + "logps/chosen": -48.15144729614258, + "logps/rejected": -83.55648803710938, + "loss": 1.8123, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.226365327835083, + "rewards/margins": -2.675651788711548, + "rewards/rejected": 5.902017116546631, + "step": 7775 + }, + { + "epoch": 1.26, + "learning_rate": 3.164559499282559e-06, + "logits/chosen": -1.165503740310669, + "logits/rejected": -1.17030668258667, + "logps/chosen": -1.8637266159057617, + "logps/rejected": -3.6495003700256348, + "loss": 0.6909, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35605287551879883, + "rewards/margins": 0.009071290493011475, + "rewards/rejected": 0.34698158502578735, + "step": 7776 + }, + { + "epoch": 1.26, + "learning_rate": 3.163337061838572e-06, + "logits/chosen": -1.138505458831787, + "logits/rejected": -0.9789041876792908, + "logps/chosen": -224.43109130859375, + "logps/rejected": -166.9702911376953, + "loss": 0.3707, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.542352199554443, + "rewards/margins": -0.08777475357055664, + "rewards/rejected": 4.630126953125, + "step": 7777 + }, + { + "epoch": 1.26, + "learning_rate": 3.1621147512906136e-06, + "logits/chosen": -1.384193778038025, + "logits/rejected": -1.4828121662139893, + "logps/chosen": -70.63328552246094, + "logps/rejected": -103.65292358398438, + "loss": 0.978, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.155003547668457, + "rewards/margins": -0.08603048324584961, + "rewards/rejected": 4.241034030914307, + "step": 7778 + }, + { + "epoch": 1.26, + "learning_rate": 3.1608925677231374e-06, + "logits/chosen": -1.2150527238845825, + "logits/rejected": -1.1889230012893677, + "logps/chosen": -42.53243637084961, + "logps/rejected": -127.80978393554688, + "loss": 0.3331, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3552119731903076, + "rewards/margins": 0.0867915153503418, + "rewards/rejected": 2.268420457839966, + "step": 7779 + }, + { + "epoch": 1.26, + "learning_rate": 3.1596705112205816e-06, + "logits/chosen": -1.1993305683135986, + "logits/rejected": -1.057866096496582, + "logps/chosen": -81.96918487548828, + "logps/rejected": -16.84027862548828, + "loss": 0.0719, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5232903957366943, + "rewards/margins": 2.2671005725860596, + "rewards/rejected": 0.2561897337436676, + "step": 7780 + }, + { + "epoch": 1.26, + "learning_rate": 3.158448581867381e-06, + "logits/chosen": -1.32004976272583, + "logits/rejected": -1.4112447500228882, + "logps/chosen": -21.846376419067383, + "logps/rejected": -54.46898651123047, + "loss": 0.632, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.036097288131714, + "rewards/margins": -0.7299230098724365, + "rewards/rejected": 2.7660202980041504, + "step": 7781 + }, + { + "epoch": 1.26, + "learning_rate": 3.1572267797479583e-06, + "logits/chosen": -1.506561279296875, + "logits/rejected": -1.506561279296875, + "logps/chosen": -68.54483032226562, + "logps/rejected": -68.54483032226562, + "loss": 0.3542, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0072014331817627, + "rewards/margins": 0.0, + "rewards/rejected": 2.0072014331817627, + "step": 7782 + }, + { + "epoch": 1.26, + "learning_rate": 3.156005104946729e-06, + "logits/chosen": -0.7537752389907837, + "logits/rejected": -0.7531920075416565, + "logps/chosen": -2.624361038208008, + "logps/rejected": -4.258544921875, + "loss": 0.8756, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39957162737846375, + "rewards/margins": 0.02246609330177307, + "rewards/rejected": 0.3771055340766907, + "step": 7783 + }, + { + "epoch": 1.26, + "learning_rate": 3.154783557548098e-06, + "logits/chosen": -1.6904011964797974, + "logits/rejected": -1.6170625686645508, + "logps/chosen": -111.82756805419922, + "logps/rejected": -22.229511260986328, + "loss": 0.4141, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1586647033691406, + "rewards/margins": 0.20641016960144043, + "rewards/rejected": 1.9522545337677002, + "step": 7784 + }, + { + "epoch": 1.26, + "learning_rate": 3.1535621376364643e-06, + "logits/chosen": -1.5182403326034546, + "logits/rejected": -1.3959661722183228, + "logps/chosen": -113.7825698852539, + "logps/rejected": -92.99136352539062, + "loss": 0.0259, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.159029960632324, + "rewards/margins": 3.0501065254211426, + "rewards/rejected": 5.108923435211182, + "step": 7785 + }, + { + "epoch": 1.26, + "learning_rate": 3.1523408452962156e-06, + "logits/chosen": -1.0235116481781006, + "logits/rejected": -1.0275803804397583, + "logps/chosen": -64.6036148071289, + "logps/rejected": -70.08969116210938, + "loss": 0.0979, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.880147695541382, + "rewards/margins": 1.5345253944396973, + "rewards/rejected": 1.3456223011016846, + "step": 7786 + }, + { + "epoch": 1.26, + "learning_rate": 3.1511196806117327e-06, + "logits/chosen": -0.7907735705375671, + "logits/rejected": -0.7867263555526733, + "logps/chosen": -3.4922704696655273, + "logps/rejected": -3.892179012298584, + "loss": 0.7016, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.47775402665138245, + "rewards/margins": -0.03362658619880676, + "rewards/rejected": 0.5113806128501892, + "step": 7787 + }, + { + "epoch": 1.26, + "learning_rate": 3.149898643667384e-06, + "logits/chosen": -1.2789667844772339, + "logits/rejected": -1.2365162372589111, + "logps/chosen": -74.50193786621094, + "logps/rejected": -47.36937713623047, + "loss": 0.3239, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0050132274627686, + "rewards/margins": 0.17171168327331543, + "rewards/rejected": 1.8333015441894531, + "step": 7788 + }, + { + "epoch": 1.26, + "learning_rate": 3.148677734547534e-06, + "logits/chosen": -1.107006549835205, + "logits/rejected": -1.107006549835205, + "logps/chosen": -0.9977567791938782, + "logps/rejected": -0.9977567791938782, + "loss": 0.9025, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.23045141994953156, + "rewards/margins": 0.0, + "rewards/rejected": 0.23045141994953156, + "step": 7789 + }, + { + "epoch": 1.26, + "learning_rate": 3.147456953336535e-06, + "logits/chosen": -0.8203688859939575, + "logits/rejected": -0.8203688859939575, + "logps/chosen": -0.3980656862258911, + "logps/rejected": -0.3980656862258911, + "loss": 0.381, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.11267995089292526, + "rewards/margins": 0.0, + "rewards/rejected": 0.11267995089292526, + "step": 7790 + }, + { + "epoch": 1.26, + "learning_rate": 3.146236300118731e-06, + "logits/chosen": -1.4374765157699585, + "logits/rejected": -1.444342851638794, + "logps/chosen": -39.19002151489258, + "logps/rejected": -109.10758972167969, + "loss": 0.6798, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.424734115600586, + "rewards/margins": -0.8943684101104736, + "rewards/rejected": 3.3191025257110596, + "step": 7791 + }, + { + "epoch": 1.26, + "learning_rate": 3.1450157749784592e-06, + "logits/chosen": -1.1334539651870728, + "logits/rejected": -1.1316030025482178, + "logps/chosen": -28.44296646118164, + "logps/rejected": -75.392822265625, + "loss": 0.5355, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3324803113937378, + "rewards/margins": -0.3846569061279297, + "rewards/rejected": 1.7171372175216675, + "step": 7792 + }, + { + "epoch": 1.26, + "learning_rate": 3.143795378000045e-06, + "logits/chosen": -1.5446470975875854, + "logits/rejected": -1.4013218879699707, + "logps/chosen": -139.38177490234375, + "logps/rejected": -84.38580322265625, + "loss": 0.1205, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.229498386383057, + "rewards/margins": 1.3346071243286133, + "rewards/rejected": 4.894891262054443, + "step": 7793 + }, + { + "epoch": 1.27, + "learning_rate": 3.1425751092678064e-06, + "logits/chosen": -1.1918706893920898, + "logits/rejected": -1.241308569908142, + "logps/chosen": -36.97517013549805, + "logps/rejected": -74.35848999023438, + "loss": 0.6161, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1016037464141846, + "rewards/margins": -0.3179168701171875, + "rewards/rejected": 2.419520616531372, + "step": 7794 + }, + { + "epoch": 1.27, + "learning_rate": 3.141354968866053e-06, + "logits/chosen": -1.113684058189392, + "logits/rejected": -1.0986247062683105, + "logps/chosen": -62.830562591552734, + "logps/rejected": -52.399227142333984, + "loss": 0.3852, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5558032989501953, + "rewards/margins": -0.08252882957458496, + "rewards/rejected": 2.6383321285247803, + "step": 7795 + }, + { + "epoch": 1.27, + "learning_rate": 3.140134956879084e-06, + "logits/chosen": -1.1625056266784668, + "logits/rejected": -1.0162606239318848, + "logps/chosen": -32.99547576904297, + "logps/rejected": -17.416379928588867, + "loss": 0.5381, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7891250848770142, + "rewards/margins": 0.8449495434761047, + "rewards/rejected": 0.9441755414009094, + "step": 7796 + }, + { + "epoch": 1.27, + "learning_rate": 3.1389150733911917e-06, + "logits/chosen": -1.2934867143630981, + "logits/rejected": -1.0458366870880127, + "logps/chosen": -84.03240966796875, + "logps/rejected": -49.123809814453125, + "loss": 0.0402, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.616738796234131, + "rewards/margins": 3.1900832653045654, + "rewards/rejected": 3.4266555309295654, + "step": 7797 + }, + { + "epoch": 1.27, + "learning_rate": 3.1376953184866575e-06, + "logits/chosen": -0.7680491209030151, + "logits/rejected": -0.7680491209030151, + "logps/chosen": -3.8167572021484375, + "logps/rejected": -3.8167572021484375, + "loss": 0.8094, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.41013431549072266, + "rewards/margins": 0.0, + "rewards/rejected": 0.41013431549072266, + "step": 7798 + }, + { + "epoch": 1.27, + "learning_rate": 3.1364756922497564e-06, + "logits/chosen": -1.3896050453186035, + "logits/rejected": -1.5067245960235596, + "logps/chosen": -128.5, + "logps/rejected": -165.51406860351562, + "loss": 1.3113, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.92042875289917, + "rewards/margins": -2.517638683319092, + "rewards/rejected": 8.438067436218262, + "step": 7799 + }, + { + "epoch": 1.27, + "learning_rate": 3.1352561947647513e-06, + "logits/chosen": -1.5026531219482422, + "logits/rejected": -1.1894868612289429, + "logps/chosen": -108.78890991210938, + "logps/rejected": -44.67258071899414, + "loss": 0.1245, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.079383850097656, + "rewards/margins": 3.6992766857147217, + "rewards/rejected": 1.3801071643829346, + "step": 7800 + }, + { + "epoch": 1.27, + "learning_rate": 3.1340368261158995e-06, + "logits/chosen": -1.4814437627792358, + "logits/rejected": -1.3846991062164307, + "logps/chosen": -53.953548431396484, + "logps/rejected": -35.349517822265625, + "loss": 0.086, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5843143463134766, + "rewards/margins": 1.9356693029403687, + "rewards/rejected": 0.6486450433731079, + "step": 7801 + }, + { + "epoch": 1.27, + "learning_rate": 3.132817586387447e-06, + "logits/chosen": -0.9554505348205566, + "logits/rejected": -0.9614786505699158, + "logps/chosen": -5.022824287414551, + "logps/rejected": -2.642504930496216, + "loss": 0.4558, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.02395915985107422, + "rewards/margins": -0.1792609989643097, + "rewards/rejected": 0.2032201588153839, + "step": 7802 + }, + { + "epoch": 1.27, + "learning_rate": 3.1315984756636307e-06, + "logits/chosen": -1.2607591152191162, + "logits/rejected": -1.1240285634994507, + "logps/chosen": -57.35027313232422, + "logps/rejected": -59.98847961425781, + "loss": 1.2251, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.762626647949219, + "rewards/margins": 3.021688938140869, + "rewards/rejected": 1.7409378290176392, + "step": 7803 + }, + { + "epoch": 1.27, + "learning_rate": 3.1303794940286823e-06, + "logits/chosen": -1.6477127075195312, + "logits/rejected": -1.5127160549163818, + "logps/chosen": -153.7207794189453, + "logps/rejected": -134.41134643554688, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.069457054138184, + "rewards/margins": 6.593515396118164, + "rewards/rejected": 1.47594153881073, + "step": 7804 + }, + { + "epoch": 1.27, + "learning_rate": 3.1291606415668195e-06, + "logits/chosen": -1.2364078760147095, + "logits/rejected": -1.2245876789093018, + "logps/chosen": -30.988567352294922, + "logps/rejected": -5.0727338790893555, + "loss": 1.0967, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1899189054965973, + "rewards/margins": -0.07446306943893433, + "rewards/rejected": 0.2643819749355316, + "step": 7805 + }, + { + "epoch": 1.27, + "learning_rate": 3.1279419183622544e-06, + "logits/chosen": -1.3497682809829712, + "logits/rejected": -1.3288800716400146, + "logps/chosen": -102.927734375, + "logps/rejected": -49.44586181640625, + "loss": 0.3344, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.001272678375244, + "rewards/margins": 0.0714569091796875, + "rewards/rejected": 3.9298157691955566, + "step": 7806 + }, + { + "epoch": 1.27, + "learning_rate": 3.126723324499189e-06, + "logits/chosen": -1.283535122871399, + "logits/rejected": -1.1947388648986816, + "logps/chosen": -62.8834228515625, + "logps/rejected": -77.89221954345703, + "loss": 0.9794, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2989120483398438, + "rewards/margins": -0.4380059242248535, + "rewards/rejected": 2.7369179725646973, + "step": 7807 + }, + { + "epoch": 1.27, + "learning_rate": 3.1255048600618176e-06, + "logits/chosen": -1.2689982652664185, + "logits/rejected": -1.3314028978347778, + "logps/chosen": -69.25140380859375, + "logps/rejected": -66.38571166992188, + "loss": 0.6687, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.722914218902588, + "rewards/margins": -0.17415070533752441, + "rewards/rejected": 2.8970649242401123, + "step": 7808 + }, + { + "epoch": 1.27, + "learning_rate": 3.124286525134323e-06, + "logits/chosen": -1.1724250316619873, + "logits/rejected": -1.1707344055175781, + "logps/chosen": -52.67781448364258, + "logps/rejected": -98.29265594482422, + "loss": 0.5837, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3603252172470093, + "rewards/margins": -0.44283103942871094, + "rewards/rejected": 1.8031562566757202, + "step": 7809 + }, + { + "epoch": 1.27, + "learning_rate": 3.1230683198008817e-06, + "logits/chosen": -1.0247268676757812, + "logits/rejected": -1.0301897525787354, + "logps/chosen": -114.82394409179688, + "logps/rejected": -85.36724853515625, + "loss": 0.7231, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9805543422698975, + "rewards/margins": 2.1913514137268066, + "rewards/rejected": 0.789202868938446, + "step": 7810 + }, + { + "epoch": 1.27, + "learning_rate": 3.1218502441456592e-06, + "logits/chosen": -1.5687682628631592, + "logits/rejected": -1.5602954626083374, + "logps/chosen": -62.133209228515625, + "logps/rejected": -56.00585174560547, + "loss": 0.4885, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8066940307617188, + "rewards/margins": 3.569007396697998, + "rewards/rejected": -0.7623134851455688, + "step": 7811 + }, + { + "epoch": 1.27, + "learning_rate": 3.1206322982528142e-06, + "logits/chosen": -1.2388337850570679, + "logits/rejected": -1.219813346862793, + "logps/chosen": -100.84124755859375, + "logps/rejected": -55.60929870605469, + "loss": 0.3676, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5807342529296875, + "rewards/margins": -0.0708233118057251, + "rewards/rejected": 0.6515575647354126, + "step": 7812 + }, + { + "epoch": 1.27, + "learning_rate": 3.1194144822064943e-06, + "logits/chosen": -1.153950810432434, + "logits/rejected": -1.1421256065368652, + "logps/chosen": -101.36869812011719, + "logps/rejected": -98.2359619140625, + "loss": 1.8922, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6555466651916504, + "rewards/margins": 0.6336715221405029, + "rewards/rejected": 3.0218751430511475, + "step": 7813 + }, + { + "epoch": 1.27, + "learning_rate": 3.11819679609084e-06, + "logits/chosen": -1.1719163656234741, + "logits/rejected": -1.14616060256958, + "logps/chosen": -116.69204711914062, + "logps/rejected": -75.57969665527344, + "loss": 0.1309, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.1427459716796875, + "rewards/margins": 1.7416679859161377, + "rewards/rejected": 2.40107798576355, + "step": 7814 + }, + { + "epoch": 1.27, + "learning_rate": 3.1169792399899813e-06, + "logits/chosen": -1.5018566846847534, + "logits/rejected": -1.7261570692062378, + "logps/chosen": -162.87255859375, + "logps/rejected": -51.037899017333984, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.117828369140625, + "rewards/margins": 5.185773849487305, + "rewards/rejected": 1.9320545196533203, + "step": 7815 + }, + { + "epoch": 1.27, + "learning_rate": 3.1157618139880387e-06, + "logits/chosen": -1.2657513618469238, + "logits/rejected": -1.2839287519454956, + "logps/chosen": -38.25236129760742, + "logps/rejected": -84.46018981933594, + "loss": 1.5136, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.939327597618103, + "rewards/margins": -2.9466710090637207, + "rewards/rejected": 4.885998725891113, + "step": 7816 + }, + { + "epoch": 1.27, + "learning_rate": 3.114544518169127e-06, + "logits/chosen": -1.4320648908615112, + "logits/rejected": -1.4630714654922485, + "logps/chosen": -64.00509643554688, + "logps/rejected": -109.6816177368164, + "loss": 0.0861, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.316389560699463, + "rewards/margins": 1.780513048171997, + "rewards/rejected": 1.5358765125274658, + "step": 7817 + }, + { + "epoch": 1.27, + "learning_rate": 3.113327352617348e-06, + "logits/chosen": -1.2927666902542114, + "logits/rejected": -1.3120309114456177, + "logps/chosen": -74.25016784667969, + "logps/rejected": -68.94908905029297, + "loss": 0.7282, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3565292358398438, + "rewards/margins": -0.032936930656433105, + "rewards/rejected": 1.3894661664962769, + "step": 7818 + }, + { + "epoch": 1.27, + "learning_rate": 3.1121103174167977e-06, + "logits/chosen": -1.4283792972564697, + "logits/rejected": -1.2801638841629028, + "logps/chosen": -51.350791931152344, + "logps/rejected": -23.22164535522461, + "loss": 0.1815, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4391106367111206, + "rewards/margins": 0.9647713303565979, + "rewards/rejected": 0.4743393063545227, + "step": 7819 + }, + { + "epoch": 1.27, + "learning_rate": 3.110893412651561e-06, + "logits/chosen": -1.1987395286560059, + "logits/rejected": -1.214379072189331, + "logps/chosen": -70.9796142578125, + "logps/rejected": -98.66717529296875, + "loss": 0.0346, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.240016222000122, + "rewards/margins": 2.6700103282928467, + "rewards/rejected": 0.5700058341026306, + "step": 7820 + }, + { + "epoch": 1.27, + "learning_rate": 3.109676638405714e-06, + "logits/chosen": -1.4145960807800293, + "logits/rejected": -1.3786430358886719, + "logps/chosen": -74.01921844482422, + "logps/rejected": -66.68596649169922, + "loss": 0.9569, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6723732948303223, + "rewards/margins": 0.4103410243988037, + "rewards/rejected": 3.2620322704315186, + "step": 7821 + }, + { + "epoch": 1.27, + "learning_rate": 3.1084599947633252e-06, + "logits/chosen": -1.2304353713989258, + "logits/rejected": -1.1788225173950195, + "logps/chosen": -95.82461547851562, + "logps/rejected": -26.064205169677734, + "loss": 0.3687, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.083293914794922, + "rewards/margins": 0.4154040813446045, + "rewards/rejected": 1.6678898334503174, + "step": 7822 + }, + { + "epoch": 1.27, + "learning_rate": 3.1072434818084524e-06, + "logits/chosen": -1.0661526918411255, + "logits/rejected": -1.1238834857940674, + "logps/chosen": -27.222084045410156, + "logps/rejected": -47.282833099365234, + "loss": 2.2784, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.416481375694275, + "rewards/margins": -3.1811256408691406, + "rewards/rejected": 4.597607135772705, + "step": 7823 + }, + { + "epoch": 1.27, + "learning_rate": 3.1060270996251465e-06, + "logits/chosen": -1.4659738540649414, + "logits/rejected": -1.329616665840149, + "logps/chosen": -121.83024597167969, + "logps/rejected": -27.87488555908203, + "loss": 0.7844, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.249678134918213, + "rewards/margins": 1.6893081665039062, + "rewards/rejected": 0.5603699088096619, + "step": 7824 + }, + { + "epoch": 1.27, + "learning_rate": 3.104810848297446e-06, + "logits/chosen": -1.4732643365859985, + "logits/rejected": -1.488560438156128, + "logps/chosen": -43.29865646362305, + "logps/rejected": -107.301025390625, + "loss": 0.4799, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.623598098754883, + "rewards/margins": -0.44594383239746094, + "rewards/rejected": 4.069541931152344, + "step": 7825 + }, + { + "epoch": 1.27, + "learning_rate": 3.1035947279093846e-06, + "logits/chosen": -1.3374265432357788, + "logits/rejected": -1.2742358446121216, + "logps/chosen": -113.39366149902344, + "logps/rejected": -102.21134948730469, + "loss": 0.0385, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.112380027770996, + "rewards/margins": 5.088367462158203, + "rewards/rejected": 3.024012804031372, + "step": 7826 + }, + { + "epoch": 1.27, + "learning_rate": 3.1023787385449834e-06, + "logits/chosen": -1.230054497718811, + "logits/rejected": -1.0804686546325684, + "logps/chosen": -38.696502685546875, + "logps/rejected": -9.970520973205566, + "loss": 0.1521, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7065902948379517, + "rewards/margins": 1.1309902667999268, + "rewards/rejected": 0.5756000876426697, + "step": 7827 + }, + { + "epoch": 1.27, + "learning_rate": 3.1011628802882556e-06, + "logits/chosen": -1.2577359676361084, + "logits/rejected": -1.2292577028274536, + "logps/chosen": -48.762962341308594, + "logps/rejected": -60.65538787841797, + "loss": 0.6159, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4674699306488037, + "rewards/margins": 0.18051695823669434, + "rewards/rejected": 2.2869529724121094, + "step": 7828 + }, + { + "epoch": 1.27, + "learning_rate": 3.099947153223205e-06, + "logits/chosen": -1.160073161125183, + "logits/rejected": -1.1743630170822144, + "logps/chosen": -55.55127716064453, + "logps/rejected": -44.463226318359375, + "loss": 0.4227, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.945990800857544, + "rewards/margins": 0.515716552734375, + "rewards/rejected": 2.430274248123169, + "step": 7829 + }, + { + "epoch": 1.27, + "learning_rate": 3.0987315574338288e-06, + "logits/chosen": -1.2571793794631958, + "logits/rejected": -1.1138054132461548, + "logps/chosen": -99.8120346069336, + "logps/rejected": -157.0481414794922, + "loss": 0.6099, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.157750606536865, + "rewards/margins": 0.6417303085327148, + "rewards/rejected": 6.51602029800415, + "step": 7830 + }, + { + "epoch": 1.27, + "learning_rate": 3.0975160930041113e-06, + "logits/chosen": -1.704317331314087, + "logits/rejected": -1.7841339111328125, + "logps/chosen": -210.53057861328125, + "logps/rejected": -24.547557830810547, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9855010509490967, + "rewards/margins": 3.6224987506866455, + "rewards/rejected": 0.36300238966941833, + "step": 7831 + }, + { + "epoch": 1.27, + "learning_rate": 3.0963007600180313e-06, + "logits/chosen": -1.42799711227417, + "logits/rejected": -1.3476390838623047, + "logps/chosen": -116.98291015625, + "logps/rejected": -39.45024490356445, + "loss": 0.1849, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3084657192230225, + "rewards/margins": 1.587829351425171, + "rewards/rejected": 1.7206363677978516, + "step": 7832 + }, + { + "epoch": 1.27, + "learning_rate": 3.095085558559555e-06, + "logits/chosen": -1.2800533771514893, + "logits/rejected": -1.2190507650375366, + "logps/chosen": -64.2624740600586, + "logps/rejected": -63.148311614990234, + "loss": 0.512, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6332175731658936, + "rewards/margins": -0.5738339424133301, + "rewards/rejected": 3.2070515155792236, + "step": 7833 + }, + { + "epoch": 1.27, + "learning_rate": 3.0938704887126425e-06, + "logits/chosen": -1.5099306106567383, + "logits/rejected": -1.4750666618347168, + "logps/chosen": -165.77243041992188, + "logps/rejected": -139.5188446044922, + "loss": 0.0238, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.343266487121582, + "rewards/margins": 3.194903612136841, + "rewards/rejected": 2.148362874984741, + "step": 7834 + }, + { + "epoch": 1.27, + "learning_rate": 3.092655550561243e-06, + "logits/chosen": -1.1696619987487793, + "logits/rejected": -1.2971606254577637, + "logps/chosen": -85.52293395996094, + "logps/rejected": -138.5379638671875, + "loss": 0.5865, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.841780185699463, + "rewards/margins": 0.04426097869873047, + "rewards/rejected": 6.797519207000732, + "step": 7835 + }, + { + "epoch": 1.27, + "learning_rate": 3.091440744189298e-06, + "logits/chosen": -1.4163999557495117, + "logits/rejected": -1.619636058807373, + "logps/chosen": -207.03045654296875, + "logps/rejected": -163.68463134765625, + "loss": 0.0944, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.301065444946289, + "rewards/margins": 2.172961711883545, + "rewards/rejected": 6.128103733062744, + "step": 7836 + }, + { + "epoch": 1.27, + "learning_rate": 3.090226069680738e-06, + "logits/chosen": -1.3149762153625488, + "logits/rejected": -1.31795334815979, + "logps/chosen": -53.74958038330078, + "logps/rejected": -64.97793579101562, + "loss": 0.425, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6800323724746704, + "rewards/margins": 0.8710060119628906, + "rewards/rejected": 0.8090263605117798, + "step": 7837 + }, + { + "epoch": 1.27, + "learning_rate": 3.0890115271194867e-06, + "logits/chosen": -1.3319133520126343, + "logits/rejected": -1.2355839014053345, + "logps/chosen": -166.78790283203125, + "logps/rejected": -69.43231201171875, + "loss": 0.3164, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.467074871063232, + "rewards/margins": 0.6468887329101562, + "rewards/rejected": 4.820186138153076, + "step": 7838 + }, + { + "epoch": 1.27, + "learning_rate": 3.087797116589456e-06, + "logits/chosen": -1.294298768043518, + "logits/rejected": -1.2097423076629639, + "logps/chosen": -44.54621887207031, + "logps/rejected": -14.445734977722168, + "loss": 0.2142, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1285057067871094, + "rewards/margins": 1.1473667621612549, + "rewards/rejected": 0.9811388850212097, + "step": 7839 + }, + { + "epoch": 1.27, + "learning_rate": 3.0865828381745515e-06, + "logits/chosen": -1.1379828453063965, + "logits/rejected": -1.2501084804534912, + "logps/chosen": -60.62841796875, + "logps/rejected": -95.45574188232422, + "loss": 3.1784, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.036311388015747, + "rewards/margins": -4.506163597106934, + "rewards/rejected": 6.54247522354126, + "step": 7840 + }, + { + "epoch": 1.27, + "learning_rate": 3.0853686919586683e-06, + "logits/chosen": -1.3982384204864502, + "logits/rejected": -1.3582102060317993, + "logps/chosen": -76.53239440917969, + "logps/rejected": -80.87933349609375, + "loss": 0.2757, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0871894359588623, + "rewards/margins": 0.32661962509155273, + "rewards/rejected": 2.7605698108673096, + "step": 7841 + }, + { + "epoch": 1.27, + "learning_rate": 3.084154678025692e-06, + "logits/chosen": -1.2219804525375366, + "logits/rejected": -1.1862738132476807, + "logps/chosen": -38.9532470703125, + "logps/rejected": -54.01210403442383, + "loss": 0.4035, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4570679664611816, + "rewards/margins": 0.20792889595031738, + "rewards/rejected": 2.2491390705108643, + "step": 7842 + }, + { + "epoch": 1.27, + "learning_rate": 3.082940796459499e-06, + "logits/chosen": -1.3594199419021606, + "logits/rejected": -1.3584518432617188, + "logps/chosen": -107.51545715332031, + "logps/rejected": -57.253936767578125, + "loss": 0.0599, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.376277446746826, + "rewards/margins": 3.101729154586792, + "rewards/rejected": 2.274548292160034, + "step": 7843 + }, + { + "epoch": 1.27, + "learning_rate": 3.0817270473439585e-06, + "logits/chosen": -1.4242347478866577, + "logits/rejected": -1.4792683124542236, + "logps/chosen": -153.280029296875, + "logps/rejected": -101.51896667480469, + "loss": 0.4754, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5586044192314148, + "rewards/margins": -0.4122299551963806, + "rewards/rejected": 0.9708343744277954, + "step": 7844 + }, + { + "epoch": 1.27, + "learning_rate": 3.0805134307629283e-06, + "logits/chosen": -1.499417781829834, + "logits/rejected": -1.4189465045928955, + "logps/chosen": -67.45184326171875, + "logps/rejected": -29.9493465423584, + "loss": 1.0827, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.903132677078247, + "rewards/margins": -0.6349108219146729, + "rewards/rejected": 2.53804349899292, + "step": 7845 + }, + { + "epoch": 1.27, + "learning_rate": 3.0792999468002567e-06, + "logits/chosen": -1.2616020441055298, + "logits/rejected": -1.1683958768844604, + "logps/chosen": -88.55765533447266, + "logps/rejected": -42.476749420166016, + "loss": 0.2429, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8569741249084473, + "rewards/margins": 2.211768865585327, + "rewards/rejected": 0.6452053189277649, + "step": 7846 + }, + { + "epoch": 1.27, + "learning_rate": 3.0780865955397853e-06, + "logits/chosen": -1.4904080629348755, + "logits/rejected": -1.3965864181518555, + "logps/chosen": -76.25919342041016, + "logps/rejected": -55.920013427734375, + "loss": 0.1508, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.256418704986572, + "rewards/margins": 1.0585222244262695, + "rewards/rejected": 5.197896480560303, + "step": 7847 + }, + { + "epoch": 1.27, + "learning_rate": 3.076873377065345e-06, + "logits/chosen": -1.1710989475250244, + "logits/rejected": -1.1215152740478516, + "logps/chosen": -54.597511291503906, + "logps/rejected": -38.612060546875, + "loss": 1.9614, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.114479899406433, + "rewards/margins": 0.392686128616333, + "rewards/rejected": 0.7217937707901001, + "step": 7848 + }, + { + "epoch": 1.27, + "learning_rate": 3.075660291460757e-06, + "logits/chosen": -1.0935715436935425, + "logits/rejected": -1.0935715436935425, + "logps/chosen": -0.3087105453014374, + "logps/rejected": -0.3087105453014374, + "loss": 0.8021, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.08314204216003418, + "rewards/margins": 0.0, + "rewards/rejected": 0.08314204216003418, + "step": 7849 + }, + { + "epoch": 1.27, + "learning_rate": 3.0744473388098352e-06, + "logits/chosen": -1.5506151914596558, + "logits/rejected": -1.543038249015808, + "logps/chosen": -106.68711853027344, + "logps/rejected": -318.22540283203125, + "loss": 1.2962, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.825852870941162, + "rewards/margins": -2.4785938262939453, + "rewards/rejected": 7.304446697235107, + "step": 7850 + }, + { + "epoch": 1.27, + "learning_rate": 3.0732345191963825e-06, + "logits/chosen": -1.4566093683242798, + "logits/rejected": -1.328324556350708, + "logps/chosen": -71.04329681396484, + "logps/rejected": -54.359535217285156, + "loss": 0.2033, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.191477298736572, + "rewards/margins": 2.999837636947632, + "rewards/rejected": 3.1916396617889404, + "step": 7851 + }, + { + "epoch": 1.27, + "learning_rate": 3.0720218327041927e-06, + "logits/chosen": -1.1356346607208252, + "logits/rejected": -1.1222046613693237, + "logps/chosen": -15.805746078491211, + "logps/rejected": -17.690813064575195, + "loss": 0.6071, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6591541171073914, + "rewards/margins": -0.8113871216773987, + "rewards/rejected": 1.47054123878479, + "step": 7852 + }, + { + "epoch": 1.27, + "learning_rate": 3.0708092794170525e-06, + "logits/chosen": -1.3076685667037964, + "logits/rejected": -1.3753695487976074, + "logps/chosen": -63.61774444580078, + "logps/rejected": -70.06874084472656, + "loss": 1.34, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.797870635986328, + "rewards/margins": -0.7584083080291748, + "rewards/rejected": 3.556278944015503, + "step": 7853 + }, + { + "epoch": 1.27, + "learning_rate": 3.0695968594187366e-06, + "logits/chosen": -1.343008041381836, + "logits/rejected": -1.3755868673324585, + "logps/chosen": -80.5994873046875, + "logps/rejected": -92.17474365234375, + "loss": 0.8034, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.761308431625366, + "rewards/margins": 1.125222086906433, + "rewards/rejected": 1.636086344718933, + "step": 7854 + }, + { + "epoch": 1.27, + "learning_rate": 3.068384572793012e-06, + "logits/chosen": -1.1168866157531738, + "logits/rejected": -1.07570481300354, + "logps/chosen": -63.37074279785156, + "logps/rejected": -41.3848876953125, + "loss": 0.2364, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8419952392578125, + "rewards/margins": 0.6285148859024048, + "rewards/rejected": 1.2134803533554077, + "step": 7855 + }, + { + "epoch": 1.28, + "learning_rate": 3.0671724196236365e-06, + "logits/chosen": -1.3779231309890747, + "logits/rejected": -1.3779231309890747, + "logps/chosen": -67.52642822265625, + "logps/rejected": -67.52642822265625, + "loss": 0.3677, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.976335287094116, + "rewards/margins": 0.0, + "rewards/rejected": 2.976335287094116, + "step": 7856 + }, + { + "epoch": 1.28, + "learning_rate": 3.065960399994358e-06, + "logits/chosen": -1.3076598644256592, + "logits/rejected": -1.2694275379180908, + "logps/chosen": -83.68698120117188, + "logps/rejected": -35.453189849853516, + "loss": 0.4036, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.186988115310669, + "rewards/margins": 1.3581054210662842, + "rewards/rejected": 0.82888263463974, + "step": 7857 + }, + { + "epoch": 1.28, + "learning_rate": 3.0647485139889145e-06, + "logits/chosen": -1.327892780303955, + "logits/rejected": -1.4083620309829712, + "logps/chosen": -72.10603332519531, + "logps/rejected": -100.67501831054688, + "loss": 1.2143, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.895010471343994, + "rewards/margins": -0.8961548805236816, + "rewards/rejected": 4.791165351867676, + "step": 7858 + }, + { + "epoch": 1.28, + "learning_rate": 3.0635367616910385e-06, + "logits/chosen": -1.2429156303405762, + "logits/rejected": -1.0799870491027832, + "logps/chosen": -70.00682830810547, + "logps/rejected": -32.9203987121582, + "loss": 0.4342, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9890618324279785, + "rewards/margins": 2.502932548522949, + "rewards/rejected": 0.48612937331199646, + "step": 7859 + }, + { + "epoch": 1.28, + "learning_rate": 3.062325143184449e-06, + "logits/chosen": -1.3081353902816772, + "logits/rejected": -1.2052398920059204, + "logps/chosen": -68.49478912353516, + "logps/rejected": -26.188209533691406, + "loss": 0.1538, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7211288213729858, + "rewards/margins": 2.0090837478637695, + "rewards/rejected": -0.2879548966884613, + "step": 7860 + }, + { + "epoch": 1.28, + "learning_rate": 3.061113658552858e-06, + "logits/chosen": -1.6732503175735474, + "logits/rejected": -1.3042947053909302, + "logps/chosen": -224.97361755371094, + "logps/rejected": -44.78147888183594, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.540693759918213, + "rewards/margins": 5.386214733123779, + "rewards/rejected": 0.1544792205095291, + "step": 7861 + }, + { + "epoch": 1.28, + "learning_rate": 3.0599023078799674e-06, + "logits/chosen": -1.769235610961914, + "logits/rejected": -1.7597750425338745, + "logps/chosen": -42.703712463378906, + "logps/rejected": -75.00611114501953, + "loss": 0.3562, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9973976612091064, + "rewards/margins": 0.6196715831756592, + "rewards/rejected": 3.3777260780334473, + "step": 7862 + }, + { + "epoch": 1.28, + "learning_rate": 3.0586910912494694e-06, + "logits/chosen": -1.2533223628997803, + "logits/rejected": -1.2703523635864258, + "logps/chosen": -71.12451934814453, + "logps/rejected": -50.48616027832031, + "loss": 0.3554, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.766922950744629, + "rewards/margins": -0.0063934326171875, + "rewards/rejected": 4.773316383361816, + "step": 7863 + }, + { + "epoch": 1.28, + "learning_rate": 3.0574800087450483e-06, + "logits/chosen": -1.229636788368225, + "logits/rejected": -1.2694401741027832, + "logps/chosen": -55.9648551940918, + "logps/rejected": -73.11408996582031, + "loss": 0.5444, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7726826667785645, + "rewards/margins": 0.7227833271026611, + "rewards/rejected": 2.0498993396759033, + "step": 7864 + }, + { + "epoch": 1.28, + "learning_rate": 3.0562690604503787e-06, + "logits/chosen": -0.9716980457305908, + "logits/rejected": -0.9716980457305908, + "logps/chosen": -0.7682275772094727, + "logps/rejected": -0.7682275772094727, + "loss": 0.3567, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.13422921299934387, + "rewards/margins": 0.0, + "rewards/rejected": 0.13422921299934387, + "step": 7865 + }, + { + "epoch": 1.28, + "learning_rate": 3.0550582464491245e-06, + "logits/chosen": -1.095638632774353, + "logits/rejected": -1.1869672536849976, + "logps/chosen": -63.26927947998047, + "logps/rejected": -65.24407958984375, + "loss": 0.9411, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.801236867904663, + "rewards/margins": -1.6739799976348877, + "rewards/rejected": 4.475216865539551, + "step": 7866 + }, + { + "epoch": 1.28, + "learning_rate": 3.053847566824943e-06, + "logits/chosen": -1.690828800201416, + "logits/rejected": -1.5037957429885864, + "logps/chosen": -121.14457702636719, + "logps/rejected": -72.06843566894531, + "loss": 0.5273, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.516003608703613, + "rewards/margins": 2.2068116664886475, + "rewards/rejected": 3.309191942214966, + "step": 7867 + }, + { + "epoch": 1.28, + "learning_rate": 3.05263702166148e-06, + "logits/chosen": -1.3694846630096436, + "logits/rejected": -1.3722060918807983, + "logps/chosen": -55.73229217529297, + "logps/rejected": -28.42200469970703, + "loss": 0.5716, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5609161853790283, + "rewards/margins": -0.7532210350036621, + "rewards/rejected": 2.3141372203826904, + "step": 7868 + }, + { + "epoch": 1.28, + "learning_rate": 3.0514266110423718e-06, + "logits/chosen": -1.5380619764328003, + "logits/rejected": -1.5474258661270142, + "logps/chosen": -67.16065979003906, + "logps/rejected": -101.74675750732422, + "loss": 1.9756, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9438133239746094, + "rewards/margins": -2.678267002105713, + "rewards/rejected": 5.622080326080322, + "step": 7869 + }, + { + "epoch": 1.28, + "learning_rate": 3.050216335051248e-06, + "logits/chosen": -0.9627680778503418, + "logits/rejected": -0.9680020213127136, + "logps/chosen": -8.696832656860352, + "logps/rejected": -7.5718512535095215, + "loss": 0.4149, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.07525882869958878, + "rewards/margins": -0.18041616678237915, + "rewards/rejected": 0.25567498803138733, + "step": 7870 + }, + { + "epoch": 1.28, + "learning_rate": 3.0490061937717265e-06, + "logits/chosen": -1.1270387172698975, + "logits/rejected": -1.0095151662826538, + "logps/chosen": -55.29522705078125, + "logps/rejected": -17.765132904052734, + "loss": 0.0946, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7626748085021973, + "rewards/margins": 2.17529296875, + "rewards/rejected": 0.5873817801475525, + "step": 7871 + }, + { + "epoch": 1.28, + "learning_rate": 3.047796187287416e-06, + "logits/chosen": -1.606132984161377, + "logits/rejected": -1.6491177082061768, + "logps/chosen": -97.48587036132812, + "logps/rejected": -131.4832763671875, + "loss": 1.2321, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.78415846824646, + "rewards/margins": -0.49309229850769043, + "rewards/rejected": 3.2772507667541504, + "step": 7872 + }, + { + "epoch": 1.28, + "learning_rate": 3.0465863156819176e-06, + "logits/chosen": -1.4781115055084229, + "logits/rejected": -1.4263004064559937, + "logps/chosen": -136.2959747314453, + "logps/rejected": -138.93032836914062, + "loss": 0.4377, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.20858907699585, + "rewards/margins": -0.30579710006713867, + "rewards/rejected": 6.514386177062988, + "step": 7873 + }, + { + "epoch": 1.28, + "learning_rate": 3.045376579038821e-06, + "logits/chosen": -1.529274821281433, + "logits/rejected": -1.4362744092941284, + "logps/chosen": -68.02143096923828, + "logps/rejected": -27.359861373901367, + "loss": 0.2665, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.738996148109436, + "rewards/margins": 0.5996278524398804, + "rewards/rejected": 1.1393682956695557, + "step": 7874 + }, + { + "epoch": 1.28, + "learning_rate": 3.044166977441708e-06, + "logits/chosen": -1.110198736190796, + "logits/rejected": -1.1171985864639282, + "logps/chosen": -43.82413101196289, + "logps/rejected": -80.86573791503906, + "loss": 0.8472, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8054134845733643, + "rewards/margins": 0.2637737989425659, + "rewards/rejected": 1.5416396856307983, + "step": 7875 + }, + { + "epoch": 1.28, + "learning_rate": 3.0429575109741503e-06, + "logits/chosen": -1.1806889772415161, + "logits/rejected": -1.3363370895385742, + "logps/chosen": -69.51531982421875, + "logps/rejected": -136.849609375, + "loss": 3.6333, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.534980058670044, + "rewards/margins": -6.900094032287598, + "rewards/rejected": 9.435073852539062, + "step": 7876 + }, + { + "epoch": 1.28, + "learning_rate": 3.0417481797197116e-06, + "logits/chosen": -1.4847685098648071, + "logits/rejected": -1.5010374784469604, + "logps/chosen": -214.3499755859375, + "logps/rejected": -53.30009841918945, + "loss": 0.0264, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.466726779937744, + "rewards/margins": 5.473310470581055, + "rewards/rejected": 0.9934162497520447, + "step": 7877 + }, + { + "epoch": 1.28, + "learning_rate": 3.0405389837619447e-06, + "logits/chosen": -0.960411787033081, + "logits/rejected": -0.960411787033081, + "logps/chosen": -0.25873667001724243, + "logps/rejected": -0.25873667001724243, + "loss": 1.0911, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.07962408661842346, + "rewards/margins": 0.0, + "rewards/rejected": 0.07962408661842346, + "step": 7878 + }, + { + "epoch": 1.28, + "learning_rate": 3.0393299231843924e-06, + "logits/chosen": -1.170598030090332, + "logits/rejected": -1.170598030090332, + "logps/chosen": -55.27587127685547, + "logps/rejected": -55.27587127685547, + "loss": 0.382, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0174310207366943, + "rewards/margins": 0.0, + "rewards/rejected": 3.0174310207366943, + "step": 7879 + }, + { + "epoch": 1.28, + "learning_rate": 3.0381209980705912e-06, + "logits/chosen": -1.1093111038208008, + "logits/rejected": -1.0523432493209839, + "logps/chosen": -35.108978271484375, + "logps/rejected": -56.152259826660156, + "loss": 0.4049, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6207427978515625, + "rewards/margins": -0.21661925315856934, + "rewards/rejected": 3.837362051010132, + "step": 7880 + }, + { + "epoch": 1.28, + "learning_rate": 3.0369122085040654e-06, + "logits/chosen": -1.3745529651641846, + "logits/rejected": -1.250692367553711, + "logps/chosen": -69.95869445800781, + "logps/rejected": -53.94948959350586, + "loss": 0.0724, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1126954555511475, + "rewards/margins": 1.9439069032669067, + "rewards/rejected": 1.1687885522842407, + "step": 7881 + }, + { + "epoch": 1.28, + "learning_rate": 3.035703554568331e-06, + "logits/chosen": -1.324068307876587, + "logits/rejected": -1.3531187772750854, + "logps/chosen": -46.940921783447266, + "logps/rejected": -40.66663360595703, + "loss": 1.9605, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0146052837371826, + "rewards/margins": -1.0648770332336426, + "rewards/rejected": 3.079482316970825, + "step": 7882 + }, + { + "epoch": 1.28, + "learning_rate": 3.0344950363468944e-06, + "logits/chosen": -1.1901586055755615, + "logits/rejected": -1.1740697622299194, + "logps/chosen": -64.45033264160156, + "logps/rejected": -82.25717163085938, + "loss": 0.4548, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4471848011016846, + "rewards/margins": -0.37366795539855957, + "rewards/rejected": 3.820852756500244, + "step": 7883 + }, + { + "epoch": 1.28, + "learning_rate": 3.0332866539232532e-06, + "logits/chosen": -1.5440239906311035, + "logits/rejected": -1.5543458461761475, + "logps/chosen": -59.786293029785156, + "logps/rejected": -59.1034049987793, + "loss": 0.5472, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.733651041984558, + "rewards/margins": -0.2813061475753784, + "rewards/rejected": 2.0149571895599365, + "step": 7884 + }, + { + "epoch": 1.28, + "learning_rate": 3.032078407380895e-06, + "logits/chosen": -1.334132432937622, + "logits/rejected": -1.3049434423446655, + "logps/chosen": -45.74190139770508, + "logps/rejected": -21.041160583496094, + "loss": 0.3143, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2037190198898315, + "rewards/margins": 0.49472928047180176, + "rewards/rejected": 0.7089897394180298, + "step": 7885 + }, + { + "epoch": 1.28, + "learning_rate": 3.0308702968032984e-06, + "logits/chosen": -1.4642932415008545, + "logits/rejected": -1.5147536993026733, + "logps/chosen": -131.3309326171875, + "logps/rejected": -128.01821899414062, + "loss": 3.1864, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3396623134613037, + "rewards/margins": -6.327204704284668, + "rewards/rejected": 8.66686725616455, + "step": 7886 + }, + { + "epoch": 1.28, + "learning_rate": 3.029662322273932e-06, + "logits/chosen": -1.2341549396514893, + "logits/rejected": -1.087143898010254, + "logps/chosen": -233.97239685058594, + "logps/rejected": -23.224565505981445, + "loss": 0.1953, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.40292501449585, + "rewards/margins": 6.8563079833984375, + "rewards/rejected": 0.5466169714927673, + "step": 7887 + }, + { + "epoch": 1.28, + "learning_rate": 3.0284544838762552e-06, + "logits/chosen": -1.5501154661178589, + "logits/rejected": -1.4791358709335327, + "logps/chosen": -44.7008056640625, + "logps/rejected": -21.744932174682617, + "loss": 0.8651, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4494365453720093, + "rewards/margins": -0.36741209030151367, + "rewards/rejected": 1.816848635673523, + "step": 7888 + }, + { + "epoch": 1.28, + "learning_rate": 3.0272467816937197e-06, + "logits/chosen": -0.9565379023551941, + "logits/rejected": -0.9507261514663696, + "logps/chosen": -3.7299511432647705, + "logps/rejected": -2.2381324768066406, + "loss": 1.335, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3200230300426483, + "rewards/margins": 0.12265254557132721, + "rewards/rejected": 0.1973704844713211, + "step": 7889 + }, + { + "epoch": 1.28, + "learning_rate": 3.026039215809764e-06, + "logits/chosen": -1.1478729248046875, + "logits/rejected": -1.0970995426177979, + "logps/chosen": -66.17437744140625, + "logps/rejected": -91.43395233154297, + "loss": 0.4279, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.98087477684021, + "rewards/margins": 1.3536409139633179, + "rewards/rejected": 1.627233862876892, + "step": 7890 + }, + { + "epoch": 1.28, + "learning_rate": 3.0248317863078212e-06, + "logits/chosen": -1.400747299194336, + "logits/rejected": -1.3558855056762695, + "logps/chosen": -39.94198989868164, + "logps/rejected": -23.026195526123047, + "loss": 0.2768, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.54345440864563, + "rewards/margins": 0.5492820739746094, + "rewards/rejected": 1.9941723346710205, + "step": 7891 + }, + { + "epoch": 1.28, + "learning_rate": 3.023624493271313e-06, + "logits/chosen": -1.1818532943725586, + "logits/rejected": -1.2204301357269287, + "logps/chosen": -68.62586975097656, + "logps/rejected": -64.54325866699219, + "loss": 0.4281, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3046233654022217, + "rewards/margins": 0.33690786361694336, + "rewards/rejected": 1.9677155017852783, + "step": 7892 + }, + { + "epoch": 1.28, + "learning_rate": 3.022417336783652e-06, + "logits/chosen": -1.2390395402908325, + "logits/rejected": -1.119385838508606, + "logps/chosen": -61.39771270751953, + "logps/rejected": -28.068927764892578, + "loss": 0.3245, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.808016300201416, + "rewards/margins": 1.0248825550079346, + "rewards/rejected": 1.7831337451934814, + "step": 7893 + }, + { + "epoch": 1.28, + "learning_rate": 3.0212103169282415e-06, + "logits/chosen": -1.5257313251495361, + "logits/rejected": -1.3558127880096436, + "logps/chosen": -137.4855194091797, + "logps/rejected": -78.55827331542969, + "loss": 0.06, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.436402797698975, + "rewards/margins": 2.4537360668182373, + "rewards/rejected": 3.9826667308807373, + "step": 7894 + }, + { + "epoch": 1.28, + "learning_rate": 3.0200034337884733e-06, + "logits/chosen": -1.3506025075912476, + "logits/rejected": -1.276626467704773, + "logps/chosen": -166.15023803710938, + "logps/rejected": -73.24378967285156, + "loss": 0.4161, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.902395725250244, + "rewards/margins": -0.24322938919067383, + "rewards/rejected": 8.145625114440918, + "step": 7895 + }, + { + "epoch": 1.28, + "learning_rate": 3.0187966874477337e-06, + "logits/chosen": -1.2989455461502075, + "logits/rejected": -0.8418905735015869, + "logps/chosen": -40.60661315917969, + "logps/rejected": -75.26801300048828, + "loss": 0.8497, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.438308000564575, + "rewards/margins": -1.496943712234497, + "rewards/rejected": 3.9352517127990723, + "step": 7896 + }, + { + "epoch": 1.28, + "learning_rate": 3.017590077989396e-06, + "logits/chosen": -1.284989595413208, + "logits/rejected": -1.0845191478729248, + "logps/chosen": -109.18095397949219, + "logps/rejected": -38.48421096801758, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.351583957672119, + "rewards/margins": 5.753942966461182, + "rewards/rejected": 0.5976409912109375, + "step": 7897 + }, + { + "epoch": 1.28, + "learning_rate": 3.0163836054968265e-06, + "logits/chosen": -1.5343270301818848, + "logits/rejected": -1.4549837112426758, + "logps/chosen": -143.59194946289062, + "logps/rejected": -87.99005126953125, + "loss": 0.3478, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.462318420410156, + "rewards/margins": 0.06713104248046875, + "rewards/rejected": 7.3951873779296875, + "step": 7898 + }, + { + "epoch": 1.28, + "learning_rate": 3.01517727005338e-06, + "logits/chosen": -1.3901277780532837, + "logits/rejected": -1.3637045621871948, + "logps/chosen": -110.58954620361328, + "logps/rejected": -94.92657470703125, + "loss": 0.1007, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8324944972991943, + "rewards/margins": 1.5404671430587769, + "rewards/rejected": 1.2920273542404175, + "step": 7899 + }, + { + "epoch": 1.28, + "learning_rate": 3.013971071742404e-06, + "logits/chosen": -1.4051181077957153, + "logits/rejected": -1.3999526500701904, + "logps/chosen": -102.25875091552734, + "logps/rejected": -147.32667541503906, + "loss": 0.1009, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.284038543701172, + "rewards/margins": 1.518358588218689, + "rewards/rejected": 1.765679955482483, + "step": 7900 + }, + { + "epoch": 1.28, + "learning_rate": 3.0127650106472358e-06, + "logits/chosen": -1.5259751081466675, + "logits/rejected": -1.5021991729736328, + "logps/chosen": -67.86460876464844, + "logps/rejected": -72.0749740600586, + "loss": 0.6888, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.332043409347534, + "rewards/margins": -0.8233392238616943, + "rewards/rejected": 3.1553826332092285, + "step": 7901 + }, + { + "epoch": 1.28, + "learning_rate": 3.0115590868512013e-06, + "logits/chosen": -0.8976389169692993, + "logits/rejected": -0.8573590517044067, + "logps/chosen": -43.18318176269531, + "logps/rejected": -9.17391586303711, + "loss": 0.7533, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2234241962432861, + "rewards/margins": 0.6572535037994385, + "rewards/rejected": 0.5661706924438477, + "step": 7902 + }, + { + "epoch": 1.28, + "learning_rate": 3.0103533004376183e-06, + "logits/chosen": -1.2867939472198486, + "logits/rejected": -1.2603591680526733, + "logps/chosen": -49.839927673339844, + "logps/rejected": -36.777488708496094, + "loss": 0.0695, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.355762004852295, + "rewards/margins": 2.0717172622680664, + "rewards/rejected": 2.2840447425842285, + "step": 7903 + }, + { + "epoch": 1.28, + "learning_rate": 3.0091476514897975e-06, + "logits/chosen": -1.2187163829803467, + "logits/rejected": -1.2548072338104248, + "logps/chosen": -76.6348648071289, + "logps/rejected": -78.71002960205078, + "loss": 1.5835, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8379783630371094, + "rewards/margins": -1.994110107421875, + "rewards/rejected": 2.8320884704589844, + "step": 7904 + }, + { + "epoch": 1.28, + "learning_rate": 3.0079421400910357e-06, + "logits/chosen": -1.198992133140564, + "logits/rejected": -1.2470160722732544, + "logps/chosen": -111.80570220947266, + "logps/rejected": -115.8050765991211, + "loss": 0.9768, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.964200496673584, + "rewards/margins": -0.6642794609069824, + "rewards/rejected": 5.628479957580566, + "step": 7905 + }, + { + "epoch": 1.28, + "learning_rate": 3.006736766324623e-06, + "logits/chosen": -1.1362743377685547, + "logits/rejected": -1.1362743377685547, + "logps/chosen": -40.80157470703125, + "logps/rejected": -40.80157470703125, + "loss": 0.427, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6912636160850525, + "rewards/margins": 0.0, + "rewards/rejected": 0.6912636160850525, + "step": 7906 + }, + { + "epoch": 1.28, + "learning_rate": 3.005531530273839e-06, + "logits/chosen": -1.0952904224395752, + "logits/rejected": -1.3023126125335693, + "logps/chosen": -102.70770263671875, + "logps/rejected": -37.95444869995117, + "loss": 0.8644, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8954185247421265, + "rewards/margins": -0.6027687788009644, + "rewards/rejected": 2.498187303543091, + "step": 7907 + }, + { + "epoch": 1.28, + "learning_rate": 3.0043264320219546e-06, + "logits/chosen": -1.4629030227661133, + "logits/rejected": -1.2407748699188232, + "logps/chosen": -161.33851623535156, + "logps/rejected": -49.797340393066406, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.987550258636475, + "rewards/margins": 4.003072738647461, + "rewards/rejected": 1.9844772815704346, + "step": 7908 + }, + { + "epoch": 1.28, + "learning_rate": 3.0031214716522304e-06, + "logits/chosen": -1.1714107990264893, + "logits/rejected": -1.0497604608535767, + "logps/chosen": -56.407989501953125, + "logps/rejected": -7.743035793304443, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.352116584777832, + "rewards/margins": 4.588564395904541, + "rewards/rejected": 0.7635523676872253, + "step": 7909 + }, + { + "epoch": 1.28, + "learning_rate": 3.0019166492479187e-06, + "logits/chosen": -0.9906250238418579, + "logits/rejected": -0.9806320071220398, + "logps/chosen": -6.238962650299072, + "logps/rejected": -1.72551429271698, + "loss": 0.4481, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6497330069541931, + "rewards/margins": -0.12413680553436279, + "rewards/rejected": 0.7738698124885559, + "step": 7910 + }, + { + "epoch": 1.28, + "learning_rate": 3.00071196489226e-06, + "logits/chosen": -1.324404001235962, + "logits/rejected": -1.1452643871307373, + "logps/chosen": -111.72264099121094, + "logps/rejected": -31.449758529663086, + "loss": 0.3, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.466441631317139, + "rewards/margins": 5.464188575744629, + "rewards/rejected": 0.0022531510330736637, + "step": 7911 + }, + { + "epoch": 1.28, + "learning_rate": 2.999507418668487e-06, + "logits/chosen": -0.943723201751709, + "logits/rejected": -1.009522795677185, + "logps/chosen": -103.3471908569336, + "logps/rejected": -79.52960968017578, + "loss": 1.4283, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.031325578689575, + "rewards/margins": 1.608803629875183, + "rewards/rejected": 1.422521948814392, + "step": 7912 + }, + { + "epoch": 1.28, + "learning_rate": 2.998303010659822e-06, + "logits/chosen": -1.3675647974014282, + "logits/rejected": -0.8758198618888855, + "logps/chosen": -188.5496826171875, + "logps/rejected": -275.4869384765625, + "loss": 0.2772, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.0223388671875, + "rewards/margins": 0.66400146484375, + "rewards/rejected": 6.35833740234375, + "step": 7913 + }, + { + "epoch": 1.28, + "learning_rate": 2.9970987409494784e-06, + "logits/chosen": -0.8670315742492676, + "logits/rejected": -0.8769835829734802, + "logps/chosen": -0.707741916179657, + "logps/rejected": -26.834896087646484, + "loss": 0.7233, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.18401721119880676, + "rewards/margins": -0.11720368266105652, + "rewards/rejected": 0.3012208938598633, + "step": 7914 + }, + { + "epoch": 1.28, + "learning_rate": 2.995894609620661e-06, + "logits/chosen": -1.302889347076416, + "logits/rejected": -1.1657768487930298, + "logps/chosen": -43.68130111694336, + "logps/rejected": -89.97026824951172, + "loss": 1.8751, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0993077754974365, + "rewards/margins": -3.53244948387146, + "rewards/rejected": 6.6317572593688965, + "step": 7915 + }, + { + "epoch": 1.28, + "learning_rate": 2.994690616756563e-06, + "logits/chosen": -1.405225396156311, + "logits/rejected": -1.337878942489624, + "logps/chosen": -61.86588668823242, + "logps/rejected": -63.672035217285156, + "loss": 0.4904, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8689961433410645, + "rewards/margins": -0.5061144828796387, + "rewards/rejected": 3.375110626220703, + "step": 7916 + }, + { + "epoch": 1.29, + "learning_rate": 2.993486762440369e-06, + "logits/chosen": -1.0421761274337769, + "logits/rejected": -1.0453957319259644, + "logps/chosen": -76.04489135742188, + "logps/rejected": -132.0750732421875, + "loss": 0.7844, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5450447797775269, + "rewards/margins": -0.43011391162872314, + "rewards/rejected": 1.97515869140625, + "step": 7917 + }, + { + "epoch": 1.29, + "learning_rate": 2.9922830467552542e-06, + "logits/chosen": -1.2464349269866943, + "logits/rejected": -1.0915982723236084, + "logps/chosen": -61.54498291015625, + "logps/rejected": -13.106462478637695, + "loss": 0.3126, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.850743055343628, + "rewards/margins": 1.788830280303955, + "rewards/rejected": 1.0619127750396729, + "step": 7918 + }, + { + "epoch": 1.29, + "learning_rate": 2.991079469784383e-06, + "logits/chosen": -1.0922746658325195, + "logits/rejected": -0.9620346426963806, + "logps/chosen": -102.17407989501953, + "logps/rejected": -85.33425903320312, + "loss": 0.0966, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.0017266273498535, + "rewards/margins": 2.7999656200408936, + "rewards/rejected": 3.20176100730896, + "step": 7919 + }, + { + "epoch": 1.29, + "learning_rate": 2.9898760316109123e-06, + "logits/chosen": -1.2482935190200806, + "logits/rejected": -1.2925868034362793, + "logps/chosen": -27.031021118164062, + "logps/rejected": -115.07650756835938, + "loss": 0.458, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9461711645126343, + "rewards/margins": 0.06933403015136719, + "rewards/rejected": 1.876837134361267, + "step": 7920 + }, + { + "epoch": 1.29, + "learning_rate": 2.9886727323179877e-06, + "logits/chosen": -1.468773365020752, + "logits/rejected": -1.371031403541565, + "logps/chosen": -161.07168579101562, + "logps/rejected": -18.760482788085938, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.225406169891357, + "rewards/margins": 4.947604656219482, + "rewards/rejected": 0.2778013348579407, + "step": 7921 + }, + { + "epoch": 1.29, + "learning_rate": 2.9874695719887463e-06, + "logits/chosen": -1.5160704851150513, + "logits/rejected": -1.4868510961532593, + "logps/chosen": -61.126556396484375, + "logps/rejected": -42.019840240478516, + "loss": 0.9216, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8690872192382812, + "rewards/margins": -1.6676533222198486, + "rewards/rejected": 3.53674054145813, + "step": 7922 + }, + { + "epoch": 1.29, + "learning_rate": 2.986266550706315e-06, + "logits/chosen": -1.1507527828216553, + "logits/rejected": -1.0900031328201294, + "logps/chosen": -76.6364974975586, + "logps/rejected": -62.07353973388672, + "loss": 0.1579, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6632134914398193, + "rewards/margins": 1.2601494789123535, + "rewards/rejected": 1.4030640125274658, + "step": 7923 + }, + { + "epoch": 1.29, + "learning_rate": 2.9850636685538104e-06, + "logits/chosen": -1.2521952390670776, + "logits/rejected": -1.2281241416931152, + "logps/chosen": -87.74695587158203, + "logps/rejected": -49.90520477294922, + "loss": 0.7055, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5213615894317627, + "rewards/margins": -0.8023033142089844, + "rewards/rejected": 2.323664903640747, + "step": 7924 + }, + { + "epoch": 1.29, + "learning_rate": 2.9838609256143402e-06, + "logits/chosen": -1.4137564897537231, + "logits/rejected": -1.5155341625213623, + "logps/chosen": -60.67308044433594, + "logps/rejected": -92.986083984375, + "loss": 0.7316, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6128029823303223, + "rewards/margins": -0.6942000389099121, + "rewards/rejected": 3.3070030212402344, + "step": 7925 + }, + { + "epoch": 1.29, + "learning_rate": 2.9826583219710036e-06, + "logits/chosen": -0.9040117859840393, + "logits/rejected": -0.9040117859840393, + "logps/chosen": -13.235888481140137, + "logps/rejected": -13.235888481140137, + "loss": 1.1831, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.457369327545166, + "rewards/margins": 0.0, + "rewards/rejected": 0.457369327545166, + "step": 7926 + }, + { + "epoch": 1.29, + "learning_rate": 2.9814558577068893e-06, + "logits/chosen": -1.3366550207138062, + "logits/rejected": -1.3391250371932983, + "logps/chosen": -57.044918060302734, + "logps/rejected": -56.66963195800781, + "loss": 0.7461, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9549579620361328, + "rewards/margins": -0.20169425010681152, + "rewards/rejected": 2.1566522121429443, + "step": 7927 + }, + { + "epoch": 1.29, + "learning_rate": 2.980253532905075e-06, + "logits/chosen": -1.5437960624694824, + "logits/rejected": -1.5389500856399536, + "logps/chosen": -74.46035766601562, + "logps/rejected": -72.23716735839844, + "loss": 0.7868, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2279701232910156, + "rewards/margins": -0.5306282043457031, + "rewards/rejected": 2.7585983276367188, + "step": 7928 + }, + { + "epoch": 1.29, + "learning_rate": 2.9790513476486305e-06, + "logits/chosen": -1.2358468770980835, + "logits/rejected": -1.235977292060852, + "logps/chosen": -55.97299575805664, + "logps/rejected": -65.68982696533203, + "loss": 1.0786, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1236469745635986, + "rewards/margins": 1.837412714958191, + "rewards/rejected": 0.2862342894077301, + "step": 7929 + }, + { + "epoch": 1.29, + "learning_rate": 2.9778493020206155e-06, + "logits/chosen": -1.5803053379058838, + "logits/rejected": -1.490829348564148, + "logps/chosen": -137.5150909423828, + "logps/rejected": -48.07355499267578, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.14871072769165, + "rewards/margins": 4.222097873687744, + "rewards/rejected": 1.9266128540039062, + "step": 7930 + }, + { + "epoch": 1.29, + "learning_rate": 2.976647396104081e-06, + "logits/chosen": -1.2740973234176636, + "logits/rejected": -1.2323691844940186, + "logps/chosen": -70.59059143066406, + "logps/rejected": -83.146240234375, + "loss": 0.6925, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.633749485015869, + "rewards/margins": -0.9652190208435059, + "rewards/rejected": 3.598968505859375, + "step": 7931 + }, + { + "epoch": 1.29, + "learning_rate": 2.975445629982065e-06, + "logits/chosen": -0.8535997867584229, + "logits/rejected": -0.8227804899215698, + "logps/chosen": -41.12257766723633, + "logps/rejected": -37.34663391113281, + "loss": 0.5173, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1308602094650269, + "rewards/margins": 0.21937412023544312, + "rewards/rejected": 0.9114860892295837, + "step": 7932 + }, + { + "epoch": 1.29, + "learning_rate": 2.9742440037375996e-06, + "logits/chosen": -1.6037771701812744, + "logits/rejected": -1.33038330078125, + "logps/chosen": -121.37983703613281, + "logps/rejected": -23.620159149169922, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.327481269836426, + "rewards/margins": 6.081226825714111, + "rewards/rejected": 0.2462543547153473, + "step": 7933 + }, + { + "epoch": 1.29, + "learning_rate": 2.9730425174537057e-06, + "logits/chosen": -1.3170441389083862, + "logits/rejected": -1.3168463706970215, + "logps/chosen": -79.0755615234375, + "logps/rejected": -91.55943298339844, + "loss": 0.1782, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.794818162918091, + "rewards/margins": 1.2498565912246704, + "rewards/rejected": 1.5449615716934204, + "step": 7934 + }, + { + "epoch": 1.29, + "learning_rate": 2.9718411712133956e-06, + "logits/chosen": -1.419564962387085, + "logits/rejected": -1.4280365705490112, + "logps/chosen": -69.82608032226562, + "logps/rejected": -63.731422424316406, + "loss": 0.9852, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7961563467979431, + "rewards/margins": -0.41519469022750854, + "rewards/rejected": 1.2113510370254517, + "step": 7935 + }, + { + "epoch": 1.29, + "learning_rate": 2.9706399650996687e-06, + "logits/chosen": -1.3058743476867676, + "logits/rejected": -1.328733205795288, + "logps/chosen": -77.21869659423828, + "logps/rejected": -47.66497039794922, + "loss": 0.5485, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7093429565429688, + "rewards/margins": -0.618248701095581, + "rewards/rejected": 2.32759165763855, + "step": 7936 + }, + { + "epoch": 1.29, + "learning_rate": 2.969438899195519e-06, + "logits/chosen": -1.5209059715270996, + "logits/rejected": -1.4137343168258667, + "logps/chosen": -93.29291534423828, + "logps/rejected": -73.44121551513672, + "loss": 0.0833, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.303536415100098, + "rewards/margins": 4.034761905670166, + "rewards/rejected": 2.2687745094299316, + "step": 7937 + }, + { + "epoch": 1.29, + "learning_rate": 2.968237973583928e-06, + "logits/chosen": -1.326765775680542, + "logits/rejected": -1.4003493785858154, + "logps/chosen": -85.95429992675781, + "logps/rejected": -95.1920166015625, + "loss": 1.9552, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.5876359939575195, + "rewards/margins": -3.3392438888549805, + "rewards/rejected": 8.9268798828125, + "step": 7938 + }, + { + "epoch": 1.29, + "learning_rate": 2.967037188347868e-06, + "logits/chosen": -1.8759958744049072, + "logits/rejected": -1.4757124185562134, + "logps/chosen": -94.80668640136719, + "logps/rejected": -54.248809814453125, + "loss": 0.4993, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4397339820861816, + "rewards/margins": 0.1428360939025879, + "rewards/rejected": 2.2968978881835938, + "step": 7939 + }, + { + "epoch": 1.29, + "learning_rate": 2.965836543570302e-06, + "logits/chosen": -1.1928989887237549, + "logits/rejected": -1.1745778322219849, + "logps/chosen": -36.7374153137207, + "logps/rejected": -23.63576889038086, + "loss": 0.3124, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4529556035995483, + "rewards/margins": 0.6943170428276062, + "rewards/rejected": 0.7586385607719421, + "step": 7940 + }, + { + "epoch": 1.29, + "learning_rate": 2.964636039334184e-06, + "logits/chosen": -1.6845024824142456, + "logits/rejected": -1.7061035633087158, + "logps/chosen": -90.47645568847656, + "logps/rejected": -79.43342590332031, + "loss": 0.2212, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.852665662765503, + "rewards/margins": 1.3237684965133667, + "rewards/rejected": 1.5288971662521362, + "step": 7941 + }, + { + "epoch": 1.29, + "learning_rate": 2.963435675722456e-06, + "logits/chosen": -1.3882631063461304, + "logits/rejected": -1.1790562868118286, + "logps/chosen": -143.78271484375, + "logps/rejected": -71.01426696777344, + "loss": 0.3748, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.271450996398926, + "rewards/margins": 2.8449831008911133, + "rewards/rejected": 3.4264678955078125, + "step": 7942 + }, + { + "epoch": 1.29, + "learning_rate": 2.9622354528180535e-06, + "logits/chosen": -1.173143744468689, + "logits/rejected": -1.1384356021881104, + "logps/chosen": -57.74956130981445, + "logps/rejected": -72.36004638671875, + "loss": 0.3147, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9036426544189453, + "rewards/margins": 0.17650794982910156, + "rewards/rejected": 2.7271347045898438, + "step": 7943 + }, + { + "epoch": 1.29, + "learning_rate": 2.9610353707038995e-06, + "logits/chosen": -1.3711450099945068, + "logits/rejected": -1.287163257598877, + "logps/chosen": -69.3760986328125, + "logps/rejected": -35.634307861328125, + "loss": 0.1014, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.444911479949951, + "rewards/margins": 4.921546459197998, + "rewards/rejected": 2.523365020751953, + "step": 7944 + }, + { + "epoch": 1.29, + "learning_rate": 2.9598354294629083e-06, + "logits/chosen": -1.1930164098739624, + "logits/rejected": -0.9373378157615662, + "logps/chosen": -107.3926010131836, + "logps/rejected": -40.916255950927734, + "loss": 0.0206, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.856906890869141, + "rewards/margins": 3.21825909614563, + "rewards/rejected": 2.6386477947235107, + "step": 7945 + }, + { + "epoch": 1.29, + "learning_rate": 2.958635629177985e-06, + "logits/chosen": -1.0041743516921997, + "logits/rejected": -1.0176939964294434, + "logps/chosen": -1.575000286102295, + "logps/rejected": -33.70977783203125, + "loss": 0.5529, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.34030967950820923, + "rewards/margins": -0.6814127564430237, + "rewards/rejected": 1.021722435951233, + "step": 7946 + }, + { + "epoch": 1.29, + "learning_rate": 2.9574359699320244e-06, + "logits/chosen": -1.3737945556640625, + "logits/rejected": -1.431516408920288, + "logps/chosen": -117.94378662109375, + "logps/rejected": -123.76074981689453, + "loss": 0.4446, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.887425422668457, + "rewards/margins": 0.3116569519042969, + "rewards/rejected": 6.57576847076416, + "step": 7947 + }, + { + "epoch": 1.29, + "learning_rate": 2.9562364518079106e-06, + "logits/chosen": -1.1438884735107422, + "logits/rejected": -1.1438884735107422, + "logps/chosen": -12.740325927734375, + "logps/rejected": -12.740325927734375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6437808871269226, + "rewards/margins": 0.0, + "rewards/rejected": 0.6437808871269226, + "step": 7948 + }, + { + "epoch": 1.29, + "learning_rate": 2.9550370748885203e-06, + "logits/chosen": -0.9716694951057434, + "logits/rejected": -0.9704734086990356, + "logps/chosen": -66.92446899414062, + "logps/rejected": -50.12378692626953, + "loss": 2.4594, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.722882866859436, + "rewards/margins": -0.35690534114837646, + "rewards/rejected": 2.0797882080078125, + "step": 7949 + }, + { + "epoch": 1.29, + "learning_rate": 2.9538378392567168e-06, + "logits/chosen": -1.5545521974563599, + "logits/rejected": -1.4891881942749023, + "logps/chosen": -30.04132080078125, + "logps/rejected": -30.538145065307617, + "loss": 0.7704, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4739913940429688, + "rewards/margins": -1.2796814441680908, + "rewards/rejected": 3.7536728382110596, + "step": 7950 + }, + { + "epoch": 1.29, + "learning_rate": 2.9526387449953597e-06, + "logits/chosen": -1.337614893913269, + "logits/rejected": -1.294607162475586, + "logps/chosen": -58.295127868652344, + "logps/rejected": -41.25190734863281, + "loss": 1.2036, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9002227783203125, + "rewards/margins": 1.3230955600738525, + "rewards/rejected": 2.57712721824646, + "step": 7951 + }, + { + "epoch": 1.29, + "learning_rate": 2.9514397921872923e-06, + "logits/chosen": -1.4966028928756714, + "logits/rejected": -1.3658366203308105, + "logps/chosen": -143.67425537109375, + "logps/rejected": -56.18772888183594, + "loss": 0.0825, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.602151393890381, + "rewards/margins": 2.0171263217926025, + "rewards/rejected": 2.5850250720977783, + "step": 7952 + }, + { + "epoch": 1.29, + "learning_rate": 2.9502409809153514e-06, + "logits/chosen": -1.3137569427490234, + "logits/rejected": -1.2407444715499878, + "logps/chosen": -47.5616569519043, + "logps/rejected": -64.97120666503906, + "loss": 0.7526, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0953099727630615, + "rewards/margins": 1.5224515199661255, + "rewards/rejected": 1.572858452796936, + "step": 7953 + }, + { + "epoch": 1.29, + "learning_rate": 2.9490423112623646e-06, + "logits/chosen": -1.526050329208374, + "logits/rejected": -1.2114782333374023, + "logps/chosen": -103.23815155029297, + "logps/rejected": -13.116313934326172, + "loss": 0.0188, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.75186824798584, + "rewards/margins": 7.159284591674805, + "rewards/rejected": 1.5925835371017456, + "step": 7954 + }, + { + "epoch": 1.29, + "learning_rate": 2.9478437833111466e-06, + "logits/chosen": -1.1983944177627563, + "logits/rejected": -1.1247901916503906, + "logps/chosen": -109.64574432373047, + "logps/rejected": -92.73013305664062, + "loss": 0.3542, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.133460283279419, + "rewards/margins": 2.4101548194885254, + "rewards/rejected": 0.7233055233955383, + "step": 7955 + }, + { + "epoch": 1.29, + "learning_rate": 2.9466453971445065e-06, + "logits/chosen": -1.427362084388733, + "logits/rejected": -1.447018027305603, + "logps/chosen": -42.49574661254883, + "logps/rejected": -49.84066390991211, + "loss": 0.7816, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1383607387542725, + "rewards/margins": -0.02230548858642578, + "rewards/rejected": 2.1606662273406982, + "step": 7956 + }, + { + "epoch": 1.29, + "learning_rate": 2.94544715284524e-06, + "logits/chosen": -1.5087367296218872, + "logits/rejected": -1.4758408069610596, + "logps/chosen": -57.74216842651367, + "logps/rejected": -65.37835693359375, + "loss": 4.2151, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9926670789718628, + "rewards/margins": -0.0440746545791626, + "rewards/rejected": 2.0367417335510254, + "step": 7957 + }, + { + "epoch": 1.29, + "learning_rate": 2.944249050496135e-06, + "logits/chosen": -1.2822892665863037, + "logits/rejected": -1.3225523233413696, + "logps/chosen": -44.192138671875, + "logps/rejected": -56.35905838012695, + "loss": 0.7096, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.552232503890991, + "rewards/margins": -0.2895076274871826, + "rewards/rejected": 3.841740131378174, + "step": 7958 + }, + { + "epoch": 1.29, + "learning_rate": 2.9430510901799687e-06, + "logits/chosen": -0.8873611092567444, + "logits/rejected": -0.8883100748062134, + "logps/chosen": -3.9877493381500244, + "logps/rejected": -3.8120779991149902, + "loss": 0.7072, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.22242425382137299, + "rewards/margins": -0.24969981610774994, + "rewards/rejected": 0.4721240699291229, + "step": 7959 + }, + { + "epoch": 1.29, + "learning_rate": 2.941853271979509e-06, + "logits/chosen": -1.2186121940612793, + "logits/rejected": -1.2224146127700806, + "logps/chosen": -22.328575134277344, + "logps/rejected": -4.916329383850098, + "loss": 0.7213, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48572197556495667, + "rewards/margins": 0.005245208740234375, + "rewards/rejected": 0.4804767668247223, + "step": 7960 + }, + { + "epoch": 1.29, + "learning_rate": 2.940655595977514e-06, + "logits/chosen": -1.6639715433120728, + "logits/rejected": -1.6161668300628662, + "logps/chosen": -167.855224609375, + "logps/rejected": -80.11785125732422, + "loss": 0.0638, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.221337795257568, + "rewards/margins": 3.328928232192993, + "rewards/rejected": 1.8924095630645752, + "step": 7961 + }, + { + "epoch": 1.29, + "learning_rate": 2.9394580622567316e-06, + "logits/chosen": -1.2228041887283325, + "logits/rejected": -1.2251774072647095, + "logps/chosen": -94.82231140136719, + "logps/rejected": -90.8836898803711, + "loss": 0.7353, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.127910614013672, + "rewards/margins": 0.7532165050506592, + "rewards/rejected": 1.3746941089630127, + "step": 7962 + }, + { + "epoch": 1.29, + "learning_rate": 2.9382606708999007e-06, + "logits/chosen": -1.2732243537902832, + "logits/rejected": -1.2730437517166138, + "logps/chosen": -59.28404998779297, + "logps/rejected": -95.50762176513672, + "loss": 0.1992, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.049288272857666, + "rewards/margins": 1.0151993036270142, + "rewards/rejected": 1.0340889692306519, + "step": 7963 + }, + { + "epoch": 1.29, + "learning_rate": 2.9370634219897482e-06, + "logits/chosen": -1.2600723505020142, + "logits/rejected": -1.2684905529022217, + "logps/chosen": -50.780250549316406, + "logps/rejected": -37.016510009765625, + "loss": 0.4351, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0618979930877686, + "rewards/margins": 0.42872917652130127, + "rewards/rejected": 1.6331688165664673, + "step": 7964 + }, + { + "epoch": 1.29, + "learning_rate": 2.935866315608994e-06, + "logits/chosen": -1.0220825672149658, + "logits/rejected": -0.9894483089447021, + "logps/chosen": -42.21773147583008, + "logps/rejected": -3.944842576980591, + "loss": 0.3569, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.623767077922821, + "rewards/margins": 0.2016141712665558, + "rewards/rejected": 0.42215290665626526, + "step": 7965 + }, + { + "epoch": 1.29, + "learning_rate": 2.9346693518403456e-06, + "logits/chosen": -1.6592930555343628, + "logits/rejected": -1.764811396598816, + "logps/chosen": -76.6019058227539, + "logps/rejected": -152.92921447753906, + "loss": 1.2092, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4724464416503906, + "rewards/margins": -2.3209800720214844, + "rewards/rejected": 4.793426513671875, + "step": 7966 + }, + { + "epoch": 1.29, + "learning_rate": 2.9334725307665034e-06, + "logits/chosen": -1.5534700155258179, + "logits/rejected": -1.5318633317947388, + "logps/chosen": -67.91986083984375, + "logps/rejected": -42.82235336303711, + "loss": 0.2681, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.936882734298706, + "rewards/margins": 0.6192805767059326, + "rewards/rejected": 2.3176021575927734, + "step": 7967 + }, + { + "epoch": 1.29, + "learning_rate": 2.9322758524701557e-06, + "logits/chosen": -1.2975432872772217, + "logits/rejected": -1.352211833000183, + "logps/chosen": -62.9936408996582, + "logps/rejected": -134.41827392578125, + "loss": 1.8616, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.567548751831055, + "rewards/margins": -3.3383288383483887, + "rewards/rejected": 7.905877590179443, + "step": 7968 + }, + { + "epoch": 1.29, + "learning_rate": 2.9310793170339813e-06, + "logits/chosen": -1.305761456489563, + "logits/rejected": -1.2246092557907104, + "logps/chosen": -75.35267639160156, + "logps/rejected": -78.26073455810547, + "loss": 0.9086, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7729103565216064, + "rewards/margins": 0.8621841669082642, + "rewards/rejected": 1.9107261896133423, + "step": 7969 + }, + { + "epoch": 1.29, + "learning_rate": 2.92988292454065e-06, + "logits/chosen": -1.4542226791381836, + "logits/rejected": -1.3900104761123657, + "logps/chosen": -161.2317352294922, + "logps/rejected": -73.5466537475586, + "loss": 0.1409, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9204773902893066, + "rewards/margins": 1.150305986404419, + "rewards/rejected": 2.7701714038848877, + "step": 7970 + }, + { + "epoch": 1.29, + "learning_rate": 2.9286866750728203e-06, + "logits/chosen": -0.8990979790687561, + "logits/rejected": -0.9221150875091553, + "logps/chosen": -57.78392791748047, + "logps/rejected": -82.2469482421875, + "loss": 0.812, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3802857398986816, + "rewards/margins": -1.0237257480621338, + "rewards/rejected": 3.4040114879608154, + "step": 7971 + }, + { + "epoch": 1.29, + "learning_rate": 2.9274905687131437e-06, + "logits/chosen": -0.5950641632080078, + "logits/rejected": -0.5950641632080078, + "logps/chosen": -1.4016696214675903, + "logps/rejected": -1.4016696214675903, + "loss": 0.4278, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.33833643794059753, + "rewards/margins": 0.0, + "rewards/rejected": 0.33833643794059753, + "step": 7972 + }, + { + "epoch": 1.29, + "learning_rate": 2.9262946055442577e-06, + "logits/chosen": -1.1577341556549072, + "logits/rejected": -1.1502681970596313, + "logps/chosen": -66.15382385253906, + "logps/rejected": -60.17123031616211, + "loss": 1.5494, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7677124738693237, + "rewards/margins": -0.677636981010437, + "rewards/rejected": 2.4453494548797607, + "step": 7973 + }, + { + "epoch": 1.29, + "learning_rate": 2.9250987856487932e-06, + "logits/chosen": -1.5592573881149292, + "logits/rejected": -1.3155112266540527, + "logps/chosen": -89.71389770507812, + "logps/rejected": -54.62495803833008, + "loss": 0.1357, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.160226345062256, + "rewards/margins": 1.66591215133667, + "rewards/rejected": 4.494314193725586, + "step": 7974 + }, + { + "epoch": 1.29, + "learning_rate": 2.9239031091093696e-06, + "logits/chosen": -1.5318629741668701, + "logits/rejected": -1.513401985168457, + "logps/chosen": -33.555667877197266, + "logps/rejected": -40.68843078613281, + "loss": 0.4236, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0011730194091797, + "rewards/margins": 0.3276712894439697, + "rewards/rejected": 2.67350172996521, + "step": 7975 + }, + { + "epoch": 1.29, + "learning_rate": 2.9227075760085965e-06, + "logits/chosen": -1.5364904403686523, + "logits/rejected": -1.5934436321258545, + "logps/chosen": -165.23475646972656, + "logps/rejected": -129.6917724609375, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.670838832855225, + "rewards/margins": 3.6213347911834717, + "rewards/rejected": 2.049504041671753, + "step": 7976 + }, + { + "epoch": 1.29, + "learning_rate": 2.9215121864290754e-06, + "logits/chosen": -1.2924304008483887, + "logits/rejected": -1.230981707572937, + "logps/chosen": -64.48953247070312, + "logps/rejected": -48.940643310546875, + "loss": 1.0248, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.3135881423950195, + "rewards/margins": -1.550523281097412, + "rewards/rejected": 5.864111423492432, + "step": 7977 + }, + { + "epoch": 1.29, + "learning_rate": 2.9203169404533937e-06, + "logits/chosen": -1.0268868207931519, + "logits/rejected": -1.1341763734817505, + "logps/chosen": -54.31041717529297, + "logps/rejected": -55.240806579589844, + "loss": 2.6448, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7254719138145447, + "rewards/margins": -5.201220989227295, + "rewards/rejected": 5.926692962646484, + "step": 7978 + }, + { + "epoch": 1.3, + "learning_rate": 2.919121838164134e-06, + "logits/chosen": -1.0370607376098633, + "logits/rejected": -1.030936598777771, + "logps/chosen": -21.231815338134766, + "logps/rejected": -11.069731712341309, + "loss": 0.2118, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9905662536621094, + "rewards/margins": 0.9764496088027954, + "rewards/rejected": 1.014116644859314, + "step": 7979 + }, + { + "epoch": 1.3, + "learning_rate": 2.9179268796438664e-06, + "logits/chosen": -1.254844307899475, + "logits/rejected": -1.3413503170013428, + "logps/chosen": -140.2022247314453, + "logps/rejected": -141.12217712402344, + "loss": 1.6883, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.889584541320801, + "rewards/margins": -3.315333366394043, + "rewards/rejected": 9.204917907714844, + "step": 7980 + }, + { + "epoch": 1.3, + "learning_rate": 2.9167320649751494e-06, + "logits/chosen": -1.1721142530441284, + "logits/rejected": -1.0663890838623047, + "logps/chosen": -153.12193298339844, + "logps/rejected": -94.4630355834961, + "loss": 0.1865, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.582331895828247, + "rewards/margins": 1.5912208557128906, + "rewards/rejected": 0.9911109805107117, + "step": 7981 + }, + { + "epoch": 1.3, + "learning_rate": 2.9155373942405372e-06, + "logits/chosen": -1.3629605770111084, + "logits/rejected": -1.1953840255737305, + "logps/chosen": -104.6489486694336, + "logps/rejected": -23.16888999938965, + "loss": 0.1252, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.279605865478516, + "rewards/margins": 5.525125980377197, + "rewards/rejected": 0.7544800043106079, + "step": 7982 + }, + { + "epoch": 1.3, + "learning_rate": 2.914342867522565e-06, + "logits/chosen": -1.4001153707504272, + "logits/rejected": -1.293350100517273, + "logps/chosen": -187.4434814453125, + "logps/rejected": -62.906639099121094, + "loss": 0.1121, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.972204685211182, + "rewards/margins": 2.525275468826294, + "rewards/rejected": 2.4469292163848877, + "step": 7983 + }, + { + "epoch": 1.3, + "learning_rate": 2.9131484849037683e-06, + "logits/chosen": -0.9004204273223877, + "logits/rejected": -0.7838776707649231, + "logps/chosen": -28.51259422302246, + "logps/rejected": -14.583797454833984, + "loss": 0.4205, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3345448970794678, + "rewards/margins": 0.42704856395721436, + "rewards/rejected": 1.9074963331222534, + "step": 7984 + }, + { + "epoch": 1.3, + "learning_rate": 2.9119542464666627e-06, + "logits/chosen": -1.4998142719268799, + "logits/rejected": -1.4601680040359497, + "logps/chosen": -101.38395690917969, + "logps/rejected": -130.8211669921875, + "loss": 1.7256, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.172750949859619, + "rewards/margins": -2.2181334495544434, + "rewards/rejected": 8.390884399414062, + "step": 7985 + }, + { + "epoch": 1.3, + "learning_rate": 2.9107601522937638e-06, + "logits/chosen": -1.0718728303909302, + "logits/rejected": -1.0718728303909302, + "logps/chosen": -79.76321411132812, + "logps/rejected": -79.76321411132812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4322190284729004, + "rewards/margins": 0.0, + "rewards/rejected": 2.4322190284729004, + "step": 7986 + }, + { + "epoch": 1.3, + "learning_rate": 2.909566202467568e-06, + "logits/chosen": -1.1283433437347412, + "logits/rejected": -1.1017817258834839, + "logps/chosen": -52.565773010253906, + "logps/rejected": -90.49230194091797, + "loss": 0.726, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9294822216033936, + "rewards/margins": 1.7360045909881592, + "rewards/rejected": 1.1934776306152344, + "step": 7987 + }, + { + "epoch": 1.3, + "learning_rate": 2.90837239707057e-06, + "logits/chosen": -1.1179500818252563, + "logits/rejected": -0.9840983152389526, + "logps/chosen": -44.598167419433594, + "logps/rejected": -12.833879470825195, + "loss": 0.4035, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.57035231590271, + "rewards/margins": 2.0811309814453125, + "rewards/rejected": 0.48922139406204224, + "step": 7988 + }, + { + "epoch": 1.3, + "learning_rate": 2.9071787361852455e-06, + "logits/chosen": -1.2424967288970947, + "logits/rejected": -1.2745349407196045, + "logps/chosen": -76.37104797363281, + "logps/rejected": -95.32389068603516, + "loss": 0.3108, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.08872389793396, + "rewards/margins": 1.58854079246521, + "rewards/rejected": 1.50018310546875, + "step": 7989 + }, + { + "epoch": 1.3, + "learning_rate": 2.9059852198940696e-06, + "logits/chosen": -1.1285505294799805, + "logits/rejected": -1.1396541595458984, + "logps/chosen": -52.37897491455078, + "logps/rejected": -42.3681640625, + "loss": 0.5555, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5953811407089233, + "rewards/margins": -0.41445696353912354, + "rewards/rejected": 2.009838104248047, + "step": 7990 + }, + { + "epoch": 1.3, + "learning_rate": 2.9047918482795003e-06, + "logits/chosen": -1.4783862829208374, + "logits/rejected": -1.4783862829208374, + "logps/chosen": -51.031517028808594, + "logps/rejected": -51.031517028808594, + "loss": 0.501, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.947899580001831, + "rewards/margins": 0.0, + "rewards/rejected": 2.947899580001831, + "step": 7991 + }, + { + "epoch": 1.3, + "learning_rate": 2.903598621423991e-06, + "logits/chosen": -1.4683520793914795, + "logits/rejected": -1.5744410753250122, + "logps/chosen": -103.55037689208984, + "logps/rejected": -133.60995483398438, + "loss": 1.1874, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.483051300048828, + "rewards/margins": -1.2088112831115723, + "rewards/rejected": 5.6918625831604, + "step": 7992 + }, + { + "epoch": 1.3, + "learning_rate": 2.902405539409978e-06, + "logits/chosen": -1.326887845993042, + "logits/rejected": -1.326887845993042, + "logps/chosen": -41.64078140258789, + "logps/rejected": -41.64078140258789, + "loss": 0.353, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.630433559417725, + "rewards/margins": 0.0, + "rewards/rejected": 5.630433559417725, + "step": 7993 + }, + { + "epoch": 1.3, + "learning_rate": 2.9012126023198973e-06, + "logits/chosen": -1.3202611207962036, + "logits/rejected": -1.0967998504638672, + "logps/chosen": -68.2557601928711, + "logps/rejected": -16.466053009033203, + "loss": 0.0382, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5504798889160156, + "rewards/margins": 3.1994621753692627, + "rewards/rejected": 0.3510177731513977, + "step": 7994 + }, + { + "epoch": 1.3, + "learning_rate": 2.9000198102361645e-06, + "logits/chosen": -1.5469332933425903, + "logits/rejected": -1.6018803119659424, + "logps/chosen": -91.48463439941406, + "logps/rejected": -38.217464447021484, + "loss": 0.0212, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.154820442199707, + "rewards/margins": 4.112148761749268, + "rewards/rejected": 0.04267158731818199, + "step": 7995 + }, + { + "epoch": 1.3, + "learning_rate": 2.898827163241195e-06, + "logits/chosen": -1.099582552909851, + "logits/rejected": -1.099582552909851, + "logps/chosen": -2.4556422233581543, + "logps/rejected": -2.4556422233581543, + "loss": 0.6402, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7075345516204834, + "rewards/margins": 0.0, + "rewards/rejected": 0.7075345516204834, + "step": 7996 + }, + { + "epoch": 1.3, + "learning_rate": 2.897634661417384e-06, + "logits/chosen": -1.2906064987182617, + "logits/rejected": -1.2662792205810547, + "logps/chosen": -61.06288146972656, + "logps/rejected": -83.90250396728516, + "loss": 0.6194, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2342255115509033, + "rewards/margins": 1.172492265701294, + "rewards/rejected": 0.061733245849609375, + "step": 7997 + }, + { + "epoch": 1.3, + "learning_rate": 2.896442304847128e-06, + "logits/chosen": -1.2147700786590576, + "logits/rejected": -1.296886920928955, + "logps/chosen": -157.00318908691406, + "logps/rejected": -50.64482116699219, + "loss": 0.0692, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.4347734451293945, + "rewards/margins": 2.723318576812744, + "rewards/rejected": 2.7114548683166504, + "step": 7998 + }, + { + "epoch": 1.3, + "learning_rate": 2.8952500936128027e-06, + "logits/chosen": -1.3950340747833252, + "logits/rejected": -1.248069167137146, + "logps/chosen": -106.68270874023438, + "logps/rejected": -49.620304107666016, + "loss": 0.0342, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.121499538421631, + "rewards/margins": 2.9281938076019287, + "rewards/rejected": 2.193305730819702, + "step": 7999 + }, + { + "epoch": 1.3, + "learning_rate": 2.894058027796782e-06, + "logits/chosen": -1.3410062789916992, + "logits/rejected": -1.3192023038864136, + "logps/chosen": -56.240692138671875, + "logps/rejected": -71.56926727294922, + "loss": 0.4326, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.369333028793335, + "rewards/margins": 0.7097985744476318, + "rewards/rejected": 2.659534454345703, + "step": 8000 + }, + { + "epoch": 1.3, + "learning_rate": 2.892866107481426e-06, + "logits/chosen": -1.2470431327819824, + "logits/rejected": -1.270207166671753, + "logps/chosen": -86.85971069335938, + "logps/rejected": -105.42221069335938, + "loss": 1.362, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.191511631011963, + "rewards/margins": -2.536067008972168, + "rewards/rejected": 4.727578639984131, + "step": 8001 + }, + { + "epoch": 1.3, + "learning_rate": 2.89167433274908e-06, + "logits/chosen": -1.0202749967575073, + "logits/rejected": -1.0452319383621216, + "logps/chosen": -37.143699645996094, + "logps/rejected": -64.14839935302734, + "loss": 0.9971, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2475746870040894, + "rewards/margins": 1.0643097162246704, + "rewards/rejected": 0.18326492607593536, + "step": 8002 + }, + { + "epoch": 1.3, + "learning_rate": 2.8904827036820925e-06, + "logits/chosen": -1.51933753490448, + "logits/rejected": -1.5291953086853027, + "logps/chosen": -83.54257202148438, + "logps/rejected": -59.98664855957031, + "loss": 0.5074, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.349277019500732, + "rewards/margins": 0.38658761978149414, + "rewards/rejected": 4.962689399719238, + "step": 8003 + }, + { + "epoch": 1.3, + "learning_rate": 2.8892912203627875e-06, + "logits/chosen": -0.8836538195610046, + "logits/rejected": -0.8836538195610046, + "logps/chosen": -63.79486846923828, + "logps/rejected": -63.79486846923828, + "loss": 0.4375, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1576553583145142, + "rewards/margins": 0.0, + "rewards/rejected": 1.1576553583145142, + "step": 8004 + }, + { + "epoch": 1.3, + "learning_rate": 2.8880998828734885e-06, + "logits/chosen": -0.9501307010650635, + "logits/rejected": -0.9501307010650635, + "logps/chosen": -73.76217651367188, + "logps/rejected": -73.76217651367188, + "loss": 0.3486, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6795463562011719, + "rewards/margins": 0.0, + "rewards/rejected": 1.6795463562011719, + "step": 8005 + }, + { + "epoch": 1.3, + "learning_rate": 2.886908691296504e-06, + "logits/chosen": -1.3922560214996338, + "logits/rejected": -1.342218041419983, + "logps/chosen": -38.29396057128906, + "logps/rejected": -30.042701721191406, + "loss": 0.6267, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.057424306869507, + "rewards/margins": -0.6804771423339844, + "rewards/rejected": 2.737901449203491, + "step": 8006 + }, + { + "epoch": 1.3, + "learning_rate": 2.8857176457141357e-06, + "logits/chosen": -1.3742892742156982, + "logits/rejected": -1.3673146963119507, + "logps/chosen": -83.11772155761719, + "logps/rejected": -79.27323913574219, + "loss": 0.2513, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.653027296066284, + "rewards/margins": 0.5796737670898438, + "rewards/rejected": 2.0733535289764404, + "step": 8007 + }, + { + "epoch": 1.3, + "learning_rate": 2.8845267462086707e-06, + "logits/chosen": -1.327069878578186, + "logits/rejected": -1.2629187107086182, + "logps/chosen": -44.3190803527832, + "logps/rejected": -79.08274841308594, + "loss": 0.2917, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.4535017013549805, + "rewards/margins": 0.8723416328430176, + "rewards/rejected": 4.581160068511963, + "step": 8008 + }, + { + "epoch": 1.3, + "learning_rate": 2.8833359928623927e-06, + "logits/chosen": -1.1303670406341553, + "logits/rejected": -0.9600335955619812, + "logps/chosen": -110.69479370117188, + "logps/rejected": -47.92241668701172, + "loss": 0.5965, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.642297267913818, + "rewards/margins": 0.6917517185211182, + "rewards/rejected": 3.9505455493927, + "step": 8009 + }, + { + "epoch": 1.3, + "learning_rate": 2.8821453857575677e-06, + "logits/chosen": -1.3465977907180786, + "logits/rejected": -1.4006403684616089, + "logps/chosen": -39.26316833496094, + "logps/rejected": -115.900390625, + "loss": 4.1315, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.643328905105591, + "rewards/margins": -3.864428758621216, + "rewards/rejected": 6.507757663726807, + "step": 8010 + }, + { + "epoch": 1.3, + "learning_rate": 2.880954924976459e-06, + "logits/chosen": -1.4068597555160522, + "logits/rejected": -1.5261951684951782, + "logps/chosen": -82.63670349121094, + "logps/rejected": -137.0374755859375, + "loss": 2.0965, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4781434535980225, + "rewards/margins": -4.090640068054199, + "rewards/rejected": 7.568783760070801, + "step": 8011 + }, + { + "epoch": 1.3, + "learning_rate": 2.879764610601312e-06, + "logits/chosen": -1.3337388038635254, + "logits/rejected": -1.3150616884231567, + "logps/chosen": -74.58871459960938, + "logps/rejected": -89.9319839477539, + "loss": 0.4481, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.9908766746521, + "rewards/margins": -0.34979963302612305, + "rewards/rejected": 5.340676307678223, + "step": 8012 + }, + { + "epoch": 1.3, + "learning_rate": 2.878574442714371e-06, + "logits/chosen": -1.3010553121566772, + "logits/rejected": -1.4778856039047241, + "logps/chosen": -60.91366958618164, + "logps/rejected": -117.02256774902344, + "loss": 2.9232, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.409240245819092, + "rewards/margins": -5.6012091636657715, + "rewards/rejected": 10.010449409484863, + "step": 8013 + }, + { + "epoch": 1.3, + "learning_rate": 2.877384421397862e-06, + "logits/chosen": -1.2523545026779175, + "logits/rejected": -1.3270457983016968, + "logps/chosen": -54.84925079345703, + "logps/rejected": -63.68082046508789, + "loss": 1.636, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.008786678314209, + "rewards/margins": -1.3042449951171875, + "rewards/rejected": 5.3130316734313965, + "step": 8014 + }, + { + "epoch": 1.3, + "learning_rate": 2.8761945467340057e-06, + "logits/chosen": -0.9549131393432617, + "logits/rejected": -0.9423671364784241, + "logps/chosen": -67.9046630859375, + "logps/rejected": -73.33338165283203, + "loss": 0.2786, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.805139183998108, + "rewards/margins": 0.501508355140686, + "rewards/rejected": 1.3036308288574219, + "step": 8015 + }, + { + "epoch": 1.3, + "learning_rate": 2.875004818805009e-06, + "logits/chosen": -0.76374751329422, + "logits/rejected": -0.7644110321998596, + "logps/chosen": -3.1006600856781006, + "logps/rejected": -0.8105762600898743, + "loss": 0.7852, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.03376872465014458, + "rewards/margins": -0.16160376369953156, + "rewards/rejected": 0.19537249207496643, + "step": 8016 + }, + { + "epoch": 1.3, + "learning_rate": 2.8738152376930737e-06, + "logits/chosen": -1.1729071140289307, + "logits/rejected": -1.1810907125473022, + "logps/chosen": -60.023006439208984, + "logps/rejected": -61.39510726928711, + "loss": 0.5323, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.741477608680725, + "rewards/margins": -0.22725749015808105, + "rewards/rejected": 1.9687350988388062, + "step": 8017 + }, + { + "epoch": 1.3, + "learning_rate": 2.8726258034803866e-06, + "logits/chosen": -1.3129322528839111, + "logits/rejected": -1.1284598112106323, + "logps/chosen": -54.949317932128906, + "logps/rejected": -30.6300048828125, + "loss": 0.2698, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.756319522857666, + "rewards/margins": 2.7260141372680664, + "rewards/rejected": 0.03030548058450222, + "step": 8018 + }, + { + "epoch": 1.3, + "learning_rate": 2.8714365162491285e-06, + "logits/chosen": -1.4730018377304077, + "logits/rejected": -1.4457751512527466, + "logps/chosen": -176.14154052734375, + "logps/rejected": -45.31260681152344, + "loss": 0.0924, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.138543605804443, + "rewards/margins": 2.2881510257720947, + "rewards/rejected": 1.8503925800323486, + "step": 8019 + }, + { + "epoch": 1.3, + "learning_rate": 2.8702473760814642e-06, + "logits/chosen": -1.108797550201416, + "logits/rejected": -1.1080936193466187, + "logps/chosen": -14.079767227172852, + "logps/rejected": -21.985729217529297, + "loss": 0.3651, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8085350394248962, + "rewards/margins": 0.06285804510116577, + "rewards/rejected": 0.7456769943237305, + "step": 8020 + }, + { + "epoch": 1.3, + "learning_rate": 2.869058383059557e-06, + "logits/chosen": -1.7215920686721802, + "logits/rejected": -1.6101807355880737, + "logps/chosen": -36.40183639526367, + "logps/rejected": -21.827077865600586, + "loss": 0.2689, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.790693998336792, + "rewards/margins": 3.227637529373169, + "rewards/rejected": 0.5630564093589783, + "step": 8021 + }, + { + "epoch": 1.3, + "learning_rate": 2.8678695372655495e-06, + "logits/chosen": -1.1756913661956787, + "logits/rejected": -1.1915384531021118, + "logps/chosen": -40.442901611328125, + "logps/rejected": -48.955116271972656, + "loss": 0.5814, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9365265369415283, + "rewards/margins": -0.06322026252746582, + "rewards/rejected": 2.999746799468994, + "step": 8022 + }, + { + "epoch": 1.3, + "learning_rate": 2.8666808387815837e-06, + "logits/chosen": -1.5953669548034668, + "logits/rejected": -1.1236865520477295, + "logps/chosen": -49.397560119628906, + "logps/rejected": -71.85460662841797, + "loss": 0.3816, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.994189500808716, + "rewards/margins": 0.6723206043243408, + "rewards/rejected": 2.321868896484375, + "step": 8023 + }, + { + "epoch": 1.3, + "learning_rate": 2.8654922876897872e-06, + "logits/chosen": -1.370126724243164, + "logits/rejected": -1.415561556816101, + "logps/chosen": -43.95903778076172, + "logps/rejected": -70.57438659667969, + "loss": 0.3161, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7388558387756348, + "rewards/margins": 0.15426874160766602, + "rewards/rejected": 2.5845870971679688, + "step": 8024 + }, + { + "epoch": 1.3, + "learning_rate": 2.864303884072275e-06, + "logits/chosen": -1.358615517616272, + "logits/rejected": -1.4009289741516113, + "logps/chosen": -47.5059700012207, + "logps/rejected": -40.11857223510742, + "loss": 0.8394, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1930973529815674, + "rewards/margins": -0.5183807611465454, + "rewards/rejected": 1.7114781141281128, + "step": 8025 + }, + { + "epoch": 1.3, + "learning_rate": 2.863115628011158e-06, + "logits/chosen": -1.366447925567627, + "logits/rejected": -1.3807541131973267, + "logps/chosen": -89.27352905273438, + "logps/rejected": -62.090248107910156, + "loss": 0.5328, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.322035312652588, + "rewards/margins": -0.013486385345458984, + "rewards/rejected": 2.335521697998047, + "step": 8026 + }, + { + "epoch": 1.3, + "learning_rate": 2.861927519588531e-06, + "logits/chosen": -1.3887003660202026, + "logits/rejected": -1.4641547203063965, + "logps/chosen": -74.0079116821289, + "logps/rejected": -75.8130111694336, + "loss": 0.9553, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.080179691314697, + "rewards/margins": -1.7064971923828125, + "rewards/rejected": 5.78667688369751, + "step": 8027 + }, + { + "epoch": 1.3, + "learning_rate": 2.8607395588864828e-06, + "logits/chosen": -1.3797160387039185, + "logits/rejected": -1.0978938341140747, + "logps/chosen": -108.0829849243164, + "logps/rejected": -53.579341888427734, + "loss": 0.2498, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.941237688064575, + "rewards/margins": 0.5609087944030762, + "rewards/rejected": 2.380328893661499, + "step": 8028 + }, + { + "epoch": 1.3, + "learning_rate": 2.8595517459870868e-06, + "logits/chosen": -0.8423219323158264, + "logits/rejected": -0.8444586992263794, + "logps/chosen": -43.02751159667969, + "logps/rejected": -39.7166633605957, + "loss": 0.3627, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4854106903076172, + "rewards/margins": 0.33331215381622314, + "rewards/rejected": 1.152098536491394, + "step": 8029 + }, + { + "epoch": 1.3, + "learning_rate": 2.8583640809724143e-06, + "logits/chosen": -0.9061686992645264, + "logits/rejected": -0.9672074913978577, + "logps/chosen": -31.726648330688477, + "logps/rejected": -109.5736312866211, + "loss": 0.5762, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1445196866989136, + "rewards/margins": 0.1994246244430542, + "rewards/rejected": 0.9450950622558594, + "step": 8030 + }, + { + "epoch": 1.3, + "learning_rate": 2.8571765639245184e-06, + "logits/chosen": -1.149463176727295, + "logits/rejected": -1.3001919984817505, + "logps/chosen": -73.11061096191406, + "logps/rejected": -73.98965454101562, + "loss": 1.4137, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5101959705352783, + "rewards/margins": -2.7594025135040283, + "rewards/rejected": 5.269598484039307, + "step": 8031 + }, + { + "epoch": 1.3, + "learning_rate": 2.8559891949254477e-06, + "logits/chosen": -0.9461534023284912, + "logits/rejected": -0.9961075782775879, + "logps/chosen": -74.48038482666016, + "logps/rejected": -43.339324951171875, + "loss": 0.3781, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.043886661529541, + "rewards/margins": -0.12154221534729004, + "rewards/rejected": 2.165428876876831, + "step": 8032 + }, + { + "epoch": 1.3, + "learning_rate": 2.854801974057234e-06, + "logits/chosen": -0.8560875654220581, + "logits/rejected": -0.9035383462905884, + "logps/chosen": -42.744476318359375, + "logps/rejected": -81.68236541748047, + "loss": 0.3964, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.929567813873291, + "rewards/margins": 2.2555527687072754, + "rewards/rejected": 0.6740150451660156, + "step": 8033 + }, + { + "epoch": 1.3, + "learning_rate": 2.853614901401909e-06, + "logits/chosen": -1.3247480392456055, + "logits/rejected": -1.1580944061279297, + "logps/chosen": -51.361053466796875, + "logps/rejected": -14.586146354675293, + "loss": 0.2001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.216108798980713, + "rewards/margins": 2.7378737926483154, + "rewards/rejected": 0.4782349765300751, + "step": 8034 + }, + { + "epoch": 1.3, + "learning_rate": 2.852427977041483e-06, + "logits/chosen": -1.362898826599121, + "logits/rejected": -1.5136067867279053, + "logps/chosen": -51.729095458984375, + "logps/rejected": -116.75823211669922, + "loss": 0.4828, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.306966304779053, + "rewards/margins": -0.41477537155151367, + "rewards/rejected": 4.721741676330566, + "step": 8035 + }, + { + "epoch": 1.3, + "learning_rate": 2.8512412010579648e-06, + "logits/chosen": -1.3701101541519165, + "logits/rejected": -1.0763362646102905, + "logps/chosen": -169.33291625976562, + "logps/rejected": -20.155372619628906, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.558403015136719, + "rewards/margins": 6.004499912261963, + "rewards/rejected": 0.5539030432701111, + "step": 8036 + }, + { + "epoch": 1.3, + "learning_rate": 2.8500545735333456e-06, + "logits/chosen": -1.1657739877700806, + "logits/rejected": -1.169600009918213, + "logps/chosen": -7.28565788269043, + "logps/rejected": -2.068768262863159, + "loss": 0.4891, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1805473417043686, + "rewards/margins": -0.08985646069049835, + "rewards/rejected": 0.27040380239486694, + "step": 8037 + }, + { + "epoch": 1.3, + "learning_rate": 2.848868094549615e-06, + "logits/chosen": -1.3905504941940308, + "logits/rejected": -1.337651252746582, + "logps/chosen": -50.554359436035156, + "logps/rejected": -16.846899032592773, + "loss": 1.4265, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8611282110214233, + "rewards/margins": 1.7677288055419922, + "rewards/rejected": 0.09339942783117294, + "step": 8038 + }, + { + "epoch": 1.3, + "learning_rate": 2.847681764188742e-06, + "logits/chosen": -1.3000551462173462, + "logits/rejected": -1.1094318628311157, + "logps/chosen": -62.060462951660156, + "logps/rejected": -25.174732208251953, + "loss": 0.1915, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.390256643295288, + "rewards/margins": 1.302722692489624, + "rewards/rejected": 1.087533950805664, + "step": 8039 + }, + { + "epoch": 1.3, + "learning_rate": 2.846495582532697e-06, + "logits/chosen": -1.2153396606445312, + "logits/rejected": -1.2646987438201904, + "logps/chosen": -86.51271057128906, + "logps/rejected": -121.486328125, + "loss": 0.692, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9503676891326904, + "rewards/margins": -1.080636739730835, + "rewards/rejected": 4.031004428863525, + "step": 8040 + }, + { + "epoch": 1.31, + "learning_rate": 2.8453095496634274e-06, + "logits/chosen": -1.1699978113174438, + "logits/rejected": -1.17598295211792, + "logps/chosen": -111.37351989746094, + "logps/rejected": -118.30870056152344, + "loss": 0.6102, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2516191005706787, + "rewards/margins": 1.7027146816253662, + "rewards/rejected": 0.5489044189453125, + "step": 8041 + }, + { + "epoch": 1.31, + "learning_rate": 2.844123665662883e-06, + "logits/chosen": -0.874628484249115, + "logits/rejected": -0.8707414269447327, + "logps/chosen": -6.208600997924805, + "logps/rejected": -3.4320530891418457, + "loss": 0.6329, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.20953817665576935, + "rewards/margins": -0.42670220136642456, + "rewards/rejected": 0.6362403631210327, + "step": 8042 + }, + { + "epoch": 1.31, + "learning_rate": 2.842937930612991e-06, + "logits/chosen": -1.0203850269317627, + "logits/rejected": -1.0492452383041382, + "logps/chosen": -71.46839141845703, + "logps/rejected": -87.86466217041016, + "loss": 0.2524, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8799629211425781, + "rewards/margins": 0.439962774515152, + "rewards/rejected": 0.44000014662742615, + "step": 8043 + }, + { + "epoch": 1.31, + "learning_rate": 2.8417523445956806e-06, + "logits/chosen": -1.6616368293762207, + "logits/rejected": -1.6806676387786865, + "logps/chosen": -74.95372009277344, + "logps/rejected": -150.8463897705078, + "loss": 1.3808, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2361855506896973, + "rewards/margins": -2.4941139221191406, + "rewards/rejected": 4.730299472808838, + "step": 8044 + }, + { + "epoch": 1.31, + "learning_rate": 2.84056690769286e-06, + "logits/chosen": -1.0444470643997192, + "logits/rejected": -1.0068484544754028, + "logps/chosen": -26.095840454101562, + "logps/rejected": -61.31875228881836, + "loss": 0.9309, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.055600881576538, + "rewards/margins": -0.5349864959716797, + "rewards/rejected": 2.5905873775482178, + "step": 8045 + }, + { + "epoch": 1.31, + "learning_rate": 2.8393816199864345e-06, + "logits/chosen": -1.0104587078094482, + "logits/rejected": -1.0211148262023926, + "logps/chosen": -37.150569915771484, + "logps/rejected": -43.52354431152344, + "loss": 0.3349, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.081165075302124, + "rewards/margins": 0.10107588768005371, + "rewards/rejected": 1.9800891876220703, + "step": 8046 + }, + { + "epoch": 1.31, + "learning_rate": 2.8381964815582934e-06, + "logits/chosen": -1.3999823331832886, + "logits/rejected": -1.2854830026626587, + "logps/chosen": -48.44561767578125, + "logps/rejected": -30.98514175415039, + "loss": 0.6091, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7794471979141235, + "rewards/margins": -0.6341155767440796, + "rewards/rejected": 2.413562774658203, + "step": 8047 + }, + { + "epoch": 1.31, + "learning_rate": 2.8370114924903227e-06, + "logits/chosen": -1.3443464040756226, + "logits/rejected": -1.3896121978759766, + "logps/chosen": -56.65005874633789, + "logps/rejected": -140.34837341308594, + "loss": 0.6248, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.534217596054077, + "rewards/margins": -0.7377293109893799, + "rewards/rejected": 4.271946907043457, + "step": 8048 + }, + { + "epoch": 1.31, + "learning_rate": 2.8358266528643884e-06, + "logits/chosen": -1.64125657081604, + "logits/rejected": -1.5942045450210571, + "logps/chosen": -87.6591567993164, + "logps/rejected": -71.28810119628906, + "loss": 0.7589, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.428342342376709, + "rewards/margins": -0.6475167274475098, + "rewards/rejected": 6.075859069824219, + "step": 8049 + }, + { + "epoch": 1.31, + "learning_rate": 2.834641962762358e-06, + "logits/chosen": -1.3800207376480103, + "logits/rejected": -1.3010317087173462, + "logps/chosen": -88.51060485839844, + "logps/rejected": -72.34066009521484, + "loss": 0.1047, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.721237897872925, + "rewards/margins": 1.596042513847351, + "rewards/rejected": 1.1251953840255737, + "step": 8050 + }, + { + "epoch": 1.31, + "learning_rate": 2.8334574222660767e-06, + "logits/chosen": -1.3039072751998901, + "logits/rejected": -1.1718838214874268, + "logps/chosen": -78.42115783691406, + "logps/rejected": -51.295127868652344, + "loss": 1.6849, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.313892364501953, + "rewards/margins": 2.4427127838134766, + "rewards/rejected": 0.8711795806884766, + "step": 8051 + }, + { + "epoch": 1.31, + "learning_rate": 2.8322730314573894e-06, + "logits/chosen": -0.9989686012268066, + "logits/rejected": -0.9989686012268066, + "logps/chosen": -37.842010498046875, + "logps/rejected": -37.842010498046875, + "loss": 0.4986, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0809223651885986, + "rewards/margins": 0.0, + "rewards/rejected": 1.0809223651885986, + "step": 8052 + }, + { + "epoch": 1.31, + "learning_rate": 2.8310887904181222e-06, + "logits/chosen": -1.449385404586792, + "logits/rejected": -1.5242574214935303, + "logps/chosen": -82.28480529785156, + "logps/rejected": -127.62342071533203, + "loss": 0.4765, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7848923206329346, + "rewards/margins": -0.4323158264160156, + "rewards/rejected": 2.21720814704895, + "step": 8053 + }, + { + "epoch": 1.31, + "learning_rate": 2.8299046992300995e-06, + "logits/chosen": -0.9562086462974548, + "logits/rejected": -0.9494308233261108, + "logps/chosen": -9.466293334960938, + "logps/rejected": -7.8433637619018555, + "loss": 0.9188, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5813091397285461, + "rewards/margins": -0.10381209850311279, + "rewards/rejected": 0.6851212382316589, + "step": 8054 + }, + { + "epoch": 1.31, + "learning_rate": 2.8287207579751263e-06, + "logits/chosen": -0.9785857796669006, + "logits/rejected": -0.9301583766937256, + "logps/chosen": -52.63041305541992, + "logps/rejected": -80.92881774902344, + "loss": 0.0931, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.61322820186615, + "rewards/margins": 1.5961428880691528, + "rewards/rejected": 0.01708526723086834, + "step": 8055 + }, + { + "epoch": 1.31, + "learning_rate": 2.827536966735006e-06, + "logits/chosen": -1.294909119606018, + "logits/rejected": -1.294909119606018, + "logps/chosen": -56.067604064941406, + "logps/rejected": -56.067604064941406, + "loss": 0.3493, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.725292921066284, + "rewards/margins": 0.0, + "rewards/rejected": 3.725292921066284, + "step": 8056 + }, + { + "epoch": 1.31, + "learning_rate": 2.826353325591523e-06, + "logits/chosen": -1.0686386823654175, + "logits/rejected": -0.9720292091369629, + "logps/chosen": -57.30915451049805, + "logps/rejected": -85.24421691894531, + "loss": 2.1563, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.012880325317383, + "rewards/margins": -3.185058116912842, + "rewards/rejected": 5.197938442230225, + "step": 8057 + }, + { + "epoch": 1.31, + "learning_rate": 2.82516983462646e-06, + "logits/chosen": -1.325774908065796, + "logits/rejected": -1.3455325365066528, + "logps/chosen": -96.40425109863281, + "logps/rejected": -96.112060546875, + "loss": 1.4438, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.432293653488159, + "rewards/margins": -2.6658356189727783, + "rewards/rejected": 5.0981292724609375, + "step": 8058 + }, + { + "epoch": 1.31, + "learning_rate": 2.823986493921581e-06, + "logits/chosen": -1.2115981578826904, + "logits/rejected": -1.1902683973312378, + "logps/chosen": -33.28099060058594, + "logps/rejected": -27.076656341552734, + "loss": 0.6337, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.229630708694458, + "rewards/margins": -0.1479783058166504, + "rewards/rejected": 2.3776090145111084, + "step": 8059 + }, + { + "epoch": 1.31, + "learning_rate": 2.822803303558646e-06, + "logits/chosen": -1.4285475015640259, + "logits/rejected": -1.4469594955444336, + "logps/chosen": -77.729248046875, + "logps/rejected": -50.461299896240234, + "loss": 0.2175, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9928009510040283, + "rewards/margins": 1.1641299724578857, + "rewards/rejected": 2.8286709785461426, + "step": 8060 + }, + { + "epoch": 1.31, + "learning_rate": 2.8216202636194036e-06, + "logits/chosen": -1.3253319263458252, + "logits/rejected": -1.3339189291000366, + "logps/chosen": -59.414649963378906, + "logps/rejected": -136.35577392578125, + "loss": 1.8777, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6022125482559204, + "rewards/margins": -1.152368187904358, + "rewards/rejected": 2.7545807361602783, + "step": 8061 + }, + { + "epoch": 1.31, + "learning_rate": 2.8204373741855874e-06, + "logits/chosen": -1.2248427867889404, + "logits/rejected": -1.4343820810317993, + "logps/chosen": -119.29402160644531, + "logps/rejected": -33.78567886352539, + "loss": 0.044, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.489372253417969, + "rewards/margins": 4.184353828430176, + "rewards/rejected": 0.3050186336040497, + "step": 8062 + }, + { + "epoch": 1.31, + "learning_rate": 2.819254635338927e-06, + "logits/chosen": -0.9015527963638306, + "logits/rejected": -0.8982248306274414, + "logps/chosen": -61.684913635253906, + "logps/rejected": -46.10698699951172, + "loss": 0.7262, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7426460981369019, + "rewards/margins": -0.1053779125213623, + "rewards/rejected": 1.8480240106582642, + "step": 8063 + }, + { + "epoch": 1.31, + "learning_rate": 2.8180720471611346e-06, + "logits/chosen": -1.3009536266326904, + "logits/rejected": -1.3493937253952026, + "logps/chosen": -77.37848663330078, + "logps/rejected": -74.19068908691406, + "loss": 0.6954, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.252532482147217, + "rewards/margins": 1.5671653747558594, + "rewards/rejected": 4.685367107391357, + "step": 8064 + }, + { + "epoch": 1.31, + "learning_rate": 2.8168896097339206e-06, + "logits/chosen": -1.1496254205703735, + "logits/rejected": -1.1156085729599, + "logps/chosen": -54.56019973754883, + "logps/rejected": -61.35221862792969, + "loss": 0.7089, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.588462471961975, + "rewards/margins": -1.0655971765518188, + "rewards/rejected": 2.654059648513794, + "step": 8065 + }, + { + "epoch": 1.31, + "learning_rate": 2.8157073231389752e-06, + "logits/chosen": -0.9288349151611328, + "logits/rejected": -0.9288349151611328, + "logps/chosen": -15.385824203491211, + "logps/rejected": -15.385824203491211, + "loss": 0.7519, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.53096604347229, + "rewards/margins": 0.0, + "rewards/rejected": 1.53096604347229, + "step": 8066 + }, + { + "epoch": 1.31, + "learning_rate": 2.814525187457989e-06, + "logits/chosen": -1.164961576461792, + "logits/rejected": -0.9897142052650452, + "logps/chosen": -131.42103576660156, + "logps/rejected": -16.90764808654785, + "loss": 0.3156, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.978773593902588, + "rewards/margins": 2.4566736221313477, + "rewards/rejected": 0.5220999121665955, + "step": 8067 + }, + { + "epoch": 1.31, + "learning_rate": 2.8133432027726305e-06, + "logits/chosen": -1.0284457206726074, + "logits/rejected": -0.8660066723823547, + "logps/chosen": -56.743316650390625, + "logps/rejected": -33.611331939697266, + "loss": 0.0213, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.238089084625244, + "rewards/margins": 3.16518497467041, + "rewards/rejected": -0.9270960092544556, + "step": 8068 + }, + { + "epoch": 1.31, + "learning_rate": 2.812161369164568e-06, + "logits/chosen": -1.3202317953109741, + "logits/rejected": -1.340828537940979, + "logps/chosen": -51.0026741027832, + "logps/rejected": -60.1148796081543, + "loss": 0.8815, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.567174196243286, + "rewards/margins": -1.2961623668670654, + "rewards/rejected": 4.863336563110352, + "step": 8069 + }, + { + "epoch": 1.31, + "learning_rate": 2.8109796867154515e-06, + "logits/chosen": -1.1636382341384888, + "logits/rejected": -1.1667404174804688, + "logps/chosen": -28.28139305114746, + "logps/rejected": -19.59764862060547, + "loss": 0.3155, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5623636245727539, + "rewards/margins": 0.1303746998310089, + "rewards/rejected": 0.431988924741745, + "step": 8070 + }, + { + "epoch": 1.31, + "learning_rate": 2.809798155506929e-06, + "logits/chosen": -1.5059468746185303, + "logits/rejected": -1.4371577501296997, + "logps/chosen": -151.91212463378906, + "logps/rejected": -29.50225067138672, + "loss": 0.1874, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3314056396484375, + "rewards/margins": 2.184657573699951, + "rewards/rejected": 0.14674797654151917, + "step": 8071 + }, + { + "epoch": 1.31, + "learning_rate": 2.8086167756206266e-06, + "logits/chosen": -1.5471704006195068, + "logits/rejected": -1.402544379234314, + "logps/chosen": -101.14619445800781, + "logps/rejected": -19.60715675354004, + "loss": 0.5227, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2368226051330566, + "rewards/margins": 3.3152084350585938, + "rewards/rejected": -0.07838573306798935, + "step": 8072 + }, + { + "epoch": 1.31, + "learning_rate": 2.8074355471381726e-06, + "logits/chosen": -1.3787392377853394, + "logits/rejected": -1.2889282703399658, + "logps/chosen": -80.462158203125, + "logps/rejected": -45.17176055908203, + "loss": 0.0448, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.452369689941406, + "rewards/margins": 3.461440324783325, + "rewards/rejected": 2.990929365158081, + "step": 8073 + }, + { + "epoch": 1.31, + "learning_rate": 2.806254470141174e-06, + "logits/chosen": -1.0559688806533813, + "logits/rejected": -1.096605658531189, + "logps/chosen": -89.20167541503906, + "logps/rejected": -126.21511840820312, + "loss": 0.0577, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.166851043701172, + "rewards/margins": 2.3026726245880127, + "rewards/rejected": 0.864178478717804, + "step": 8074 + }, + { + "epoch": 1.31, + "learning_rate": 2.8050735447112364e-06, + "logits/chosen": -1.2501333951950073, + "logits/rejected": -1.2373429536819458, + "logps/chosen": -60.69178771972656, + "logps/rejected": -34.12314224243164, + "loss": 2.2275, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8207123279571533, + "rewards/margins": -0.5425934791564941, + "rewards/rejected": 2.3633058071136475, + "step": 8075 + }, + { + "epoch": 1.31, + "learning_rate": 2.803892770929946e-06, + "logits/chosen": -1.298844814300537, + "logits/rejected": -1.2244614362716675, + "logps/chosen": -53.99370574951172, + "logps/rejected": -22.70244598388672, + "loss": 0.6862, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6161186695098877, + "rewards/margins": -0.39380836486816406, + "rewards/rejected": 3.0099270343780518, + "step": 8076 + }, + { + "epoch": 1.31, + "learning_rate": 2.8027121488788868e-06, + "logits/chosen": -1.3194907903671265, + "logits/rejected": -1.2679888010025024, + "logps/chosen": -38.858055114746094, + "logps/rejected": -33.770912170410156, + "loss": 0.3913, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.726832628250122, + "rewards/margins": 0.4014599323272705, + "rewards/rejected": 2.3253726959228516, + "step": 8077 + }, + { + "epoch": 1.31, + "learning_rate": 2.8015316786396265e-06, + "logits/chosen": -1.3428864479064941, + "logits/rejected": -1.3177363872528076, + "logps/chosen": -64.17774963378906, + "logps/rejected": -38.01250457763672, + "loss": 0.917, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9764069318771362, + "rewards/margins": -0.8496314287185669, + "rewards/rejected": 2.826038360595703, + "step": 8078 + }, + { + "epoch": 1.31, + "learning_rate": 2.800351360293726e-06, + "logits/chosen": -1.5116171836853027, + "logits/rejected": -1.2952682971954346, + "logps/chosen": -82.37013244628906, + "logps/rejected": -11.100637435913086, + "loss": 0.0579, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.007972717285156, + "rewards/margins": 2.515009880065918, + "rewards/rejected": 1.4929628372192383, + "step": 8079 + }, + { + "epoch": 1.31, + "learning_rate": 2.7991711939227317e-06, + "logits/chosen": -1.1722228527069092, + "logits/rejected": -1.1165306568145752, + "logps/chosen": -87.0289306640625, + "logps/rejected": -78.15745544433594, + "loss": 0.4799, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.270097494125366, + "rewards/margins": 1.1108391284942627, + "rewards/rejected": 2.1592583656311035, + "step": 8080 + }, + { + "epoch": 1.31, + "learning_rate": 2.797991179608185e-06, + "logits/chosen": -1.0887483358383179, + "logits/rejected": -1.1194261312484741, + "logps/chosen": -90.91860961914062, + "logps/rejected": -70.32601928710938, + "loss": 1.2486, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8008469343185425, + "rewards/margins": -1.624179720878601, + "rewards/rejected": 3.4250266551971436, + "step": 8081 + }, + { + "epoch": 1.31, + "learning_rate": 2.7968113174316102e-06, + "logits/chosen": -1.972027063369751, + "logits/rejected": -1.9501279592514038, + "logps/chosen": -25.00454330444336, + "logps/rejected": -21.226078033447266, + "loss": 0.9569, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6870037317276, + "rewards/margins": 0.6844911575317383, + "rewards/rejected": 1.0025125741958618, + "step": 8082 + }, + { + "epoch": 1.31, + "learning_rate": 2.7956316074745294e-06, + "logits/chosen": -1.2913544178009033, + "logits/rejected": -1.2358686923980713, + "logps/chosen": -55.03557586669922, + "logps/rejected": -64.22407531738281, + "loss": 0.1992, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.013620853424072, + "rewards/margins": 0.7425956726074219, + "rewards/rejected": 3.2710251808166504, + "step": 8083 + }, + { + "epoch": 1.31, + "learning_rate": 2.794452049818444e-06, + "logits/chosen": -1.3846979141235352, + "logits/rejected": -1.2760460376739502, + "logps/chosen": -110.10224914550781, + "logps/rejected": -99.56097412109375, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.821189880371094, + "rewards/margins": 4.719782829284668, + "rewards/rejected": 3.1014068126678467, + "step": 8084 + }, + { + "epoch": 1.31, + "learning_rate": 2.793272644544855e-06, + "logits/chosen": -1.1952341794967651, + "logits/rejected": -1.1952341794967651, + "logps/chosen": -67.17364501953125, + "logps/rejected": -67.17364501953125, + "loss": 0.3529, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.335975170135498, + "rewards/margins": 0.0, + "rewards/rejected": 4.335975170135498, + "step": 8085 + }, + { + "epoch": 1.31, + "learning_rate": 2.7920933917352443e-06, + "logits/chosen": -1.2476168870925903, + "logits/rejected": -1.2604581117630005, + "logps/chosen": -87.61538696289062, + "logps/rejected": -57.86940002441406, + "loss": 0.2982, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8706451654434204, + "rewards/margins": 0.5157425403594971, + "rewards/rejected": 1.3549026250839233, + "step": 8086 + }, + { + "epoch": 1.31, + "learning_rate": 2.79091429147109e-06, + "logits/chosen": -1.2921589612960815, + "logits/rejected": -1.2693629264831543, + "logps/chosen": -73.35836029052734, + "logps/rejected": -85.91612243652344, + "loss": 0.1603, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.727553606033325, + "rewards/margins": 2.082611083984375, + "rewards/rejected": 0.6449424624443054, + "step": 8087 + }, + { + "epoch": 1.31, + "learning_rate": 2.789735343833854e-06, + "logits/chosen": -1.0644168853759766, + "logits/rejected": -1.0667524337768555, + "logps/chosen": -88.88972473144531, + "logps/rejected": -98.86483764648438, + "loss": 0.9343, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4507241249084473, + "rewards/margins": 1.5877511501312256, + "rewards/rejected": 0.8629730343818665, + "step": 8088 + }, + { + "epoch": 1.31, + "learning_rate": 2.7885565489049948e-06, + "logits/chosen": -1.363610029220581, + "logits/rejected": -1.4328246116638184, + "logps/chosen": -55.942604064941406, + "logps/rejected": -110.69657897949219, + "loss": 2.019, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3158347606658936, + "rewards/margins": -2.734508752822876, + "rewards/rejected": 6.0503435134887695, + "step": 8089 + }, + { + "epoch": 1.31, + "learning_rate": 2.78737790676595e-06, + "logits/chosen": -1.3203641176223755, + "logits/rejected": -1.3203641176223755, + "logps/chosen": -68.8667221069336, + "logps/rejected": -68.8667221069336, + "loss": 0.3782, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.480655670166016, + "rewards/margins": 0.0, + "rewards/rejected": 4.480655670166016, + "step": 8090 + }, + { + "epoch": 1.31, + "learning_rate": 2.7861994174981587e-06, + "logits/chosen": -1.374352216720581, + "logits/rejected": -1.479109287261963, + "logps/chosen": -106.99301147460938, + "logps/rejected": -154.3737030029297, + "loss": 1.1807, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.674429416656494, + "rewards/margins": -2.2536635398864746, + "rewards/rejected": 5.928092956542969, + "step": 8091 + }, + { + "epoch": 1.31, + "learning_rate": 2.785021081183038e-06, + "logits/chosen": -1.2287698984146118, + "logits/rejected": -1.0487263202667236, + "logps/chosen": -70.20889282226562, + "logps/rejected": -5.438445091247559, + "loss": 1.1128, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8337960243225098, + "rewards/margins": 2.1802003383636475, + "rewards/rejected": 0.6535957455635071, + "step": 8092 + }, + { + "epoch": 1.31, + "learning_rate": 2.7838428979020045e-06, + "logits/chosen": -0.8857554793357849, + "logits/rejected": -0.9765334129333496, + "logps/chosen": -0.479680597782135, + "logps/rejected": -61.71275329589844, + "loss": 1.788, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.15729986131191254, + "rewards/margins": -0.562235951423645, + "rewards/rejected": 0.7195358276367188, + "step": 8093 + }, + { + "epoch": 1.31, + "learning_rate": 2.7826648677364555e-06, + "logits/chosen": -1.203360915184021, + "logits/rejected": -1.203360915184021, + "logps/chosen": -30.751230239868164, + "logps/rejected": -30.751230239868164, + "loss": 0.4565, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.367234945297241, + "rewards/margins": 0.0, + "rewards/rejected": 2.367234945297241, + "step": 8094 + }, + { + "epoch": 1.31, + "learning_rate": 2.7814869907677833e-06, + "logits/chosen": -1.5700517892837524, + "logits/rejected": -1.4259732961654663, + "logps/chosen": -124.36911010742188, + "logps/rejected": -28.917438507080078, + "loss": 0.0261, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.289233684539795, + "rewards/margins": 3.190073251724243, + "rewards/rejected": 3.0991604328155518, + "step": 8095 + }, + { + "epoch": 1.31, + "learning_rate": 2.7803092670773702e-06, + "logits/chosen": -1.3631396293640137, + "logits/rejected": -1.3774068355560303, + "logps/chosen": -71.22573852539062, + "logps/rejected": -83.90144348144531, + "loss": 0.5938, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0493125915527344, + "rewards/margins": 0.30895310640335083, + "rewards/rejected": 0.7403594851493835, + "step": 8096 + }, + { + "epoch": 1.31, + "learning_rate": 2.7791316967465816e-06, + "logits/chosen": -1.073479175567627, + "logits/rejected": -1.073479175567627, + "logps/chosen": -2.115720272064209, + "logps/rejected": -2.115720272064209, + "loss": 0.708, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.28579264879226685, + "rewards/margins": 0.0, + "rewards/rejected": 0.28579264879226685, + "step": 8097 + }, + { + "epoch": 1.31, + "learning_rate": 2.7779542798567804e-06, + "logits/chosen": -1.3027147054672241, + "logits/rejected": -1.2432479858398438, + "logps/chosen": -70.24762725830078, + "logps/rejected": -60.29499053955078, + "loss": 0.2836, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.2443671226501465, + "rewards/margins": 0.29636669158935547, + "rewards/rejected": 3.948000431060791, + "step": 8098 + }, + { + "epoch": 1.31, + "learning_rate": 2.776777016489312e-06, + "logits/chosen": -1.370912790298462, + "logits/rejected": -1.2173057794570923, + "logps/chosen": -91.9005355834961, + "logps/rejected": -96.0918197631836, + "loss": 3.5301, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.839969635009766, + "rewards/margins": 1.2018475532531738, + "rewards/rejected": 4.638122081756592, + "step": 8099 + }, + { + "epoch": 1.31, + "learning_rate": 2.775599906725517e-06, + "logits/chosen": -0.9906097054481506, + "logits/rejected": -0.9794793725013733, + "logps/chosen": -16.666017532348633, + "logps/rejected": -1.9778496026992798, + "loss": 0.594, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5455949902534485, + "rewards/margins": 0.19143149256706238, + "rewards/rejected": 0.3541634976863861, + "step": 8100 + }, + { + "epoch": 1.31, + "learning_rate": 2.7744229506467195e-06, + "logits/chosen": -1.181665301322937, + "logits/rejected": -1.1259735822677612, + "logps/chosen": -56.073036193847656, + "logps/rejected": -9.795448303222656, + "loss": 0.3301, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1114234924316406, + "rewards/margins": 1.8639812469482422, + "rewards/rejected": 1.2474422454833984, + "step": 8101 + }, + { + "epoch": 1.32, + "learning_rate": 2.7732461483342393e-06, + "logits/chosen": -0.6496152877807617, + "logits/rejected": -0.6496152877807617, + "logps/chosen": -13.58236312866211, + "logps/rejected": -13.58236312866211, + "loss": 0.6314, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7215394973754883, + "rewards/margins": 0.0, + "rewards/rejected": 0.7215394973754883, + "step": 8102 + }, + { + "epoch": 1.32, + "learning_rate": 2.7720694998693786e-06, + "logits/chosen": -1.298954725265503, + "logits/rejected": -1.4040101766586304, + "logps/chosen": -84.06248474121094, + "logps/rejected": -61.55091094970703, + "loss": 0.0402, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.046861171722412, + "rewards/margins": 2.4948339462280273, + "rewards/rejected": 3.5520272254943848, + "step": 8103 + }, + { + "epoch": 1.32, + "learning_rate": 2.770893005333436e-06, + "logits/chosen": -1.019318699836731, + "logits/rejected": -1.0460196733474731, + "logps/chosen": -80.88427734375, + "logps/rejected": -74.25959777832031, + "loss": 0.8785, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.418452501296997, + "rewards/margins": -0.5480804443359375, + "rewards/rejected": 1.9665329456329346, + "step": 8104 + }, + { + "epoch": 1.32, + "learning_rate": 2.7697166648076936e-06, + "logits/chosen": -1.4682761430740356, + "logits/rejected": -1.4240936040878296, + "logps/chosen": -130.12432861328125, + "logps/rejected": -57.232391357421875, + "loss": 0.4871, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.268052816390991, + "rewards/margins": 2.865860939025879, + "rewards/rejected": 0.4021919369697571, + "step": 8105 + }, + { + "epoch": 1.32, + "learning_rate": 2.7685404783734275e-06, + "logits/chosen": -1.0526567697525024, + "logits/rejected": -1.0688154697418213, + "logps/chosen": -57.19411849975586, + "logps/rejected": -42.607059478759766, + "loss": 0.3047, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.110631227493286, + "rewards/margins": 0.9352821111679077, + "rewards/rejected": 1.1753491163253784, + "step": 8106 + }, + { + "epoch": 1.32, + "learning_rate": 2.767364446111901e-06, + "logits/chosen": -1.4942100048065186, + "logits/rejected": -1.2895504236221313, + "logps/chosen": -117.61638641357422, + "logps/rejected": -59.80255889892578, + "loss": 0.0502, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.787784576416016, + "rewards/margins": 2.866952419281006, + "rewards/rejected": 2.9208321571350098, + "step": 8107 + }, + { + "epoch": 1.32, + "learning_rate": 2.7661885681043654e-06, + "logits/chosen": -1.3261979818344116, + "logits/rejected": -1.3884623050689697, + "logps/chosen": -97.49275970458984, + "logps/rejected": -72.19480895996094, + "loss": 0.987, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4219002723693848, + "rewards/margins": -1.7405509948730469, + "rewards/rejected": 4.162451267242432, + "step": 8108 + }, + { + "epoch": 1.32, + "learning_rate": 2.765012844432061e-06, + "logits/chosen": -0.5494131445884705, + "logits/rejected": -0.5494131445884705, + "logps/chosen": -35.03011703491211, + "logps/rejected": -35.03011703491211, + "loss": 0.8886, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8975459933280945, + "rewards/margins": 0.0, + "rewards/rejected": 0.8975459933280945, + "step": 8109 + }, + { + "epoch": 1.32, + "learning_rate": 2.763837275176224e-06, + "logits/chosen": -1.4492958784103394, + "logits/rejected": -1.354045033454895, + "logps/chosen": -72.53530883789062, + "logps/rejected": -93.74304962158203, + "loss": 0.0571, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.151854038238525, + "rewards/margins": 2.7472426891326904, + "rewards/rejected": 3.404611349105835, + "step": 8110 + }, + { + "epoch": 1.32, + "learning_rate": 2.76266186041807e-06, + "logits/chosen": -1.2935012578964233, + "logits/rejected": -1.1332335472106934, + "logps/chosen": -145.33294677734375, + "logps/rejected": -63.06904602050781, + "loss": 0.1366, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.769481182098389, + "rewards/margins": 1.544421672821045, + "rewards/rejected": 5.225059509277344, + "step": 8111 + }, + { + "epoch": 1.32, + "learning_rate": 2.761486600238814e-06, + "logits/chosen": -1.777692437171936, + "logits/rejected": -1.7654139995574951, + "logps/chosen": -25.040035247802734, + "logps/rejected": -86.3615951538086, + "loss": 0.4203, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.481832504272461, + "rewards/margins": -0.24345755577087402, + "rewards/rejected": 2.725290060043335, + "step": 8112 + }, + { + "epoch": 1.32, + "learning_rate": 2.7603114947196507e-06, + "logits/chosen": -1.542216181755066, + "logits/rejected": -1.3798203468322754, + "logps/chosen": -92.99946594238281, + "logps/rejected": -21.798423767089844, + "loss": 0.146, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5487382411956787, + "rewards/margins": 1.5228421688079834, + "rewards/rejected": 2.0258960723876953, + "step": 8113 + }, + { + "epoch": 1.32, + "learning_rate": 2.759136543941773e-06, + "logits/chosen": -1.4292243719100952, + "logits/rejected": -1.436732292175293, + "logps/chosen": -140.24122619628906, + "logps/rejected": -84.96653747558594, + "loss": 0.2587, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.507826328277588, + "rewards/margins": 0.7849702835083008, + "rewards/rejected": 4.722856044769287, + "step": 8114 + }, + { + "epoch": 1.32, + "learning_rate": 2.757961747986355e-06, + "logits/chosen": -1.383015751838684, + "logits/rejected": -1.0000169277191162, + "logps/chosen": -181.54733276367188, + "logps/rejected": -13.493551254272461, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.147326469421387, + "rewards/margins": 7.711408615112305, + "rewards/rejected": 0.43591785430908203, + "step": 8115 + }, + { + "epoch": 1.32, + "learning_rate": 2.7567871069345654e-06, + "logits/chosen": -1.3369406461715698, + "logits/rejected": -1.3369406461715698, + "logps/chosen": -45.5792350769043, + "logps/rejected": -45.5792350769043, + "loss": 1.5786, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.534017562866211, + "rewards/margins": 0.0, + "rewards/rejected": 3.534017562866211, + "step": 8116 + }, + { + "epoch": 1.32, + "learning_rate": 2.7556126208675637e-06, + "logits/chosen": -1.644731044769287, + "logits/rejected": -1.585086703300476, + "logps/chosen": -97.27906799316406, + "logps/rejected": -23.075721740722656, + "loss": 1.6531, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6887832880020142, + "rewards/margins": 0.1282813549041748, + "rewards/rejected": 1.5605019330978394, + "step": 8117 + }, + { + "epoch": 1.32, + "learning_rate": 2.754438289866491e-06, + "logits/chosen": -1.4343820810317993, + "logits/rejected": -1.3709455728530884, + "logps/chosen": -87.9265365600586, + "logps/rejected": -111.81156921386719, + "loss": 0.1102, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.7367472648620605, + "rewards/margins": 3.813347816467285, + "rewards/rejected": 1.9233993291854858, + "step": 8118 + }, + { + "epoch": 1.32, + "learning_rate": 2.753264114012487e-06, + "logits/chosen": -1.45057213306427, + "logits/rejected": -1.431308627128601, + "logps/chosen": -144.758544921875, + "logps/rejected": -175.5499267578125, + "loss": 1.1052, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.21734619140625, + "rewards/margins": 0.40268242359161377, + "rewards/rejected": 1.8146637678146362, + "step": 8119 + }, + { + "epoch": 1.32, + "learning_rate": 2.7520900933866725e-06, + "logits/chosen": -1.4907892942428589, + "logits/rejected": -1.5029549598693848, + "logps/chosen": -97.905029296875, + "logps/rejected": -108.98172760009766, + "loss": 0.2921, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.163743734359741, + "rewards/margins": 0.23135459423065186, + "rewards/rejected": 1.9323891401290894, + "step": 8120 + }, + { + "epoch": 1.32, + "learning_rate": 2.7509162280701633e-06, + "logits/chosen": -1.6761653423309326, + "logits/rejected": -1.6124788522720337, + "logps/chosen": -80.9135513305664, + "logps/rejected": -68.89674377441406, + "loss": 0.5696, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9686760902404785, + "rewards/margins": 1.5003182888031006, + "rewards/rejected": 2.468357801437378, + "step": 8121 + }, + { + "epoch": 1.32, + "learning_rate": 2.749742518144061e-06, + "logits/chosen": -1.4053888320922852, + "logits/rejected": -1.4120813608169556, + "logps/chosen": -158.74737548828125, + "logps/rejected": -127.609130859375, + "loss": 0.5193, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.002798557281494, + "rewards/margins": -0.5932693481445312, + "rewards/rejected": 2.5960679054260254, + "step": 8122 + }, + { + "epoch": 1.32, + "learning_rate": 2.74856896368946e-06, + "logits/chosen": -1.5749839544296265, + "logits/rejected": -1.5749839544296265, + "logps/chosen": -41.07457733154297, + "logps/rejected": -41.07457733154297, + "loss": 0.8297, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7189754843711853, + "rewards/margins": 0.0, + "rewards/rejected": 0.7189754843711853, + "step": 8123 + }, + { + "epoch": 1.32, + "learning_rate": 2.747395564787438e-06, + "logits/chosen": -1.014504313468933, + "logits/rejected": -0.8789948225021362, + "logps/chosen": -59.5491943359375, + "logps/rejected": -9.65066146850586, + "loss": 0.0688, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.300373077392578, + "rewards/margins": 2.291257381439209, + "rewards/rejected": 1.0091155767440796, + "step": 8124 + }, + { + "epoch": 1.32, + "learning_rate": 2.7462223215190703e-06, + "logits/chosen": -1.198984980583191, + "logits/rejected": -1.0879782438278198, + "logps/chosen": -82.51643371582031, + "logps/rejected": -58.650840759277344, + "loss": 0.3097, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.114457845687866, + "rewards/margins": 0.25740909576416016, + "rewards/rejected": 2.857048749923706, + "step": 8125 + }, + { + "epoch": 1.32, + "learning_rate": 2.7450492339654127e-06, + "logits/chosen": -0.6864829063415527, + "logits/rejected": -0.692853569984436, + "logps/chosen": -4.421006202697754, + "logps/rejected": -1.0500061511993408, + "loss": 0.5244, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3165931701660156, + "rewards/margins": 0.053808510303497314, + "rewards/rejected": 0.2627846598625183, + "step": 8126 + }, + { + "epoch": 1.32, + "learning_rate": 2.7438763022075187e-06, + "logits/chosen": -1.3445028066635132, + "logits/rejected": -1.2241058349609375, + "logps/chosen": -72.26150512695312, + "logps/rejected": -22.975170135498047, + "loss": 0.3616, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.890533447265625, + "rewards/margins": 0.7852871417999268, + "rewards/rejected": 2.1052463054656982, + "step": 8127 + }, + { + "epoch": 1.32, + "learning_rate": 2.7427035263264222e-06, + "logits/chosen": -1.2370373010635376, + "logits/rejected": -1.225740671157837, + "logps/chosen": -103.61141967773438, + "logps/rejected": -68.97659301757812, + "loss": 0.3643, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.64572286605835, + "rewards/margins": 0.12833595275878906, + "rewards/rejected": 4.5173869132995605, + "step": 8128 + }, + { + "epoch": 1.32, + "learning_rate": 2.741530906403156e-06, + "logits/chosen": -1.3067749738693237, + "logits/rejected": -1.3067749738693237, + "logps/chosen": -26.754764556884766, + "logps/rejected": -26.754764556884766, + "loss": 0.9903, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9282498359680176, + "rewards/margins": 0.0, + "rewards/rejected": 2.9282498359680176, + "step": 8129 + }, + { + "epoch": 1.32, + "learning_rate": 2.7403584425187326e-06, + "logits/chosen": -1.366697072982788, + "logits/rejected": -1.2182544469833374, + "logps/chosen": -102.2010269165039, + "logps/rejected": -52.059104919433594, + "loss": 0.4831, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.133982181549072, + "rewards/margins": 4.069744110107422, + "rewards/rejected": 3.0642380714416504, + "step": 8130 + }, + { + "epoch": 1.32, + "learning_rate": 2.7391861347541603e-06, + "logits/chosen": -1.1078159809112549, + "logits/rejected": -0.9346082210540771, + "logps/chosen": -90.40296936035156, + "logps/rejected": -37.71478271484375, + "loss": 0.2065, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.686187744140625, + "rewards/margins": 1.229982614517212, + "rewards/rejected": 3.456205129623413, + "step": 8131 + }, + { + "epoch": 1.32, + "learning_rate": 2.7380139831904336e-06, + "logits/chosen": -1.2472259998321533, + "logits/rejected": -1.34163498878479, + "logps/chosen": -91.00801086425781, + "logps/rejected": -127.26941680908203, + "loss": 1.0981, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8867340087890625, + "rewards/margins": 0.4165778160095215, + "rewards/rejected": 3.470156192779541, + "step": 8132 + }, + { + "epoch": 1.32, + "learning_rate": 2.7368419879085383e-06, + "logits/chosen": -1.4718544483184814, + "logits/rejected": -1.5320943593978882, + "logps/chosen": -61.98504638671875, + "logps/rejected": -70.71003723144531, + "loss": 1.7595, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1207473278045654, + "rewards/margins": 0.02165055274963379, + "rewards/rejected": 3.0990967750549316, + "step": 8133 + }, + { + "epoch": 1.32, + "learning_rate": 2.7356701489894468e-06, + "logits/chosen": -1.1346808671951294, + "logits/rejected": -1.1327683925628662, + "logps/chosen": -76.84315490722656, + "logps/rejected": -49.587005615234375, + "loss": 0.8068, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4035041332244873, + "rewards/margins": -0.2460160255432129, + "rewards/rejected": 2.6495201587677, + "step": 8134 + }, + { + "epoch": 1.32, + "learning_rate": 2.734498466514124e-06, + "logits/chosen": -1.0886198282241821, + "logits/rejected": -1.0886198282241821, + "logps/chosen": -16.354522705078125, + "logps/rejected": -16.354522705078125, + "loss": 0.6889, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8670501708984375, + "rewards/margins": 0.0, + "rewards/rejected": 1.8670501708984375, + "step": 8135 + }, + { + "epoch": 1.32, + "learning_rate": 2.7333269405635188e-06, + "logits/chosen": -1.559893012046814, + "logits/rejected": -1.6233052015304565, + "logps/chosen": -97.53728485107422, + "logps/rejected": -85.05529022216797, + "loss": 1.7225, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0191445350646973, + "rewards/margins": -2.5974364280700684, + "rewards/rejected": 4.616580963134766, + "step": 8136 + }, + { + "epoch": 1.32, + "learning_rate": 2.7321555712185766e-06, + "logits/chosen": -1.176780343055725, + "logits/rejected": -1.1214600801467896, + "logps/chosen": -47.949424743652344, + "logps/rejected": -64.98320770263672, + "loss": 0.8387, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.344332218170166, + "rewards/margins": -0.32082366943359375, + "rewards/rejected": 2.6651558876037598, + "step": 8137 + }, + { + "epoch": 1.32, + "learning_rate": 2.7309843585602238e-06, + "logits/chosen": -1.4091806411743164, + "logits/rejected": -1.3804214000701904, + "logps/chosen": -103.3932876586914, + "logps/rejected": -77.1568832397461, + "loss": 0.3734, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.290401458740234, + "rewards/margins": -0.0330047607421875, + "rewards/rejected": 6.323406219482422, + "step": 8138 + }, + { + "epoch": 1.32, + "learning_rate": 2.729813302669384e-06, + "logits/chosen": -1.3346565961837769, + "logits/rejected": -1.3423664569854736, + "logps/chosen": -179.94964599609375, + "logps/rejected": -108.69392395019531, + "loss": 0.3195, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.723718166351318, + "rewards/margins": 0.5501384735107422, + "rewards/rejected": 6.173579692840576, + "step": 8139 + }, + { + "epoch": 1.32, + "learning_rate": 2.7286424036269626e-06, + "logits/chosen": -1.4646191596984863, + "logits/rejected": -1.4223898649215698, + "logps/chosen": -61.264591217041016, + "logps/rejected": -59.33787155151367, + "loss": 0.349, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.961120367050171, + "rewards/margins": 1.8974664211273193, + "rewards/rejected": 2.0636539459228516, + "step": 8140 + }, + { + "epoch": 1.32, + "learning_rate": 2.727471661513861e-06, + "logits/chosen": -1.3610879182815552, + "logits/rejected": -1.4104132652282715, + "logps/chosen": -100.96128845214844, + "logps/rejected": -91.28096008300781, + "loss": 0.7898, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2293992042541504, + "rewards/margins": -1.3418793678283691, + "rewards/rejected": 4.5712785720825195, + "step": 8141 + }, + { + "epoch": 1.32, + "learning_rate": 2.726301076410963e-06, + "logits/chosen": -1.0822491645812988, + "logits/rejected": -1.1132417917251587, + "logps/chosen": -6.223763465881348, + "logps/rejected": -26.532489776611328, + "loss": 0.8666, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4521646499633789, + "rewards/margins": -0.05052584409713745, + "rewards/rejected": 0.5026904940605164, + "step": 8142 + }, + { + "epoch": 1.32, + "learning_rate": 2.725130648399149e-06, + "logits/chosen": -1.4159514904022217, + "logits/rejected": -1.4994337558746338, + "logps/chosen": -70.27659606933594, + "logps/rejected": -90.63006591796875, + "loss": 1.3374, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8905303478240967, + "rewards/margins": -1.8332703113555908, + "rewards/rejected": 4.7238006591796875, + "step": 8143 + }, + { + "epoch": 1.32, + "learning_rate": 2.72396037755928e-06, + "logits/chosen": -1.2637065649032593, + "logits/rejected": -1.0433759689331055, + "logps/chosen": -110.36074829101562, + "logps/rejected": -52.358314514160156, + "loss": 0.0293, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.6678466796875, + "rewards/margins": 4.14835262298584, + "rewards/rejected": 1.5194939374923706, + "step": 8144 + }, + { + "epoch": 1.32, + "learning_rate": 2.7227902639722146e-06, + "logits/chosen": -1.6150037050247192, + "logits/rejected": -1.639910101890564, + "logps/chosen": -145.51708984375, + "logps/rejected": -67.5361557006836, + "loss": 0.2312, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.240332126617432, + "rewards/margins": 2.9380149841308594, + "rewards/rejected": 3.3023171424865723, + "step": 8145 + }, + { + "epoch": 1.32, + "learning_rate": 2.721620307718793e-06, + "logits/chosen": -1.2219127416610718, + "logits/rejected": -1.2954025268554688, + "logps/chosen": -97.3949966430664, + "logps/rejected": -139.462646484375, + "loss": 1.2843, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4460837841033936, + "rewards/margins": -2.4552791118621826, + "rewards/rejected": 5.901362895965576, + "step": 8146 + }, + { + "epoch": 1.32, + "learning_rate": 2.7204505088798517e-06, + "logits/chosen": -0.8891754746437073, + "logits/rejected": -0.8891754746437073, + "logps/chosen": -48.54339599609375, + "logps/rejected": -48.54339599609375, + "loss": 0.3764, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.449240207672119, + "rewards/margins": 0.0, + "rewards/rejected": 2.449240207672119, + "step": 8147 + }, + { + "epoch": 1.32, + "learning_rate": 2.7192808675362092e-06, + "logits/chosen": -1.6018216609954834, + "logits/rejected": -1.554132342338562, + "logps/chosen": -178.40602111816406, + "logps/rejected": -60.69694137573242, + "loss": 0.29, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.501237630844116, + "rewards/margins": 0.26562929153442383, + "rewards/rejected": 3.2356083393096924, + "step": 8148 + }, + { + "epoch": 1.32, + "learning_rate": 2.71811138376868e-06, + "logits/chosen": -1.4011744260787964, + "logits/rejected": -1.3953222036361694, + "logps/chosen": -45.15214538574219, + "logps/rejected": -88.88888549804688, + "loss": 0.8907, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8137664794921875, + "rewards/margins": 1.0026443004608154, + "rewards/rejected": 0.8111221194267273, + "step": 8149 + }, + { + "epoch": 1.32, + "learning_rate": 2.716942057658061e-06, + "logits/chosen": -1.0339466333389282, + "logits/rejected": -1.0321954488754272, + "logps/chosen": -5.289636135101318, + "logps/rejected": -2.612257957458496, + "loss": 0.3731, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.490551620721817, + "rewards/margins": -0.08063605427742004, + "rewards/rejected": 0.5711876749992371, + "step": 8150 + }, + { + "epoch": 1.32, + "learning_rate": 2.715772889285143e-06, + "logits/chosen": -1.4670319557189941, + "logits/rejected": -1.4756759405136108, + "logps/chosen": -46.9383544921875, + "logps/rejected": -60.9525260925293, + "loss": 0.54, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.41324782371521, + "rewards/margins": 0.7497906684875488, + "rewards/rejected": 1.6634571552276611, + "step": 8151 + }, + { + "epoch": 1.32, + "learning_rate": 2.714603878730707e-06, + "logits/chosen": -0.6706271171569824, + "logits/rejected": -0.6706271171569824, + "logps/chosen": -16.768558502197266, + "logps/rejected": -16.768558502197266, + "loss": 0.4911, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4750986099243164, + "rewards/margins": 0.0, + "rewards/rejected": 0.4750986099243164, + "step": 8152 + }, + { + "epoch": 1.32, + "learning_rate": 2.713435026075517e-06, + "logits/chosen": -1.4532405138015747, + "logits/rejected": -1.5597695112228394, + "logps/chosen": -53.389488220214844, + "logps/rejected": -155.69662475585938, + "loss": 2.7221, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7022628784179688, + "rewards/margins": -3.692836284637451, + "rewards/rejected": 6.39509916305542, + "step": 8153 + }, + { + "epoch": 1.32, + "learning_rate": 2.712266331400332e-06, + "logits/chosen": -1.2427374124526978, + "logits/rejected": -1.2288029193878174, + "logps/chosen": -84.26834106445312, + "logps/rejected": -50.69140625, + "loss": 0.1068, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.675950765609741, + "rewards/margins": 1.4737619161605835, + "rewards/rejected": 1.2021888494491577, + "step": 8154 + }, + { + "epoch": 1.32, + "learning_rate": 2.7110977947858954e-06, + "logits/chosen": -1.4067014455795288, + "logits/rejected": -1.3643594980239868, + "logps/chosen": -101.85122680664062, + "logps/rejected": -33.537742614746094, + "loss": 0.2904, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.329815864562988, + "rewards/margins": 0.5657951831817627, + "rewards/rejected": 3.7640206813812256, + "step": 8155 + }, + { + "epoch": 1.32, + "learning_rate": 2.7099294163129453e-06, + "logits/chosen": -1.3153488636016846, + "logits/rejected": -1.1663814783096313, + "logps/chosen": -108.02410125732422, + "logps/rejected": -106.18440246582031, + "loss": 0.0296, + "rewards/accuracies": 1.0, + "rewards/chosen": 10.098432540893555, + "rewards/margins": 3.6308650970458984, + "rewards/rejected": 6.467567443847656, + "step": 8156 + }, + { + "epoch": 1.32, + "learning_rate": 2.708761196062202e-06, + "logits/chosen": -1.3127650022506714, + "logits/rejected": -1.4558409452438354, + "logps/chosen": -57.71234130859375, + "logps/rejected": -103.51004028320312, + "loss": 1.5121, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.160107612609863, + "rewards/margins": -2.960026264190674, + "rewards/rejected": 7.120133876800537, + "step": 8157 + }, + { + "epoch": 1.32, + "learning_rate": 2.7075931341143812e-06, + "logits/chosen": -1.326025366783142, + "logits/rejected": -1.2264963388442993, + "logps/chosen": -95.94691467285156, + "logps/rejected": -33.72822570800781, + "loss": 0.0554, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8410911560058594, + "rewards/margins": 2.4037551879882812, + "rewards/rejected": 0.4373359680175781, + "step": 8158 + }, + { + "epoch": 1.32, + "learning_rate": 2.7064252305501825e-06, + "logits/chosen": -1.516761302947998, + "logits/rejected": -1.5058510303497314, + "logps/chosen": -65.95903015136719, + "logps/rejected": -88.68214416503906, + "loss": 0.0553, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0175399780273438, + "rewards/margins": 2.239403486251831, + "rewards/rejected": -0.2218635529279709, + "step": 8159 + }, + { + "epoch": 1.32, + "learning_rate": 2.7052574854503e-06, + "logits/chosen": -1.0832762718200684, + "logits/rejected": -1.183577060699463, + "logps/chosen": -147.04400634765625, + "logps/rejected": -105.07978820800781, + "loss": 0.226, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.433651924133301, + "rewards/margins": 1.1122910976409912, + "rewards/rejected": 3.3213608264923096, + "step": 8160 + }, + { + "epoch": 1.32, + "learning_rate": 2.7040898988954105e-06, + "logits/chosen": -1.042234182357788, + "logits/rejected": -1.0904549360275269, + "logps/chosen": -65.25962829589844, + "logps/rejected": -41.683162689208984, + "loss": 1.037, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7509522438049316, + "rewards/margins": 0.6128456592559814, + "rewards/rejected": 2.13810658454895, + "step": 8161 + }, + { + "epoch": 1.32, + "learning_rate": 2.702922470966187e-06, + "logits/chosen": -1.0913137197494507, + "logits/rejected": -1.0913137197494507, + "logps/chosen": -60.61610412597656, + "logps/rejected": -60.61610412597656, + "loss": 0.3828, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6832168102264404, + "rewards/margins": 0.0, + "rewards/rejected": 3.6832168102264404, + "step": 8162 + }, + { + "epoch": 1.32, + "learning_rate": 2.7017552017432834e-06, + "logits/chosen": -1.299911618232727, + "logits/rejected": -1.1519454717636108, + "logps/chosen": -47.19085693359375, + "logps/rejected": -14.113296508789062, + "loss": 0.2288, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1955955028533936, + "rewards/margins": 1.5582287311553955, + "rewards/rejected": 0.6373667120933533, + "step": 8163 + }, + { + "epoch": 1.33, + "learning_rate": 2.700588091307351e-06, + "logits/chosen": -1.391202688217163, + "logits/rejected": -1.4010705947875977, + "logps/chosen": -85.71265411376953, + "logps/rejected": -87.7371826171875, + "loss": 0.6465, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.646881103515625, + "rewards/margins": -0.217559814453125, + "rewards/rejected": 2.86444091796875, + "step": 8164 + }, + { + "epoch": 1.33, + "learning_rate": 2.699421139739023e-06, + "logits/chosen": -0.9938008785247803, + "logits/rejected": -0.9872059226036072, + "logps/chosen": -73.52735137939453, + "logps/rejected": -62.83936309814453, + "loss": 0.4873, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.111840009689331, + "rewards/margins": 0.44344472885131836, + "rewards/rejected": 1.6683952808380127, + "step": 8165 + }, + { + "epoch": 1.33, + "learning_rate": 2.698254347118927e-06, + "logits/chosen": -1.3196862936019897, + "logits/rejected": -1.351332426071167, + "logps/chosen": -79.08424377441406, + "logps/rejected": -108.95970153808594, + "loss": 1.1386, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.668194532394409, + "rewards/margins": -1.096736192703247, + "rewards/rejected": 4.764930725097656, + "step": 8166 + }, + { + "epoch": 1.33, + "learning_rate": 2.697087713527675e-06, + "logits/chosen": -1.443640112876892, + "logits/rejected": -1.4400830268859863, + "logps/chosen": -45.48210144042969, + "logps/rejected": -97.5584487915039, + "loss": 0.868, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1534981727600098, + "rewards/margins": -1.2898848056793213, + "rewards/rejected": 3.443382978439331, + "step": 8167 + }, + { + "epoch": 1.33, + "learning_rate": 2.695921239045873e-06, + "logits/chosen": -1.4018590450286865, + "logits/rejected": -1.412941575050354, + "logps/chosen": -64.15528106689453, + "logps/rejected": -143.1763916015625, + "loss": 0.2713, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8968056440353394, + "rewards/margins": 0.96254962682724, + "rewards/rejected": 0.9342560172080994, + "step": 8168 + }, + { + "epoch": 1.33, + "learning_rate": 2.694754923754111e-06, + "logits/chosen": -1.4092957973480225, + "logits/rejected": -1.5778558254241943, + "logps/chosen": -221.11572265625, + "logps/rejected": -131.7153778076172, + "loss": 0.115, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.374518394470215, + "rewards/margins": 2.632966995239258, + "rewards/rejected": 6.741551399230957, + "step": 8169 + }, + { + "epoch": 1.33, + "learning_rate": 2.6935887677329727e-06, + "logits/chosen": -1.2470424175262451, + "logits/rejected": -1.2482353448867798, + "logps/chosen": -30.254579544067383, + "logps/rejected": -45.04378890991211, + "loss": 0.7526, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9298685193061829, + "rewards/margins": -1.1034984588623047, + "rewards/rejected": 2.0333669185638428, + "step": 8170 + }, + { + "epoch": 1.33, + "learning_rate": 2.6924227710630247e-06, + "logits/chosen": -1.1132656335830688, + "logits/rejected": -1.143239140510559, + "logps/chosen": -22.252174377441406, + "logps/rejected": -64.337890625, + "loss": 0.7137, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.926578164100647, + "rewards/margins": 0.6268604397773743, + "rewards/rejected": 0.2997177243232727, + "step": 8171 + }, + { + "epoch": 1.33, + "learning_rate": 2.6912569338248317e-06, + "logits/chosen": -1.5563759803771973, + "logits/rejected": -1.5181972980499268, + "logps/chosen": -42.202999114990234, + "logps/rejected": -66.66827392578125, + "loss": 0.2063, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.959932327270508, + "rewards/margins": 0.7060916423797607, + "rewards/rejected": 2.253840684890747, + "step": 8172 + }, + { + "epoch": 1.33, + "learning_rate": 2.6900912560989363e-06, + "logits/chosen": -0.950290858745575, + "logits/rejected": -0.950290858745575, + "logps/chosen": -53.10835647583008, + "logps/rejected": -53.10835647583008, + "loss": 1.8528, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7823857069015503, + "rewards/margins": 0.0, + "rewards/rejected": 1.7823857069015503, + "step": 8173 + }, + { + "epoch": 1.33, + "learning_rate": 2.6889257379658804e-06, + "logits/chosen": -1.090555191040039, + "logits/rejected": -1.138810396194458, + "logps/chosen": -56.36248779296875, + "logps/rejected": -77.96411895751953, + "loss": 2.9866, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8528457880020142, + "rewards/margins": -5.397898197174072, + "rewards/rejected": 7.250743865966797, + "step": 8174 + }, + { + "epoch": 1.33, + "learning_rate": 2.6877603795061868e-06, + "logits/chosen": -1.2761459350585938, + "logits/rejected": -1.3255155086517334, + "logps/chosen": -36.672664642333984, + "logps/rejected": -136.1995849609375, + "loss": 1.4153, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1230123043060303, + "rewards/margins": -1.6723108291625977, + "rewards/rejected": 3.795323133468628, + "step": 8175 + }, + { + "epoch": 1.33, + "learning_rate": 2.6865951808003743e-06, + "logits/chosen": -1.1290990114212036, + "logits/rejected": -1.1211280822753906, + "logps/chosen": -64.08478546142578, + "logps/rejected": -47.00263595581055, + "loss": 0.2776, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.919276475906372, + "rewards/margins": 0.4226055145263672, + "rewards/rejected": 2.496670961380005, + "step": 8176 + }, + { + "epoch": 1.33, + "learning_rate": 2.685430141928943e-06, + "logits/chosen": -1.11409330368042, + "logits/rejected": -1.1464041471481323, + "logps/chosen": -81.8216781616211, + "logps/rejected": -82.47373962402344, + "loss": 1.6898, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.204946279525757, + "rewards/margins": -3.2418463230133057, + "rewards/rejected": 5.4467926025390625, + "step": 8177 + }, + { + "epoch": 1.33, + "learning_rate": 2.6842652629723907e-06, + "logits/chosen": -0.8478772640228271, + "logits/rejected": -0.8485353589057922, + "logps/chosen": -1.1294668912887573, + "logps/rejected": -2.562183380126953, + "loss": 0.3415, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23850040137767792, + "rewards/margins": 0.021325156092643738, + "rewards/rejected": 0.21717524528503418, + "step": 8178 + }, + { + "epoch": 1.33, + "learning_rate": 2.6831005440111944e-06, + "logits/chosen": -1.3196892738342285, + "logits/rejected": -1.2990784645080566, + "logps/chosen": -92.05538177490234, + "logps/rejected": -44.94453048706055, + "loss": 0.279, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3000435829162598, + "rewards/margins": 0.6042263507843018, + "rewards/rejected": 1.695817232131958, + "step": 8179 + }, + { + "epoch": 1.33, + "learning_rate": 2.6819359851258302e-06, + "logits/chosen": -1.327705979347229, + "logits/rejected": -1.327705979347229, + "logps/chosen": -47.823402404785156, + "logps/rejected": -47.823402404785156, + "loss": 0.4175, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.8885459899902344, + "rewards/margins": 0.0, + "rewards/rejected": 3.8885459899902344, + "step": 8180 + }, + { + "epoch": 1.33, + "learning_rate": 2.680771586396754e-06, + "logits/chosen": -1.1902540922164917, + "logits/rejected": -1.2692161798477173, + "logps/chosen": -98.91162109375, + "logps/rejected": -146.0034637451172, + "loss": 1.2524, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.707479953765869, + "rewards/margins": -2.0560975074768066, + "rewards/rejected": 5.763577461242676, + "step": 8181 + }, + { + "epoch": 1.33, + "learning_rate": 2.6796073479044175e-06, + "logits/chosen": -1.3601397275924683, + "logits/rejected": -1.3112927675247192, + "logps/chosen": -53.76610565185547, + "logps/rejected": -115.79006958007812, + "loss": 0.4333, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0986557006835938, + "rewards/margins": -0.2766404151916504, + "rewards/rejected": 2.375296115875244, + "step": 8182 + }, + { + "epoch": 1.33, + "learning_rate": 2.678443269729256e-06, + "logits/chosen": -1.3102346658706665, + "logits/rejected": -1.329970359802246, + "logps/chosen": -60.70613479614258, + "logps/rejected": -42.73273468017578, + "loss": 0.5432, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.687904119491577, + "rewards/margins": 1.1392903327941895, + "rewards/rejected": 1.5486137866973877, + "step": 8183 + }, + { + "epoch": 1.33, + "learning_rate": 2.6772793519517003e-06, + "logits/chosen": -1.3602396249771118, + "logits/rejected": -1.059788703918457, + "logps/chosen": -123.05382537841797, + "logps/rejected": -53.642921447753906, + "loss": 0.0663, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.205725908279419, + "rewards/margins": 3.2831809520721436, + "rewards/rejected": -0.07745514065027237, + "step": 8184 + }, + { + "epoch": 1.33, + "learning_rate": 2.6761155946521613e-06, + "logits/chosen": -1.5385409593582153, + "logits/rejected": -1.2515534162521362, + "logps/chosen": -95.59161376953125, + "logps/rejected": -153.5406494140625, + "loss": 1.5877, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6503281593322754, + "rewards/margins": -3.045924186706543, + "rewards/rejected": 5.696252346038818, + "step": 8185 + }, + { + "epoch": 1.33, + "learning_rate": 2.6749519979110454e-06, + "logits/chosen": -1.1986027956008911, + "logits/rejected": -1.1586958169937134, + "logps/chosen": -100.33361053466797, + "logps/rejected": -61.24714279174805, + "loss": 1.3123, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9846779108047485, + "rewards/margins": 0.006634175777435303, + "rewards/rejected": 0.9780437350273132, + "step": 8186 + }, + { + "epoch": 1.33, + "learning_rate": 2.673788561808749e-06, + "logits/chosen": -1.2106599807739258, + "logits/rejected": -1.1587769985198975, + "logps/chosen": -96.8077392578125, + "logps/rejected": -46.91518783569336, + "loss": 0.1664, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8475745916366577, + "rewards/margins": 1.028491497039795, + "rewards/rejected": 0.819083034992218, + "step": 8187 + }, + { + "epoch": 1.33, + "learning_rate": 2.6726252864256502e-06, + "logits/chosen": -1.7203071117401123, + "logits/rejected": -1.7025271654129028, + "logps/chosen": -125.724365234375, + "logps/rejected": -15.20553970336914, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.3927459716796875, + "rewards/margins": 5.494965076446533, + "rewards/rejected": 0.8977810144424438, + "step": 8188 + }, + { + "epoch": 1.33, + "learning_rate": 2.6714621718421242e-06, + "logits/chosen": -1.3799039125442505, + "logits/rejected": -1.4373092651367188, + "logps/chosen": -212.98374938964844, + "logps/rejected": -109.46878051757812, + "loss": 0.0675, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.9596147537231445, + "rewards/margins": 2.2476611137390137, + "rewards/rejected": 5.711953639984131, + "step": 8189 + }, + { + "epoch": 1.33, + "learning_rate": 2.670299218138527e-06, + "logits/chosen": -1.3509788513183594, + "logits/rejected": -1.281792402267456, + "logps/chosen": -54.07575988769531, + "logps/rejected": -35.774658203125, + "loss": 0.8118, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.458378553390503, + "rewards/margins": 2.793691635131836, + "rewards/rejected": 0.6646869778633118, + "step": 8190 + }, + { + "epoch": 1.33, + "learning_rate": 2.6691364253952123e-06, + "logits/chosen": -1.231482744216919, + "logits/rejected": -1.2724841833114624, + "logps/chosen": -82.85303497314453, + "logps/rejected": -99.62992858886719, + "loss": 0.9263, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.217189788818359, + "rewards/margins": -1.5779471397399902, + "rewards/rejected": 6.79513692855835, + "step": 8191 + }, + { + "epoch": 1.33, + "learning_rate": 2.6679737936925145e-06, + "logits/chosen": -1.4815822839736938, + "logits/rejected": -1.4985852241516113, + "logps/chosen": -170.48727416992188, + "logps/rejected": -50.363807678222656, + "loss": 0.0245, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.310101509094238, + "rewards/margins": 3.955045700073242, + "rewards/rejected": 1.3550556898117065, + "step": 8192 + }, + { + "epoch": 1.33, + "learning_rate": 2.666811323110763e-06, + "logits/chosen": -1.4982542991638184, + "logits/rejected": -1.546789288520813, + "logps/chosen": -55.38029861450195, + "logps/rejected": -85.22248840332031, + "loss": 0.6662, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.928809881210327, + "rewards/margins": 0.8378186225891113, + "rewards/rejected": 2.090991258621216, + "step": 8193 + }, + { + "epoch": 1.33, + "learning_rate": 2.66564901373027e-06, + "logits/chosen": -1.0834405422210693, + "logits/rejected": -1.108189344406128, + "logps/chosen": -13.756735801696777, + "logps/rejected": -54.22125244140625, + "loss": 0.9511, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.46257659792900085, + "rewards/margins": -0.13995179533958435, + "rewards/rejected": 0.6025283932685852, + "step": 8194 + }, + { + "epoch": 1.33, + "learning_rate": 2.6644868656313442e-06, + "logits/chosen": -0.876996636390686, + "logits/rejected": -0.876996636390686, + "logps/chosen": -29.29917335510254, + "logps/rejected": -29.29917335510254, + "loss": 0.9571, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3830816745758057, + "rewards/margins": 0.0, + "rewards/rejected": 1.3830816745758057, + "step": 8195 + }, + { + "epoch": 1.33, + "learning_rate": 2.6633248788942756e-06, + "logits/chosen": -1.3097662925720215, + "logits/rejected": -1.29685640335083, + "logps/chosen": -87.81015014648438, + "logps/rejected": -74.66133117675781, + "loss": 0.7421, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8017578125, + "rewards/margins": -0.23180389404296875, + "rewards/rejected": 2.0335617065429688, + "step": 8196 + }, + { + "epoch": 1.33, + "learning_rate": 2.66216305359935e-06, + "logits/chosen": -0.6049615144729614, + "logits/rejected": -0.6067713499069214, + "logps/chosen": -4.096052169799805, + "logps/rejected": -2.31022047996521, + "loss": 1.1643, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.19851922988891602, + "rewards/margins": -0.04587052762508392, + "rewards/rejected": 0.24438975751399994, + "step": 8197 + }, + { + "epoch": 1.33, + "learning_rate": 2.661001389826835e-06, + "logits/chosen": -1.4274321794509888, + "logits/rejected": -1.4216643571853638, + "logps/chosen": -75.0441665649414, + "logps/rejected": -29.669849395751953, + "loss": 1.5862, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.537029266357422, + "rewards/margins": -3.128157615661621, + "rewards/rejected": 5.665186882019043, + "step": 8198 + }, + { + "epoch": 1.33, + "learning_rate": 2.659839887656993e-06, + "logits/chosen": -1.003010630607605, + "logits/rejected": -0.9171916842460632, + "logps/chosen": -62.70948791503906, + "logps/rejected": -26.836566925048828, + "loss": 0.3089, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8477569818496704, + "rewards/margins": 0.3857250213623047, + "rewards/rejected": 1.4620319604873657, + "step": 8199 + }, + { + "epoch": 1.33, + "learning_rate": 2.658678547170071e-06, + "logits/chosen": -1.0702029466629028, + "logits/rejected": -0.9099695682525635, + "logps/chosen": -57.96650314331055, + "logps/rejected": -31.544475555419922, + "loss": 0.2216, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8210529088974, + "rewards/margins": 1.5543490648269653, + "rewards/rejected": 0.2667038142681122, + "step": 8200 + }, + { + "epoch": 1.33, + "learning_rate": 2.6575173684463096e-06, + "logits/chosen": -1.260543704032898, + "logits/rejected": -1.347001075744629, + "logps/chosen": -59.581031799316406, + "logps/rejected": -117.15402221679688, + "loss": 2.2322, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3553764820098877, + "rewards/margins": -3.604227304458618, + "rewards/rejected": 4.959603786468506, + "step": 8201 + }, + { + "epoch": 1.33, + "learning_rate": 2.656356351565931e-06, + "logits/chosen": -0.7310308218002319, + "logits/rejected": -0.6713460087776184, + "logps/chosen": -36.05791091918945, + "logps/rejected": -42.163673400878906, + "loss": 0.4916, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9747692346572876, + "rewards/margins": -0.22613799571990967, + "rewards/rejected": 2.2009072303771973, + "step": 8202 + }, + { + "epoch": 1.33, + "learning_rate": 2.655195496609154e-06, + "logits/chosen": -1.055608868598938, + "logits/rejected": -0.9936903119087219, + "logps/chosen": -126.05217742919922, + "logps/rejected": -104.28907775878906, + "loss": 0.2503, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.585144996643066, + "rewards/margins": 0.751655101776123, + "rewards/rejected": 5.833489894866943, + "step": 8203 + }, + { + "epoch": 1.33, + "learning_rate": 2.6540348036561804e-06, + "logits/chosen": -1.2563481330871582, + "logits/rejected": -1.1088542938232422, + "logps/chosen": -94.78356170654297, + "logps/rejected": -28.86153793334961, + "loss": 0.0548, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0040841102600098, + "rewards/margins": 3.9397287368774414, + "rewards/rejected": -0.9356445670127869, + "step": 8204 + }, + { + "epoch": 1.33, + "learning_rate": 2.652874272787206e-06, + "logits/chosen": -1.2539299726486206, + "logits/rejected": -1.2617400884628296, + "logps/chosen": -119.86444091796875, + "logps/rejected": -134.77621459960938, + "loss": 0.4511, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.230250597000122, + "rewards/margins": -0.33620452880859375, + "rewards/rejected": 1.5664551258087158, + "step": 8205 + }, + { + "epoch": 1.33, + "learning_rate": 2.651713904082408e-06, + "logits/chosen": -1.178512692451477, + "logits/rejected": -1.1141833066940308, + "logps/chosen": -103.4026107788086, + "logps/rejected": -54.384483337402344, + "loss": 0.3134, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.634459972381592, + "rewards/margins": 1.1235895156860352, + "rewards/rejected": 4.510870456695557, + "step": 8206 + }, + { + "epoch": 1.33, + "learning_rate": 2.6505536976219625e-06, + "logits/chosen": -1.3848702907562256, + "logits/rejected": -1.4024462699890137, + "logps/chosen": -129.39572143554688, + "logps/rejected": -85.70243835449219, + "loss": 1.2855, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.762510776519775, + "rewards/margins": -2.4914183616638184, + "rewards/rejected": 8.253929138183594, + "step": 8207 + }, + { + "epoch": 1.33, + "learning_rate": 2.649393653486023e-06, + "logits/chosen": -1.3972574472427368, + "logits/rejected": -1.4542181491851807, + "logps/chosen": -96.0908203125, + "logps/rejected": -117.30265808105469, + "loss": 0.605, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.978076457977295, + "rewards/margins": -0.27217531204223633, + "rewards/rejected": 7.250251770019531, + "step": 8208 + }, + { + "epoch": 1.33, + "learning_rate": 2.648233771754743e-06, + "logits/chosen": -1.283896565437317, + "logits/rejected": -1.240755558013916, + "logps/chosen": -94.84614562988281, + "logps/rejected": -130.3960418701172, + "loss": 0.6381, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9990020990371704, + "rewards/margins": -0.01002800464630127, + "rewards/rejected": 2.0090301036834717, + "step": 8209 + }, + { + "epoch": 1.33, + "learning_rate": 2.6470740525082544e-06, + "logits/chosen": -1.49799382686615, + "logits/rejected": -1.3858510255813599, + "logps/chosen": -59.83070755004883, + "logps/rejected": -12.369280815124512, + "loss": 0.0196, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.521753311157227, + "rewards/margins": 3.75573468208313, + "rewards/rejected": 0.7660185694694519, + "step": 8210 + }, + { + "epoch": 1.33, + "learning_rate": 2.645914495826687e-06, + "logits/chosen": -1.3636952638626099, + "logits/rejected": -1.3446561098098755, + "logps/chosen": -33.958641052246094, + "logps/rejected": -56.22395324707031, + "loss": 0.5739, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.298985719680786, + "rewards/margins": 0.17675447463989258, + "rewards/rejected": 2.1222312450408936, + "step": 8211 + }, + { + "epoch": 1.33, + "learning_rate": 2.644755101790152e-06, + "logits/chosen": -1.2213505506515503, + "logits/rejected": -1.2213505506515503, + "logps/chosen": -25.04340171813965, + "logps/rejected": -25.04340171813965, + "loss": 0.7558, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0094530582427979, + "rewards/margins": 0.0, + "rewards/rejected": 1.0094530582427979, + "step": 8212 + }, + { + "epoch": 1.33, + "learning_rate": 2.6435958704787558e-06, + "logits/chosen": -1.3644537925720215, + "logits/rejected": -1.3350698947906494, + "logps/chosen": -233.51458740234375, + "logps/rejected": -66.49609375, + "loss": 0.266, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.60707426071167, + "rewards/margins": 2.335710287094116, + "rewards/rejected": 2.2713639736175537, + "step": 8213 + }, + { + "epoch": 1.33, + "learning_rate": 2.6424368019725877e-06, + "logits/chosen": -1.3558845520019531, + "logits/rejected": -1.3550728559494019, + "logps/chosen": -47.85255432128906, + "logps/rejected": -66.54736328125, + "loss": 0.4802, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.711157202720642, + "rewards/margins": -0.4635978937149048, + "rewards/rejected": 2.174755096435547, + "step": 8214 + }, + { + "epoch": 1.33, + "learning_rate": 2.641277896351728e-06, + "logits/chosen": -1.259250521659851, + "logits/rejected": -1.2221693992614746, + "logps/chosen": -72.98487854003906, + "logps/rejected": -118.49256896972656, + "loss": 0.0418, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.319867134094238, + "rewards/margins": 2.6703248023986816, + "rewards/rejected": 2.6495423316955566, + "step": 8215 + }, + { + "epoch": 1.33, + "learning_rate": 2.640119153696249e-06, + "logits/chosen": -1.0835825204849243, + "logits/rejected": -1.0613455772399902, + "logps/chosen": -28.30162811279297, + "logps/rejected": -27.263914108276367, + "loss": 0.9916, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3995422422885895, + "rewards/margins": 0.29548129439353943, + "rewards/rejected": 0.10406094044446945, + "step": 8216 + }, + { + "epoch": 1.33, + "learning_rate": 2.6389605740862046e-06, + "logits/chosen": -1.2194856405258179, + "logits/rejected": -1.3218061923980713, + "logps/chosen": -55.3013916015625, + "logps/rejected": -125.69465637207031, + "loss": 1.1821, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.311316728591919, + "rewards/margins": -2.1578071117401123, + "rewards/rejected": 5.469123840332031, + "step": 8217 + }, + { + "epoch": 1.33, + "learning_rate": 2.6378021576016467e-06, + "logits/chosen": -1.4384844303131104, + "logits/rejected": -1.2174046039581299, + "logps/chosen": -91.24903869628906, + "logps/rejected": -159.8745574951172, + "loss": 0.4655, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.414987087249756, + "rewards/margins": 0.12201976776123047, + "rewards/rejected": 6.292967319488525, + "step": 8218 + }, + { + "epoch": 1.33, + "learning_rate": 2.636643904322606e-06, + "logits/chosen": -1.4023115634918213, + "logits/rejected": -1.4354236125946045, + "logps/chosen": -240.7451171875, + "logps/rejected": -106.80529022216797, + "loss": 0.151, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.241930961608887, + "rewards/margins": 3.410187244415283, + "rewards/rejected": 4.8317437171936035, + "step": 8219 + }, + { + "epoch": 1.33, + "learning_rate": 2.6354858143291117e-06, + "logits/chosen": -1.3888033628463745, + "logits/rejected": -1.2358068227767944, + "logps/chosen": -59.7305908203125, + "logps/rejected": -13.324161529541016, + "loss": 0.0964, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5865960121154785, + "rewards/margins": 1.7637591361999512, + "rewards/rejected": 0.8228368759155273, + "step": 8220 + }, + { + "epoch": 1.33, + "learning_rate": 2.6343278877011717e-06, + "logits/chosen": -1.0901014804840088, + "logits/rejected": -1.1740427017211914, + "logps/chosen": -55.15496826171875, + "logps/rejected": -108.0249252319336, + "loss": 0.6746, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.8409156799316406, + "rewards/margins": -0.7512941360473633, + "rewards/rejected": 4.592209815979004, + "step": 8221 + }, + { + "epoch": 1.33, + "learning_rate": 2.6331701245187934e-06, + "logits/chosen": -1.1991404294967651, + "logits/rejected": -0.9888275265693665, + "logps/chosen": -64.90911102294922, + "logps/rejected": -16.560768127441406, + "loss": 0.1499, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9066827297210693, + "rewards/margins": 2.7771694660186768, + "rewards/rejected": 0.12951336801052094, + "step": 8222 + }, + { + "epoch": 1.33, + "learning_rate": 2.6320125248619616e-06, + "logits/chosen": -1.2801154851913452, + "logits/rejected": -1.3033467531204224, + "logps/chosen": -70.87837982177734, + "logps/rejected": -39.754539489746094, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.363245487213135, + "rewards/margins": 4.25847053527832, + "rewards/rejected": 1.104774832725525, + "step": 8223 + }, + { + "epoch": 1.33, + "learning_rate": 2.6308550888106603e-06, + "logits/chosen": -1.3759351968765259, + "logits/rejected": -1.3935366868972778, + "logps/chosen": -37.006126403808594, + "logps/rejected": -45.717308044433594, + "loss": 1.7583, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7451351881027222, + "rewards/margins": -3.225978374481201, + "rewards/rejected": 4.971113681793213, + "step": 8224 + }, + { + "epoch": 1.34, + "learning_rate": 2.6296978164448538e-06, + "logits/chosen": -1.2944920063018799, + "logits/rejected": -1.29674232006073, + "logps/chosen": -101.667236328125, + "logps/rejected": -91.03369903564453, + "loss": 0.303, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.991980791091919, + "rewards/margins": 0.7331428527832031, + "rewards/rejected": 1.2588379383087158, + "step": 8225 + }, + { + "epoch": 1.34, + "learning_rate": 2.6285407078445015e-06, + "logits/chosen": -1.154396891593933, + "logits/rejected": -1.1572074890136719, + "logps/chosen": -5.858303546905518, + "logps/rejected": -5.9617743492126465, + "loss": 1.4499, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.26441437005996704, + "rewards/margins": -0.09132203459739685, + "rewards/rejected": 0.3557364046573639, + "step": 8226 + }, + { + "epoch": 1.34, + "learning_rate": 2.627383763089546e-06, + "logits/chosen": -1.5158443450927734, + "logits/rejected": -1.386168122291565, + "logps/chosen": -110.5829849243164, + "logps/rejected": -26.79456329345703, + "loss": 0.1328, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9795814752578735, + "rewards/margins": 1.491360068321228, + "rewards/rejected": 0.4882213771343231, + "step": 8227 + }, + { + "epoch": 1.34, + "learning_rate": 2.626226982259924e-06, + "logits/chosen": -1.1286870241165161, + "logits/rejected": -0.9611828327178955, + "logps/chosen": -46.600955963134766, + "logps/rejected": -35.53858184814453, + "loss": 0.5461, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0609982013702393, + "rewards/margins": -0.6472408771514893, + "rewards/rejected": 3.7082390785217285, + "step": 8228 + }, + { + "epoch": 1.34, + "learning_rate": 2.6250703654355545e-06, + "logits/chosen": -1.2518295049667358, + "logits/rejected": -1.246462106704712, + "logps/chosen": -87.647216796875, + "logps/rejected": -59.0773811340332, + "loss": 1.4915, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0338242053985596, + "rewards/margins": 1.6474170684814453, + "rewards/rejected": 1.3864071369171143, + "step": 8229 + }, + { + "epoch": 1.34, + "learning_rate": 2.6239139126963543e-06, + "logits/chosen": -1.1836808919906616, + "logits/rejected": -1.1836808919906616, + "logps/chosen": -21.72685432434082, + "logps/rejected": -21.72685432434082, + "loss": 0.3533, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5083636045455933, + "rewards/margins": 0.0, + "rewards/rejected": 1.5083636045455933, + "step": 8230 + }, + { + "epoch": 1.34, + "learning_rate": 2.6227576241222164e-06, + "logits/chosen": -1.1225587129592896, + "logits/rejected": -1.1225587129592896, + "logps/chosen": -15.316879272460938, + "logps/rejected": -15.316879272460938, + "loss": 0.3594, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8056373596191406, + "rewards/margins": 0.0, + "rewards/rejected": 2.8056373596191406, + "step": 8231 + }, + { + "epoch": 1.34, + "learning_rate": 2.621601499793036e-06, + "logits/chosen": -1.1387444734573364, + "logits/rejected": -1.2006412744522095, + "logps/chosen": -74.72537231445312, + "logps/rejected": -48.914833068847656, + "loss": 1.5606, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2713241577148438, + "rewards/margins": -2.36610746383667, + "rewards/rejected": 4.637431621551514, + "step": 8232 + }, + { + "epoch": 1.34, + "learning_rate": 2.620445539788685e-06, + "logits/chosen": -1.3308144807815552, + "logits/rejected": -1.3506053686141968, + "logps/chosen": -44.319801330566406, + "logps/rejected": -81.58439636230469, + "loss": 0.5618, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.672471046447754, + "rewards/margins": -0.09850215911865234, + "rewards/rejected": 4.770973205566406, + "step": 8233 + }, + { + "epoch": 1.34, + "learning_rate": 2.6192897441890337e-06, + "logits/chosen": -0.674385130405426, + "logits/rejected": -0.7122413516044617, + "logps/chosen": -15.570221900939941, + "logps/rejected": -42.632347106933594, + "loss": 0.4085, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39682379364967346, + "rewards/margins": 0.10588103532791138, + "rewards/rejected": 0.2909427583217621, + "step": 8234 + }, + { + "epoch": 1.34, + "learning_rate": 2.6181341130739324e-06, + "logits/chosen": -1.1128495931625366, + "logits/rejected": -1.0963408946990967, + "logps/chosen": -8.302922248840332, + "logps/rejected": -13.590429306030273, + "loss": 0.5507, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4494485855102539, + "rewards/margins": -0.4021087884902954, + "rewards/rejected": 0.8515573740005493, + "step": 8235 + }, + { + "epoch": 1.34, + "learning_rate": 2.6169786465232283e-06, + "logits/chosen": -1.3010393381118774, + "logits/rejected": -1.2641195058822632, + "logps/chosen": -52.44811248779297, + "logps/rejected": -97.6822280883789, + "loss": 0.2017, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4176461696624756, + "rewards/margins": 1.2203949689865112, + "rewards/rejected": 1.1972512006759644, + "step": 8236 + }, + { + "epoch": 1.34, + "learning_rate": 2.61582334461675e-06, + "logits/chosen": -1.3105214834213257, + "logits/rejected": -1.0794202089309692, + "logps/chosen": -170.9557342529297, + "logps/rejected": -51.90184783935547, + "loss": 0.0674, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.0355730056762695, + "rewards/margins": 2.1991326808929443, + "rewards/rejected": 3.836440324783325, + "step": 8237 + }, + { + "epoch": 1.34, + "learning_rate": 2.614668207434321e-06, + "logits/chosen": -1.1040070056915283, + "logits/rejected": -1.1060044765472412, + "logps/chosen": -49.77027893066406, + "logps/rejected": -82.97966003417969, + "loss": 0.223, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.022747039794922, + "rewards/margins": 0.8425201177597046, + "rewards/rejected": 1.1802269220352173, + "step": 8238 + }, + { + "epoch": 1.34, + "learning_rate": 2.613513235055747e-06, + "logits/chosen": -1.221283197402954, + "logits/rejected": -1.2142809629440308, + "logps/chosen": -43.19755935668945, + "logps/rejected": -143.98292541503906, + "loss": 0.6954, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.184570074081421, + "rewards/margins": 1.8349714279174805, + "rewards/rejected": 0.3495987057685852, + "step": 8239 + }, + { + "epoch": 1.34, + "learning_rate": 2.6123584275608284e-06, + "logits/chosen": -0.8992397785186768, + "logits/rejected": -0.8655189275741577, + "logps/chosen": -17.98758888244629, + "logps/rejected": -1.621895432472229, + "loss": 0.5755, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.03079853020608425, + "rewards/margins": -0.0680854469537735, + "rewards/rejected": 0.0988839790225029, + "step": 8240 + }, + { + "epoch": 1.34, + "learning_rate": 2.6112037850293494e-06, + "logits/chosen": -1.0401350259780884, + "logits/rejected": -1.0295612812042236, + "logps/chosen": -62.258506774902344, + "logps/rejected": -108.1668472290039, + "loss": 0.2986, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.638166904449463, + "rewards/margins": 0.6294364929199219, + "rewards/rejected": 2.008730411529541, + "step": 8241 + }, + { + "epoch": 1.34, + "learning_rate": 2.610049307541085e-06, + "logits/chosen": -1.4443771839141846, + "logits/rejected": -1.3979476690292358, + "logps/chosen": -123.66371154785156, + "logps/rejected": -172.60763549804688, + "loss": 0.2201, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.383444309234619, + "rewards/margins": 0.6131744384765625, + "rewards/rejected": 5.770269870758057, + "step": 8242 + }, + { + "epoch": 1.34, + "learning_rate": 2.608894995175802e-06, + "logits/chosen": -1.4211876392364502, + "logits/rejected": -1.3709211349487305, + "logps/chosen": -62.573726654052734, + "logps/rejected": -56.04173278808594, + "loss": 0.152, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.919649124145508, + "rewards/margins": 1.3340046405792236, + "rewards/rejected": 2.585644483566284, + "step": 8243 + }, + { + "epoch": 1.34, + "learning_rate": 2.6077408480132476e-06, + "logits/chosen": -1.2982629537582397, + "logits/rejected": -1.2982629537582397, + "logps/chosen": -0.7297161817550659, + "logps/rejected": -0.7297161817550659, + "loss": 0.5807, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.30318590998649597, + "rewards/margins": 0.0, + "rewards/rejected": 0.30318590998649597, + "step": 8244 + }, + { + "epoch": 1.34, + "learning_rate": 2.6065868661331673e-06, + "logits/chosen": -1.2526224851608276, + "logits/rejected": -1.2977913618087769, + "logps/chosen": -53.29270935058594, + "logps/rejected": -140.25799560546875, + "loss": 1.1906, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3501648902893066, + "rewards/margins": -0.9719130992889404, + "rewards/rejected": 3.322077989578247, + "step": 8245 + }, + { + "epoch": 1.34, + "learning_rate": 2.6054330496152856e-06, + "logits/chosen": -1.017004370689392, + "logits/rejected": -1.1124526262283325, + "logps/chosen": -62.69805145263672, + "logps/rejected": -84.54502868652344, + "loss": 1.7007, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.217231035232544, + "rewards/margins": -3.1867926120758057, + "rewards/rejected": 6.40402364730835, + "step": 8246 + }, + { + "epoch": 1.34, + "learning_rate": 2.6042793985393243e-06, + "logits/chosen": -1.2437161207199097, + "logits/rejected": -1.053348422050476, + "logps/chosen": -128.7462158203125, + "logps/rejected": -87.38298034667969, + "loss": 0.2615, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.9177093505859375, + "rewards/margins": 1.6087265014648438, + "rewards/rejected": 5.308982849121094, + "step": 8247 + }, + { + "epoch": 1.34, + "learning_rate": 2.6031259129849868e-06, + "logits/chosen": -1.1062661409378052, + "logits/rejected": -1.1062661409378052, + "logps/chosen": -76.11737823486328, + "logps/rejected": -76.11737823486328, + "loss": 1.055, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3443421125411987, + "rewards/margins": 0.0, + "rewards/rejected": 1.3443421125411987, + "step": 8248 + }, + { + "epoch": 1.34, + "learning_rate": 2.6019725930319707e-06, + "logits/chosen": -1.2950243949890137, + "logits/rejected": -1.232122778892517, + "logps/chosen": -81.50962829589844, + "logps/rejected": -85.55266571044922, + "loss": 0.6055, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.734593391418457, + "rewards/margins": -0.050173282623291016, + "rewards/rejected": 6.784766674041748, + "step": 8249 + }, + { + "epoch": 1.34, + "learning_rate": 2.600819438759956e-06, + "logits/chosen": -1.036159634590149, + "logits/rejected": -0.9163257479667664, + "logps/chosen": -43.662506103515625, + "logps/rejected": -39.979522705078125, + "loss": 0.4151, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.064427137374878, + "rewards/margins": 1.1927188634872437, + "rewards/rejected": 1.8717082738876343, + "step": 8250 + }, + { + "epoch": 1.34, + "learning_rate": 2.599666450248619e-06, + "logits/chosen": -1.058298945426941, + "logits/rejected": -1.2044743299484253, + "logps/chosen": -113.785888671875, + "logps/rejected": -137.96542358398438, + "loss": 2.2358, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.441162109375, + "rewards/margins": -1.7061614990234375, + "rewards/rejected": 3.1473236083984375, + "step": 8251 + }, + { + "epoch": 1.34, + "learning_rate": 2.598513627577617e-06, + "logits/chosen": -1.4048014879226685, + "logits/rejected": -1.3607274293899536, + "logps/chosen": -110.69666290283203, + "logps/rejected": -61.871734619140625, + "loss": 0.1843, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.199367523193359, + "rewards/margins": 0.8345832824707031, + "rewards/rejected": 3.3647842407226562, + "step": 8252 + }, + { + "epoch": 1.34, + "learning_rate": 2.5973609708266012e-06, + "logits/chosen": -0.90742427110672, + "logits/rejected": -0.9080138802528381, + "logps/chosen": -47.096046447753906, + "logps/rejected": -74.2599105834961, + "loss": 0.6961, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5345550775527954, + "rewards/margins": -1.089867353439331, + "rewards/rejected": 1.6244224309921265, + "step": 8253 + }, + { + "epoch": 1.34, + "learning_rate": 2.5962084800752064e-06, + "logits/chosen": -1.3579738140106201, + "logits/rejected": -1.3339228630065918, + "logps/chosen": -71.32674407958984, + "logps/rejected": -65.53581237792969, + "loss": 0.8741, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4510324001312256, + "rewards/margins": -0.7776358127593994, + "rewards/rejected": 3.228668212890625, + "step": 8254 + }, + { + "epoch": 1.34, + "learning_rate": 2.595056155403063e-06, + "logits/chosen": -1.382813572883606, + "logits/rejected": -1.3458236455917358, + "logps/chosen": -51.15782928466797, + "logps/rejected": -76.08724975585938, + "loss": 0.5039, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5477073192596436, + "rewards/margins": -0.5189018249511719, + "rewards/rejected": 3.0666091442108154, + "step": 8255 + }, + { + "epoch": 1.34, + "learning_rate": 2.593903996889782e-06, + "logits/chosen": -1.302072525024414, + "logits/rejected": -1.2733064889907837, + "logps/chosen": -67.11102294921875, + "logps/rejected": -48.03050994873047, + "loss": 0.4243, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3963851928710938, + "rewards/margins": 0.3252769708633423, + "rewards/rejected": 1.0711082220077515, + "step": 8256 + }, + { + "epoch": 1.34, + "learning_rate": 2.592752004614969e-06, + "logits/chosen": -1.208722472190857, + "logits/rejected": -1.1433324813842773, + "logps/chosen": -67.03669738769531, + "logps/rejected": -69.5827407836914, + "loss": 0.564, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7440086603164673, + "rewards/margins": -0.4064575433731079, + "rewards/rejected": 2.150466203689575, + "step": 8257 + }, + { + "epoch": 1.34, + "learning_rate": 2.5916001786582135e-06, + "logits/chosen": -1.4091908931732178, + "logits/rejected": -1.4603508710861206, + "logps/chosen": -50.449684143066406, + "logps/rejected": -70.68122863769531, + "loss": 1.5208, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6166809797286987, + "rewards/margins": -2.456002712249756, + "rewards/rejected": 4.072683811187744, + "step": 8258 + }, + { + "epoch": 1.34, + "learning_rate": 2.590448519099099e-06, + "logits/chosen": -1.1508612632751465, + "logits/rejected": -1.0583562850952148, + "logps/chosen": -57.58477783203125, + "logps/rejected": -40.316471099853516, + "loss": 0.401, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.335546970367432, + "rewards/margins": 0.07846355438232422, + "rewards/rejected": 4.257083415985107, + "step": 8259 + }, + { + "epoch": 1.34, + "learning_rate": 2.58929702601719e-06, + "logits/chosen": -1.0511773824691772, + "logits/rejected": -1.0511773824691772, + "logps/chosen": -0.6114261150360107, + "logps/rejected": -0.6114261150360107, + "loss": 0.4328, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.40191707015037537, + "rewards/margins": 0.0, + "rewards/rejected": 0.40191707015037537, + "step": 8260 + }, + { + "epoch": 1.34, + "learning_rate": 2.5881456994920488e-06, + "logits/chosen": -1.4186843633651733, + "logits/rejected": -1.2643908262252808, + "logps/chosen": -128.28207397460938, + "logps/rejected": -64.9196548461914, + "loss": 0.1353, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.30557107925415, + "rewards/margins": 1.1734304428100586, + "rewards/rejected": 4.132140636444092, + "step": 8261 + }, + { + "epoch": 1.34, + "learning_rate": 2.586994539603217e-06, + "logits/chosen": -1.4990715980529785, + "logits/rejected": -1.3306009769439697, + "logps/chosen": -147.70358276367188, + "logps/rejected": -41.12957000732422, + "loss": 0.1533, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.998225688934326, + "rewards/margins": 1.067124366760254, + "rewards/rejected": 3.9311013221740723, + "step": 8262 + }, + { + "epoch": 1.34, + "learning_rate": 2.585843546430232e-06, + "logits/chosen": -1.114484190940857, + "logits/rejected": -1.0697754621505737, + "logps/chosen": -60.555458068847656, + "logps/rejected": -74.24336242675781, + "loss": 0.1282, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8807625770568848, + "rewards/margins": 1.347048282623291, + "rewards/rejected": 1.5337142944335938, + "step": 8263 + }, + { + "epoch": 1.34, + "learning_rate": 2.584692720052613e-06, + "logits/chosen": -1.5192596912384033, + "logits/rejected": -1.6401013135910034, + "logps/chosen": -82.63822937011719, + "logps/rejected": -36.067657470703125, + "loss": 0.2168, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0803964138031006, + "rewards/margins": 2.812643527984619, + "rewards/rejected": 0.26775285601615906, + "step": 8264 + }, + { + "epoch": 1.34, + "learning_rate": 2.583542060549875e-06, + "logits/chosen": -0.9998533129692078, + "logits/rejected": -0.9822107553482056, + "logps/chosen": -58.76245880126953, + "logps/rejected": -79.60201263427734, + "loss": 1.5178, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3639473915100098, + "rewards/margins": 0.30678725242614746, + "rewards/rejected": 2.0571601390838623, + "step": 8265 + }, + { + "epoch": 1.34, + "learning_rate": 2.5823915680015136e-06, + "logits/chosen": -1.07919180393219, + "logits/rejected": -1.0213615894317627, + "logps/chosen": -58.341026306152344, + "logps/rejected": -48.28266143798828, + "loss": 0.757, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3205406665802002, + "rewards/margins": -1.2446174621582031, + "rewards/rejected": 2.5651581287384033, + "step": 8266 + }, + { + "epoch": 1.34, + "learning_rate": 2.5812412424870213e-06, + "logits/chosen": -1.3397531509399414, + "logits/rejected": -1.254052758216858, + "logps/chosen": -115.30928802490234, + "logps/rejected": -169.37615966796875, + "loss": 0.2667, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.301590919494629, + "rewards/margins": 0.35029029846191406, + "rewards/rejected": 8.951300621032715, + "step": 8267 + }, + { + "epoch": 1.34, + "learning_rate": 2.5800910840858717e-06, + "logits/chosen": -1.3103370666503906, + "logits/rejected": -1.1026523113250732, + "logps/chosen": -198.88755798339844, + "logps/rejected": -88.84133911132812, + "loss": 0.1028, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.465858459472656, + "rewards/margins": 1.735220193862915, + "rewards/rejected": 3.730638265609741, + "step": 8268 + }, + { + "epoch": 1.34, + "learning_rate": 2.5789410928775316e-06, + "logits/chosen": -1.3615179061889648, + "logits/rejected": -1.2415084838867188, + "logps/chosen": -163.85073852539062, + "logps/rejected": -60.891456604003906, + "loss": 0.7455, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.838555932044983, + "rewards/margins": -0.6189781427383423, + "rewards/rejected": 2.457534074783325, + "step": 8269 + }, + { + "epoch": 1.34, + "learning_rate": 2.5777912689414523e-06, + "logits/chosen": -0.8535012006759644, + "logits/rejected": -0.8535012006759644, + "logps/chosen": -32.06060791015625, + "logps/rejected": -32.06060791015625, + "loss": 0.5273, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0848660469055176, + "rewards/margins": 0.0, + "rewards/rejected": 2.0848660469055176, + "step": 8270 + }, + { + "epoch": 1.34, + "learning_rate": 2.5766416123570792e-06, + "logits/chosen": -1.1342368125915527, + "logits/rejected": -1.0581512451171875, + "logps/chosen": -42.554752349853516, + "logps/rejected": -12.381918907165527, + "loss": 0.3356, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0897655487060547, + "rewards/margins": 0.09925907850265503, + "rewards/rejected": 0.9905064702033997, + "step": 8271 + }, + { + "epoch": 1.34, + "learning_rate": 2.575492123203839e-06, + "logits/chosen": -1.2439483404159546, + "logits/rejected": -1.2206065654754639, + "logps/chosen": -44.79454803466797, + "logps/rejected": -108.52340698242188, + "loss": 0.7829, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6747264862060547, + "rewards/margins": -0.6251819133758545, + "rewards/rejected": 2.299908399581909, + "step": 8272 + }, + { + "epoch": 1.34, + "learning_rate": 2.574342801561153e-06, + "logits/chosen": -1.6155999898910522, + "logits/rejected": -1.4315203428268433, + "logps/chosen": -116.33177947998047, + "logps/rejected": -22.404741287231445, + "loss": 0.1235, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.8277106285095215, + "rewards/margins": 6.086097240447998, + "rewards/rejected": 0.7416133880615234, + "step": 8273 + }, + { + "epoch": 1.34, + "learning_rate": 2.573193647508426e-06, + "logits/chosen": -1.3549299240112305, + "logits/rejected": -1.356428623199463, + "logps/chosen": -73.99163818359375, + "logps/rejected": -40.94176483154297, + "loss": 0.1976, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4788339138031006, + "rewards/margins": 0.8829057216644287, + "rewards/rejected": 1.5959281921386719, + "step": 8274 + }, + { + "epoch": 1.34, + "learning_rate": 2.5720446611250583e-06, + "logits/chosen": -1.641310214996338, + "logits/rejected": -1.5495179891586304, + "logps/chosen": -115.68153381347656, + "logps/rejected": -88.79765319824219, + "loss": 0.0651, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.850130081176758, + "rewards/margins": 2.1781787872314453, + "rewards/rejected": 6.6719512939453125, + "step": 8275 + }, + { + "epoch": 1.34, + "learning_rate": 2.5708958424904285e-06, + "logits/chosen": -1.2221345901489258, + "logits/rejected": -1.2197141647338867, + "logps/chosen": -62.167869567871094, + "logps/rejected": -97.87130737304688, + "loss": 0.2432, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3723785877227783, + "rewards/margins": 1.5184502601623535, + "rewards/rejected": 0.8539283871650696, + "step": 8276 + }, + { + "epoch": 1.34, + "learning_rate": 2.5697471916839134e-06, + "logits/chosen": -0.8073027729988098, + "logits/rejected": -0.7786539196968079, + "logps/chosen": -72.97267150878906, + "logps/rejected": -74.56440734863281, + "loss": 0.6813, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4270600080490112, + "rewards/margins": -1.0663307905197144, + "rewards/rejected": 2.4933907985687256, + "step": 8277 + }, + { + "epoch": 1.34, + "learning_rate": 2.5685987087848694e-06, + "logits/chosen": -0.8618548512458801, + "logits/rejected": -0.8618548512458801, + "logps/chosen": -38.73395919799805, + "logps/rejected": -38.73395919799805, + "loss": 0.6965, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6067235469818115, + "rewards/margins": 0.0, + "rewards/rejected": 2.6067235469818115, + "step": 8278 + }, + { + "epoch": 1.34, + "learning_rate": 2.5674503938726493e-06, + "logits/chosen": -1.2171686887741089, + "logits/rejected": -1.2177869081497192, + "logps/chosen": -64.53388977050781, + "logps/rejected": -79.97535705566406, + "loss": 0.7194, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.163811445236206, + "rewards/margins": 0.10318279266357422, + "rewards/rejected": 2.060628652572632, + "step": 8279 + }, + { + "epoch": 1.34, + "learning_rate": 2.566302247026592e-06, + "logits/chosen": -1.0291869640350342, + "logits/rejected": -0.9812817573547363, + "logps/chosen": -52.395973205566406, + "logps/rejected": -1.422464370727539, + "loss": 0.5255, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.163340762257576, + "rewards/margins": -0.3749549984931946, + "rewards/rejected": 0.5382957458496094, + "step": 8280 + }, + { + "epoch": 1.34, + "learning_rate": 2.5651542683260193e-06, + "logits/chosen": -1.389618158340454, + "logits/rejected": -1.3568438291549683, + "logps/chosen": -80.67418670654297, + "logps/rejected": -87.55559539794922, + "loss": 0.2376, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.927239179611206, + "rewards/margins": 0.6000266075134277, + "rewards/rejected": 2.3272125720977783, + "step": 8281 + }, + { + "epoch": 1.34, + "learning_rate": 2.56400645785025e-06, + "logits/chosen": -0.9165042638778687, + "logits/rejected": -0.877305269241333, + "logps/chosen": -6.9664626121521, + "logps/rejected": -16.494234085083008, + "loss": 0.3014, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7460550665855408, + "rewards/margins": 0.2257421612739563, + "rewards/rejected": 0.5203129053115845, + "step": 8282 + }, + { + "epoch": 1.34, + "learning_rate": 2.5628588156785834e-06, + "logits/chosen": -1.1588391065597534, + "logits/rejected": -1.1588391065597534, + "logps/chosen": -73.61444091796875, + "logps/rejected": -73.61444091796875, + "loss": 0.3584, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6990578174591064, + "rewards/margins": 0.0, + "rewards/rejected": 3.6990578174591064, + "step": 8283 + }, + { + "epoch": 1.34, + "learning_rate": 2.5617113418903137e-06, + "logits/chosen": -1.4866136312484741, + "logits/rejected": -1.354842185974121, + "logps/chosen": -106.32112121582031, + "logps/rejected": -150.9254913330078, + "loss": 0.5094, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.79434061050415, + "rewards/margins": -0.509368896484375, + "rewards/rejected": 6.303709506988525, + "step": 8284 + }, + { + "epoch": 1.34, + "learning_rate": 2.5605640365647165e-06, + "logits/chosen": -1.449967861175537, + "logits/rejected": -1.421531319618225, + "logps/chosen": -84.14142608642578, + "logps/rejected": -63.35888671875, + "loss": 0.1657, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.123220920562744, + "rewards/margins": 0.9457671642303467, + "rewards/rejected": 2.1774537563323975, + "step": 8285 + }, + { + "epoch": 1.34, + "learning_rate": 2.559416899781065e-06, + "logits/chosen": -1.6447927951812744, + "logits/rejected": -1.6714363098144531, + "logps/chosen": -91.9472885131836, + "logps/rejected": -59.524593353271484, + "loss": 1.7032, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.339376926422119, + "rewards/margins": 0.6735272407531738, + "rewards/rejected": 2.6658496856689453, + "step": 8286 + }, + { + "epoch": 1.35, + "learning_rate": 2.5582699316186106e-06, + "logits/chosen": -1.3298183679580688, + "logits/rejected": -1.3587538003921509, + "logps/chosen": -54.9942741394043, + "logps/rejected": -99.78955841064453, + "loss": 0.458, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9227283000946045, + "rewards/margins": -0.33667802810668945, + "rewards/rejected": 3.259406328201294, + "step": 8287 + }, + { + "epoch": 1.35, + "learning_rate": 2.5571231321566017e-06, + "logits/chosen": -1.5193936824798584, + "logits/rejected": -1.6481895446777344, + "logps/chosen": -73.85484313964844, + "logps/rejected": -158.91143798828125, + "loss": 3.6181, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2053468227386475, + "rewards/margins": -7.229070663452148, + "rewards/rejected": 9.434417724609375, + "step": 8288 + }, + { + "epoch": 1.35, + "learning_rate": 2.5559765014742677e-06, + "logits/chosen": -1.0807100534439087, + "logits/rejected": -1.1089650392532349, + "logps/chosen": -85.93132781982422, + "logps/rejected": -99.90332794189453, + "loss": 0.4212, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0174691677093506, + "rewards/margins": -0.0693349838256836, + "rewards/rejected": 2.086804151535034, + "step": 8289 + }, + { + "epoch": 1.35, + "learning_rate": 2.554830039650834e-06, + "logits/chosen": -1.3996542692184448, + "logits/rejected": -1.3996542692184448, + "logps/chosen": -153.49215698242188, + "logps/rejected": -153.49215698242188, + "loss": 0.3476, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.766690254211426, + "rewards/margins": 0.0, + "rewards/rejected": 7.766690254211426, + "step": 8290 + }, + { + "epoch": 1.35, + "learning_rate": 2.5536837467655064e-06, + "logits/chosen": -0.8516774773597717, + "logits/rejected": -0.8251303434371948, + "logps/chosen": -48.58106231689453, + "logps/rejected": -100.41932678222656, + "loss": 0.5863, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4427589178085327, + "rewards/margins": 0.017693281173706055, + "rewards/rejected": 1.4250656366348267, + "step": 8291 + }, + { + "epoch": 1.35, + "learning_rate": 2.5525376228974865e-06, + "logits/chosen": -1.0684934854507446, + "logits/rejected": -1.0460495948791504, + "logps/chosen": -58.65865707397461, + "logps/rejected": -75.44085693359375, + "loss": 0.527, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5586140155792236, + "rewards/margins": -0.41625332832336426, + "rewards/rejected": 2.974867343902588, + "step": 8292 + }, + { + "epoch": 1.35, + "learning_rate": 2.5513916681259564e-06, + "logits/chosen": -1.159271240234375, + "logits/rejected": -1.1586358547210693, + "logps/chosen": -0.5393175482749939, + "logps/rejected": -5.014538288116455, + "loss": 0.443, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.394839346408844, + "rewards/margins": -0.03582039475440979, + "rewards/rejected": 0.4306597411632538, + "step": 8293 + }, + { + "epoch": 1.35, + "learning_rate": 2.5502458825300956e-06, + "logits/chosen": -1.2344934940338135, + "logits/rejected": -1.252479910850525, + "logps/chosen": -209.31724548339844, + "logps/rejected": -188.84051513671875, + "loss": 0.2331, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.15341329574585, + "rewards/margins": 0.8932723999023438, + "rewards/rejected": 5.260140895843506, + "step": 8294 + }, + { + "epoch": 1.35, + "learning_rate": 2.549100266189062e-06, + "logits/chosen": -1.1651955842971802, + "logits/rejected": -1.1296626329421997, + "logps/chosen": -101.51840209960938, + "logps/rejected": -75.90191650390625, + "loss": 0.094, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1024887561798096, + "rewards/margins": 1.6035370826721191, + "rewards/rejected": 0.4989517331123352, + "step": 8295 + }, + { + "epoch": 1.35, + "learning_rate": 2.547954819182012e-06, + "logits/chosen": -1.1840327978134155, + "logits/rejected": -1.0326991081237793, + "logps/chosen": -49.94324493408203, + "logps/rejected": -41.93840408325195, + "loss": 0.0584, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8944129943847656, + "rewards/margins": 2.527189254760742, + "rewards/rejected": 0.36722373962402344, + "step": 8296 + }, + { + "epoch": 1.35, + "learning_rate": 2.54680954158808e-06, + "logits/chosen": -1.2225415706634521, + "logits/rejected": -1.169313907623291, + "logps/chosen": -64.93204498291016, + "logps/rejected": -113.70384979248047, + "loss": 0.4143, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.066671848297119, + "rewards/margins": -0.24636006355285645, + "rewards/rejected": 3.3130319118499756, + "step": 8297 + }, + { + "epoch": 1.35, + "learning_rate": 2.545664433486399e-06, + "logits/chosen": -0.9493095278739929, + "logits/rejected": -0.8699697256088257, + "logps/chosen": -87.35399627685547, + "logps/rejected": -64.78690338134766, + "loss": 0.0362, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.284401893615723, + "rewards/margins": 2.658139228820801, + "rewards/rejected": 1.6262626647949219, + "step": 8298 + }, + { + "epoch": 1.35, + "learning_rate": 2.5445194949560797e-06, + "logits/chosen": -1.1333764791488647, + "logits/rejected": -1.0358095169067383, + "logps/chosen": -27.865215301513672, + "logps/rejected": -19.058107376098633, + "loss": 0.5308, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4709911346435547, + "rewards/margins": 1.1491817235946655, + "rewards/rejected": 0.3218093812465668, + "step": 8299 + }, + { + "epoch": 1.35, + "learning_rate": 2.543374726076232e-06, + "logits/chosen": -0.7348077297210693, + "logits/rejected": -0.7344717383384705, + "logps/chosen": -6.2345781326293945, + "logps/rejected": -2.3322269916534424, + "loss": 1.0766, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.011915874667465687, + "rewards/margins": -0.2658756971359253, + "rewards/rejected": 0.2539598345756531, + "step": 8300 + }, + { + "epoch": 1.35, + "learning_rate": 2.542230126925943e-06, + "logits/chosen": -1.244847297668457, + "logits/rejected": -1.2583507299423218, + "logps/chosen": -56.815887451171875, + "logps/rejected": -60.199790954589844, + "loss": 1.3066, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9306914806365967, + "rewards/margins": -1.3586175441741943, + "rewards/rejected": 4.289309024810791, + "step": 8301 + }, + { + "epoch": 1.35, + "learning_rate": 2.5410856975842996e-06, + "logits/chosen": -1.0503565073013306, + "logits/rejected": -1.1139705181121826, + "logps/chosen": -81.23033905029297, + "logps/rejected": -61.09037780761719, + "loss": 1.5368, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.315641164779663, + "rewards/margins": -2.941474676132202, + "rewards/rejected": 6.257115840911865, + "step": 8302 + }, + { + "epoch": 1.35, + "learning_rate": 2.5399414381303654e-06, + "logits/chosen": -1.4380615949630737, + "logits/rejected": -1.451608657836914, + "logps/chosen": -56.87993621826172, + "logps/rejected": -78.35549926757812, + "loss": 0.9615, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.917768955230713, + "rewards/margins": 0.9911758899688721, + "rewards/rejected": 1.9265930652618408, + "step": 8303 + }, + { + "epoch": 1.35, + "learning_rate": 2.538797348643203e-06, + "logits/chosen": -1.4273481369018555, + "logits/rejected": -1.4434367418289185, + "logps/chosen": -99.09321594238281, + "logps/rejected": -123.79312896728516, + "loss": 0.0883, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.62753438949585, + "rewards/margins": 1.6539173126220703, + "rewards/rejected": 4.973617076873779, + "step": 8304 + }, + { + "epoch": 1.35, + "learning_rate": 2.5376534292018533e-06, + "logits/chosen": -1.1007791757583618, + "logits/rejected": -1.1139464378356934, + "logps/chosen": -7.969857215881348, + "logps/rejected": -4.36760139465332, + "loss": 0.6143, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5899298787117004, + "rewards/margins": 0.28930264711380005, + "rewards/rejected": 0.3006272315979004, + "step": 8305 + }, + { + "epoch": 1.35, + "learning_rate": 2.536509679885355e-06, + "logits/chosen": -1.4361263513565063, + "logits/rejected": -1.3409149646759033, + "logps/chosen": -44.279945373535156, + "logps/rejected": -59.40087127685547, + "loss": 0.1822, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3073501586914062, + "rewards/margins": 0.8233840465545654, + "rewards/rejected": 1.4839661121368408, + "step": 8306 + }, + { + "epoch": 1.35, + "learning_rate": 2.535366100772726e-06, + "logits/chosen": -1.362188696861267, + "logits/rejected": -1.3548660278320312, + "logps/chosen": -88.8276138305664, + "logps/rejected": -63.926326751708984, + "loss": 0.437, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3939545154571533, + "rewards/margins": 0.0906977653503418, + "rewards/rejected": 3.3032567501068115, + "step": 8307 + }, + { + "epoch": 1.35, + "learning_rate": 2.5342226919429807e-06, + "logits/chosen": -1.7559210062026978, + "logits/rejected": -1.5745795965194702, + "logps/chosen": -140.33160400390625, + "logps/rejected": -97.9212875366211, + "loss": 0.0486, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.044691562652588, + "rewards/margins": 2.4733972549438477, + "rewards/rejected": 4.57129430770874, + "step": 8308 + }, + { + "epoch": 1.35, + "learning_rate": 2.533079453475114e-06, + "logits/chosen": -1.1242244243621826, + "logits/rejected": -0.9954364895820618, + "logps/chosen": -68.8801040649414, + "logps/rejected": -100.21710968017578, + "loss": 0.164, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.227881908416748, + "rewards/margins": 1.2156634330749512, + "rewards/rejected": 3.012218475341797, + "step": 8309 + }, + { + "epoch": 1.35, + "learning_rate": 2.5319363854481173e-06, + "logits/chosen": -0.9096994996070862, + "logits/rejected": -0.9274923205375671, + "logps/chosen": -59.38739013671875, + "logps/rejected": -94.29383850097656, + "loss": 0.7421, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8563034534454346, + "rewards/margins": -0.9377777576446533, + "rewards/rejected": 2.794081211090088, + "step": 8310 + }, + { + "epoch": 1.35, + "learning_rate": 2.5307934879409606e-06, + "logits/chosen": -1.4431500434875488, + "logits/rejected": -1.322774887084961, + "logps/chosen": -118.30490112304688, + "logps/rejected": -79.57186126708984, + "loss": 0.2124, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.98140287399292, + "rewards/margins": 1.3743007183074951, + "rewards/rejected": 3.607102155685425, + "step": 8311 + }, + { + "epoch": 1.35, + "learning_rate": 2.5296507610326126e-06, + "logits/chosen": -1.423284649848938, + "logits/rejected": -1.4581235647201538, + "logps/chosen": -95.43899536132812, + "logps/rejected": -163.51319885253906, + "loss": 0.3873, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.462626934051514, + "rewards/margins": -0.091949462890625, + "rewards/rejected": 7.554576396942139, + "step": 8312 + }, + { + "epoch": 1.35, + "learning_rate": 2.5285082048020205e-06, + "logits/chosen": -1.2673001289367676, + "logits/rejected": -1.2380859851837158, + "logps/chosen": -71.69451904296875, + "logps/rejected": -105.86943054199219, + "loss": 0.4631, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.96038818359375, + "rewards/margins": -0.09252548217773438, + "rewards/rejected": 2.0529136657714844, + "step": 8313 + }, + { + "epoch": 1.35, + "learning_rate": 2.5273658193281252e-06, + "logits/chosen": -0.9840260148048401, + "logits/rejected": -0.9822936058044434, + "logps/chosen": -2.644469738006592, + "logps/rejected": -3.6482977867126465, + "loss": 0.4703, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2889818847179413, + "rewards/margins": 0.055934056639671326, + "rewards/rejected": 0.23304782807826996, + "step": 8314 + }, + { + "epoch": 1.35, + "learning_rate": 2.526223604689858e-06, + "logits/chosen": -1.166576862335205, + "logits/rejected": -1.336033582687378, + "logps/chosen": -53.316349029541016, + "logps/rejected": -100.072509765625, + "loss": 2.1881, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7259509563446045, + "rewards/margins": -4.329368591308594, + "rewards/rejected": 7.055319309234619, + "step": 8315 + }, + { + "epoch": 1.35, + "learning_rate": 2.5250815609661306e-06, + "logits/chosen": -1.329763412475586, + "logits/rejected": -1.3316034078598022, + "logps/chosen": -85.64610290527344, + "logps/rejected": -99.64108276367188, + "loss": 0.1721, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.004786968231201, + "rewards/margins": 1.0171358585357666, + "rewards/rejected": 2.9876511096954346, + "step": 8316 + }, + { + "epoch": 1.35, + "learning_rate": 2.5239396882358513e-06, + "logits/chosen": -0.934627890586853, + "logits/rejected": -0.8826383948326111, + "logps/chosen": -74.33358764648438, + "logps/rejected": -63.38220977783203, + "loss": 0.2182, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9062340259552, + "rewards/margins": 1.3615134954452515, + "rewards/rejected": 1.5447205305099487, + "step": 8317 + }, + { + "epoch": 1.35, + "learning_rate": 2.522797986577909e-06, + "logits/chosen": -1.1309542655944824, + "logits/rejected": -1.0035033226013184, + "logps/chosen": -67.10565948486328, + "logps/rejected": -36.786930084228516, + "loss": 0.7451, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3739250898361206, + "rewards/margins": -0.7138453722000122, + "rewards/rejected": 2.087770462036133, + "step": 8318 + }, + { + "epoch": 1.35, + "learning_rate": 2.521656456071188e-06, + "logits/chosen": -1.1443796157836914, + "logits/rejected": -1.2450189590454102, + "logps/chosen": -114.51375579833984, + "logps/rejected": -92.42161560058594, + "loss": 1.9579, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8521416187286377, + "rewards/margins": -2.976513147354126, + "rewards/rejected": 4.828654766082764, + "step": 8319 + }, + { + "epoch": 1.35, + "learning_rate": 2.520515096794554e-06, + "logits/chosen": -1.1453418731689453, + "logits/rejected": -1.2018698453903198, + "logps/chosen": -79.21186828613281, + "logps/rejected": -161.05258178710938, + "loss": 0.2498, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.1166276931762695, + "rewards/margins": 0.4393172264099121, + "rewards/rejected": 5.677310466766357, + "step": 8320 + }, + { + "epoch": 1.35, + "learning_rate": 2.519373908826869e-06, + "logits/chosen": -0.9476609230041504, + "logits/rejected": -0.9283546209335327, + "logps/chosen": -58.880775451660156, + "logps/rejected": -52.10698699951172, + "loss": 0.5884, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2317605018615723, + "rewards/margins": 0.001458883285522461, + "rewards/rejected": 2.23030161857605, + "step": 8321 + }, + { + "epoch": 1.35, + "learning_rate": 2.518232892246972e-06, + "logits/chosen": -0.8623020052909851, + "logits/rejected": -0.8623020052909851, + "logps/chosen": -16.900529861450195, + "logps/rejected": -16.900529861450195, + "loss": 0.3847, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1039285659790039, + "rewards/margins": 0.0, + "rewards/rejected": 0.1039285659790039, + "step": 8322 + }, + { + "epoch": 1.35, + "learning_rate": 2.5170920471337012e-06, + "logits/chosen": -1.4366095066070557, + "logits/rejected": -1.455666422843933, + "logps/chosen": -67.03007507324219, + "logps/rejected": -100.60974884033203, + "loss": 1.0623, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.317413330078125, + "rewards/margins": -1.7270622253417969, + "rewards/rejected": 5.044475555419922, + "step": 8323 + }, + { + "epoch": 1.35, + "learning_rate": 2.5159513735658737e-06, + "logits/chosen": -1.2674179077148438, + "logits/rejected": -1.2018532752990723, + "logps/chosen": -65.87910461425781, + "logps/rejected": -41.119571685791016, + "loss": 0.287, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2898032665252686, + "rewards/margins": 1.3655413389205933, + "rewards/rejected": 1.9242619276046753, + "step": 8324 + }, + { + "epoch": 1.35, + "learning_rate": 2.5148108716223047e-06, + "logits/chosen": -1.140430212020874, + "logits/rejected": -1.1611329317092896, + "logps/chosen": -53.698699951171875, + "logps/rejected": -51.516212463378906, + "loss": 0.3601, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0016555786132812, + "rewards/margins": 0.6066412925720215, + "rewards/rejected": 2.3950142860412598, + "step": 8325 + }, + { + "epoch": 1.35, + "learning_rate": 2.5136705413817873e-06, + "logits/chosen": -1.0766817331314087, + "logits/rejected": -1.0610169172286987, + "logps/chosen": -40.551971435546875, + "logps/rejected": -14.023417472839355, + "loss": 0.2942, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.07793390750885, + "rewards/margins": 0.2610163688659668, + "rewards/rejected": 0.8169175386428833, + "step": 8326 + }, + { + "epoch": 1.35, + "learning_rate": 2.5125303829231117e-06, + "logits/chosen": -1.2172951698303223, + "logits/rejected": -1.2096319198608398, + "logps/chosen": -44.21678924560547, + "logps/rejected": -83.35137939453125, + "loss": 0.1297, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.088904619216919, + "rewards/margins": 1.9849449396133423, + "rewards/rejected": 0.10395965725183487, + "step": 8327 + }, + { + "epoch": 1.35, + "learning_rate": 2.5113903963250473e-06, + "logits/chosen": -1.564963698387146, + "logits/rejected": -1.6387770175933838, + "logps/chosen": -114.00619506835938, + "logps/rejected": -78.80480194091797, + "loss": 0.2469, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.653419494628906, + "rewards/margins": 0.8188729286193848, + "rewards/rejected": 5.8345465660095215, + "step": 8328 + }, + { + "epoch": 1.35, + "learning_rate": 2.5102505816663618e-06, + "logits/chosen": -1.297139048576355, + "logits/rejected": -1.365496039390564, + "logps/chosen": -62.19234848022461, + "logps/rejected": -44.803382873535156, + "loss": 0.7201, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1936802864074707, + "rewards/margins": -1.1686267852783203, + "rewards/rejected": 3.362307071685791, + "step": 8329 + }, + { + "epoch": 1.35, + "learning_rate": 2.5091109390258004e-06, + "logits/chosen": -1.2473217248916626, + "logits/rejected": -1.2144722938537598, + "logps/chosen": -76.65989685058594, + "logps/rejected": -71.79704284667969, + "loss": 0.028, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.183265686035156, + "rewards/margins": 2.868853807449341, + "rewards/rejected": 2.3144118785858154, + "step": 8330 + }, + { + "epoch": 1.35, + "learning_rate": 2.5079714684821064e-06, + "logits/chosen": -1.2474265098571777, + "logits/rejected": -1.2573189735412598, + "logps/chosen": -116.5346908569336, + "logps/rejected": -137.253173828125, + "loss": 0.9934, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.046803951263428, + "rewards/margins": -1.781205177307129, + "rewards/rejected": 6.828009128570557, + "step": 8331 + }, + { + "epoch": 1.35, + "learning_rate": 2.5068321701140025e-06, + "logits/chosen": -1.4305819272994995, + "logits/rejected": -1.4526454210281372, + "logps/chosen": -76.18646240234375, + "logps/rejected": -113.13211059570312, + "loss": 0.829, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.059823751449585, + "rewards/margins": -1.099186658859253, + "rewards/rejected": 3.159010410308838, + "step": 8332 + }, + { + "epoch": 1.35, + "learning_rate": 2.5056930440002047e-06, + "logits/chosen": -1.4285743236541748, + "logits/rejected": -1.0964995622634888, + "logps/chosen": -144.7856903076172, + "logps/rejected": -89.85346984863281, + "loss": 0.1535, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.40756368637085, + "rewards/margins": 3.123599052429199, + "rewards/rejected": 4.28396463394165, + "step": 8333 + }, + { + "epoch": 1.35, + "learning_rate": 2.504554090219418e-06, + "logits/chosen": -1.3418806791305542, + "logits/rejected": -1.4106709957122803, + "logps/chosen": -152.13018798828125, + "logps/rejected": -11.200678825378418, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.929896831512451, + "rewards/margins": 5.071861743927002, + "rewards/rejected": 0.8580352067947388, + "step": 8334 + }, + { + "epoch": 1.35, + "learning_rate": 2.50341530885033e-06, + "logits/chosen": -1.8038440942764282, + "logits/rejected": -1.7865843772888184, + "logps/chosen": -221.94464111328125, + "logps/rejected": -22.269546508789062, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.981152534484863, + "rewards/margins": 4.93938684463501, + "rewards/rejected": 0.041765596717596054, + "step": 8335 + }, + { + "epoch": 1.35, + "learning_rate": 2.502276699971623e-06, + "logits/chosen": -1.667464256286621, + "logits/rejected": -1.6461215019226074, + "logps/chosen": -113.24238586425781, + "logps/rejected": -28.897289276123047, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.121013164520264, + "rewards/margins": 3.7413039207458496, + "rewards/rejected": 1.379709243774414, + "step": 8336 + }, + { + "epoch": 1.35, + "learning_rate": 2.501138263661961e-06, + "logits/chosen": -1.6954573392868042, + "logits/rejected": -1.6614174842834473, + "logps/chosen": -68.57058715820312, + "logps/rejected": -13.332995414733887, + "loss": 1.4286, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8748092651367188, + "rewards/margins": 2.027477979660034, + "rewards/rejected": 0.8473313450813293, + "step": 8337 + }, + { + "epoch": 1.35, + "learning_rate": 2.5000000000000015e-06, + "logits/chosen": -1.3789308071136475, + "logits/rejected": -1.2458338737487793, + "logps/chosen": -70.94036102294922, + "logps/rejected": -39.775184631347656, + "loss": 0.0246, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.537909030914307, + "rewards/margins": 3.4682724475860596, + "rewards/rejected": 2.069636583328247, + "step": 8338 + }, + { + "epoch": 1.35, + "learning_rate": 2.498861909064385e-06, + "logits/chosen": -1.3956304788589478, + "logits/rejected": -1.3307774066925049, + "logps/chosen": -114.17015075683594, + "logps/rejected": -81.39442443847656, + "loss": 0.8591, + "rewards/accuracies": 0.0, + "rewards/chosen": 8.041403770446777, + "rewards/margins": -0.4292030334472656, + "rewards/rejected": 8.470606803894043, + "step": 8339 + }, + { + "epoch": 1.35, + "learning_rate": 2.4977239909337465e-06, + "logits/chosen": -1.1688337326049805, + "logits/rejected": -1.1450845003128052, + "logps/chosen": -31.752744674682617, + "logps/rejected": -26.055458068847656, + "loss": 0.2221, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7149927616119385, + "rewards/margins": 0.6972265243530273, + "rewards/rejected": 2.017766237258911, + "step": 8340 + }, + { + "epoch": 1.35, + "learning_rate": 2.496586245686702e-06, + "logits/chosen": -1.2286325693130493, + "logits/rejected": -1.2721024751663208, + "logps/chosen": -57.872657775878906, + "logps/rejected": -62.800392150878906, + "loss": 3.4082, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2896111011505127, + "rewards/margins": -4.1287841796875, + "rewards/rejected": 7.418395519256592, + "step": 8341 + }, + { + "epoch": 1.35, + "learning_rate": 2.4954486734018618e-06, + "logits/chosen": -1.5292284488677979, + "logits/rejected": -1.5292284488677979, + "logps/chosen": -64.90394592285156, + "logps/rejected": -64.90394592285156, + "loss": 0.4394, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8366653323173523, + "rewards/margins": 0.0, + "rewards/rejected": 0.8366653323173523, + "step": 8342 + }, + { + "epoch": 1.35, + "learning_rate": 2.4943112741578183e-06, + "logits/chosen": -0.6114632487297058, + "logits/rejected": -0.5802863836288452, + "logps/chosen": -6.010540008544922, + "logps/rejected": -8.971683502197266, + "loss": 0.6671, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5975915789604187, + "rewards/margins": -0.4563303589820862, + "rewards/rejected": 1.0539219379425049, + "step": 8343 + }, + { + "epoch": 1.35, + "learning_rate": 2.493174048033159e-06, + "logits/chosen": -0.8383442759513855, + "logits/rejected": -0.8471035361289978, + "logps/chosen": -2.4963557720184326, + "logps/rejected": -0.8760992288589478, + "loss": 0.581, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3663789927959442, + "rewards/margins": 0.05934569239616394, + "rewards/rejected": 0.3070333003997803, + "step": 8344 + }, + { + "epoch": 1.35, + "learning_rate": 2.492036995106451e-06, + "logits/chosen": -1.320494294166565, + "logits/rejected": -1.440198302268982, + "logps/chosen": -58.91847229003906, + "logps/rejected": -79.21479797363281, + "loss": 1.3216, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.197325229644775, + "rewards/margins": -1.5314621925354004, + "rewards/rejected": 5.728787422180176, + "step": 8345 + }, + { + "epoch": 1.35, + "learning_rate": 2.490900115456258e-06, + "logits/chosen": -0.9045721292495728, + "logits/rejected": -0.8857132792472839, + "logps/chosen": -58.167694091796875, + "logps/rejected": -106.09595489501953, + "loss": 0.1784, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.709845781326294, + "rewards/margins": 1.2718071937561035, + "rewards/rejected": 0.4380386471748352, + "step": 8346 + }, + { + "epoch": 1.35, + "learning_rate": 2.4897634091611235e-06, + "logits/chosen": -1.330215573310852, + "logits/rejected": -1.2746185064315796, + "logps/chosen": -44.80817413330078, + "logps/rejected": -41.331722259521484, + "loss": 0.5074, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.175649404525757, + "rewards/margins": -0.5395631790161133, + "rewards/rejected": 2.71521258354187, + "step": 8347 + }, + { + "epoch": 1.35, + "learning_rate": 2.4886268762995874e-06, + "logits/chosen": -1.1315913200378418, + "logits/rejected": -1.1357094049453735, + "logps/chosen": -5.65755558013916, + "logps/rejected": -3.626610279083252, + "loss": 1.9539, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.10131873935461044, + "rewards/margins": -0.1711132526397705, + "rewards/rejected": 0.27243199944496155, + "step": 8348 + }, + { + "epoch": 1.36, + "learning_rate": 2.4874905169501696e-06, + "logits/chosen": -0.91981440782547, + "logits/rejected": -0.7882462739944458, + "logps/chosen": -46.47294616699219, + "logps/rejected": -36.32503890991211, + "loss": 0.2362, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1608555316925049, + "rewards/margins": 1.4018181562423706, + "rewards/rejected": -0.24096260964870453, + "step": 8349 + }, + { + "epoch": 1.36, + "learning_rate": 2.486354331191385e-06, + "logits/chosen": -1.0839260816574097, + "logits/rejected": -1.0646412372589111, + "logps/chosen": -53.500450134277344, + "logps/rejected": -47.697723388671875, + "loss": 0.2372, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1647156476974487, + "rewards/margins": 0.9044137001037598, + "rewards/rejected": 0.26030197739601135, + "step": 8350 + }, + { + "epoch": 1.36, + "learning_rate": 2.4852183191017304e-06, + "logits/chosen": -0.7693675756454468, + "logits/rejected": -0.874586284160614, + "logps/chosen": -86.07495880126953, + "logps/rejected": -100.01319885253906, + "loss": 1.7939, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5537643432617188, + "rewards/margins": -2.8094420433044434, + "rewards/rejected": 5.363206386566162, + "step": 8351 + }, + { + "epoch": 1.36, + "learning_rate": 2.4840824807596963e-06, + "logits/chosen": -1.3117213249206543, + "logits/rejected": -1.2787854671478271, + "logps/chosen": -32.28618621826172, + "logps/rejected": -28.419326782226562, + "loss": 0.3962, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3400520086288452, + "rewards/margins": -0.029042482376098633, + "rewards/rejected": 1.3690944910049438, + "step": 8352 + }, + { + "epoch": 1.36, + "learning_rate": 2.4829468162437553e-06, + "logits/chosen": -1.3302574157714844, + "logits/rejected": -1.312753438949585, + "logps/chosen": -91.16910552978516, + "logps/rejected": -61.4459228515625, + "loss": 2.0706, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.605913519859314, + "rewards/margins": -4.12302303314209, + "rewards/rejected": 5.728936672210693, + "step": 8353 + }, + { + "epoch": 1.36, + "learning_rate": 2.4818113256323745e-06, + "logits/chosen": -1.1307374238967896, + "logits/rejected": -1.1473379135131836, + "logps/chosen": -59.79857635498047, + "logps/rejected": -58.233951568603516, + "loss": 0.8918, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0353065729141235, + "rewards/margins": -1.1317378282546997, + "rewards/rejected": 2.1670444011688232, + "step": 8354 + }, + { + "epoch": 1.36, + "learning_rate": 2.480676009004002e-06, + "logits/chosen": -1.1652363538742065, + "logits/rejected": -1.1237719058990479, + "logps/chosen": -85.37454223632812, + "logps/rejected": -80.40486145019531, + "loss": 0.2965, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.674896240234375, + "rewards/margins": 0.24125337600708008, + "rewards/rejected": 4.433642864227295, + "step": 8355 + }, + { + "epoch": 1.36, + "learning_rate": 2.4795408664370812e-06, + "logits/chosen": -1.4963176250457764, + "logits/rejected": -1.4717131853103638, + "logps/chosen": -61.58985900878906, + "logps/rejected": -64.84567260742188, + "loss": 0.3092, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5866119861602783, + "rewards/margins": 1.3201050758361816, + "rewards/rejected": 2.2665069103240967, + "step": 8356 + }, + { + "epoch": 1.36, + "learning_rate": 2.4784058980100355e-06, + "logits/chosen": -1.6332755088806152, + "logits/rejected": -1.5792489051818848, + "logps/chosen": -77.15560150146484, + "logps/rejected": -38.615299224853516, + "loss": 0.326, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0348503589630127, + "rewards/margins": 0.38950300216674805, + "rewards/rejected": 2.6453473567962646, + "step": 8357 + }, + { + "epoch": 1.36, + "learning_rate": 2.4772711038012846e-06, + "logits/chosen": -1.1040388345718384, + "logits/rejected": -1.145814299583435, + "logps/chosen": -53.18004608154297, + "logps/rejected": -82.87593078613281, + "loss": 0.4337, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9308876991271973, + "rewards/margins": 1.6583199501037598, + "rewards/rejected": 1.2725677490234375, + "step": 8358 + }, + { + "epoch": 1.36, + "learning_rate": 2.476136483889229e-06, + "logits/chosen": -1.2049894332885742, + "logits/rejected": -1.041351556777954, + "logps/chosen": -97.01742553710938, + "logps/rejected": -70.5921859741211, + "loss": 0.0364, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.95871114730835, + "rewards/margins": 2.6429219245910645, + "rewards/rejected": 4.315789222717285, + "step": 8359 + }, + { + "epoch": 1.36, + "learning_rate": 2.4750020383522624e-06, + "logits/chosen": -1.2509536743164062, + "logits/rejected": -1.231095314025879, + "logps/chosen": -34.30335235595703, + "logps/rejected": -56.00389099121094, + "loss": 0.4629, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.390225648880005, + "rewards/margins": -0.3610720634460449, + "rewards/rejected": 2.75129771232605, + "step": 8360 + }, + { + "epoch": 1.36, + "learning_rate": 2.4738677672687617e-06, + "logits/chosen": -0.9741929173469543, + "logits/rejected": -0.9252168536186218, + "logps/chosen": -44.09648132324219, + "logps/rejected": -17.214582443237305, + "loss": 0.5282, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9889179468154907, + "rewards/margins": 0.42845386266708374, + "rewards/rejected": 0.560464084148407, + "step": 8361 + }, + { + "epoch": 1.36, + "learning_rate": 2.4727336707170973e-06, + "logits/chosen": -0.9116990566253662, + "logits/rejected": -0.9116990566253662, + "logps/chosen": -42.014137268066406, + "logps/rejected": -42.014137268066406, + "loss": 1.2317, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.19427262246608734, + "rewards/margins": 0.0, + "rewards/rejected": 0.19427262246608734, + "step": 8362 + }, + { + "epoch": 1.36, + "learning_rate": 2.4715997487756214e-06, + "logits/chosen": -1.3810820579528809, + "logits/rejected": -1.3399505615234375, + "logps/chosen": -89.90023803710938, + "logps/rejected": -64.60610961914062, + "loss": 1.4075, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.727522373199463, + "rewards/margins": -1.3463544845581055, + "rewards/rejected": 7.073876857757568, + "step": 8363 + }, + { + "epoch": 1.36, + "learning_rate": 2.4704660015226813e-06, + "logits/chosen": -0.8729323148727417, + "logits/rejected": -0.8911654353141785, + "logps/chosen": -108.814453125, + "logps/rejected": -51.414459228515625, + "loss": 2.8613, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3286583125591278, + "rewards/margins": -4.489415168762207, + "rewards/rejected": 4.818073272705078, + "step": 8364 + }, + { + "epoch": 1.36, + "learning_rate": 2.4693324290366033e-06, + "logits/chosen": -1.4630630016326904, + "logits/rejected": -1.4809653759002686, + "logps/chosen": -90.66324615478516, + "logps/rejected": -121.22752380371094, + "loss": 0.2598, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7423064708709717, + "rewards/margins": 0.39318370819091797, + "rewards/rejected": 2.3491227626800537, + "step": 8365 + }, + { + "epoch": 1.36, + "learning_rate": 2.468199031395711e-06, + "logits/chosen": -1.4762643575668335, + "logits/rejected": -1.3423128128051758, + "logps/chosen": -154.77682495117188, + "logps/rejected": -23.76405143737793, + "loss": 0.0812, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.595965623855591, + "rewards/margins": 1.7466219663619995, + "rewards/rejected": 1.8493436574935913, + "step": 8366 + }, + { + "epoch": 1.36, + "learning_rate": 2.467065808678308e-06, + "logits/chosen": -1.1552786827087402, + "logits/rejected": -1.1373882293701172, + "logps/chosen": -53.972740173339844, + "logps/rejected": -43.68665313720703, + "loss": 0.627, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.591973900794983, + "rewards/margins": -0.7948523759841919, + "rewards/rejected": 2.386826276779175, + "step": 8367 + }, + { + "epoch": 1.36, + "learning_rate": 2.465932760962692e-06, + "logits/chosen": -1.0685495138168335, + "logits/rejected": -1.0695911645889282, + "logps/chosen": -87.30242156982422, + "logps/rejected": -57.061378479003906, + "loss": 0.6649, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2551201581954956, + "rewards/margins": -1.0195921659469604, + "rewards/rejected": 2.274712324142456, + "step": 8368 + }, + { + "epoch": 1.36, + "learning_rate": 2.464799888327143e-06, + "logits/chosen": -1.3182086944580078, + "logits/rejected": -1.383665680885315, + "logps/chosen": -149.89706420898438, + "logps/rejected": -152.52291870117188, + "loss": 0.6527, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.333795070648193, + "rewards/margins": -0.9874696731567383, + "rewards/rejected": 7.321264743804932, + "step": 8369 + }, + { + "epoch": 1.36, + "learning_rate": 2.4636671908499336e-06, + "logits/chosen": -1.2595982551574707, + "logits/rejected": -1.2898650169372559, + "logps/chosen": -78.30014038085938, + "logps/rejected": -118.64732360839844, + "loss": 1.3313, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.211759328842163, + "rewards/margins": -2.562079668045044, + "rewards/rejected": 5.773838996887207, + "step": 8370 + }, + { + "epoch": 1.36, + "learning_rate": 2.4625346686093243e-06, + "logits/chosen": -1.5950947999954224, + "logits/rejected": -1.6401197910308838, + "logps/chosen": -97.23226165771484, + "logps/rejected": -186.53907775878906, + "loss": 1.2394, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.671883583068848, + "rewards/margins": -2.372203826904297, + "rewards/rejected": 7.0440874099731445, + "step": 8371 + }, + { + "epoch": 1.36, + "learning_rate": 2.4614023216835574e-06, + "logits/chosen": -1.012872338294983, + "logits/rejected": -1.0117677450180054, + "logps/chosen": -26.607521057128906, + "logps/rejected": -67.40596771240234, + "loss": 1.3073, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.39567071199417114, + "rewards/margins": -0.1622897982597351, + "rewards/rejected": 0.5579605102539062, + "step": 8372 + }, + { + "epoch": 1.36, + "learning_rate": 2.4602701501508717e-06, + "logits/chosen": -1.2827880382537842, + "logits/rejected": -1.2138842344284058, + "logps/chosen": -107.50260925292969, + "logps/rejected": -57.06614685058594, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.5291337966918945, + "rewards/margins": 4.141456604003906, + "rewards/rejected": 0.38767701387405396, + "step": 8373 + }, + { + "epoch": 1.36, + "learning_rate": 2.459138154089486e-06, + "logits/chosen": -0.745175302028656, + "logits/rejected": -0.7620455622673035, + "logps/chosen": -4.068580627441406, + "logps/rejected": -20.874982833862305, + "loss": 0.4757, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.16086025536060333, + "rewards/margins": -0.29210585355758667, + "rewards/rejected": 0.4529661238193512, + "step": 8374 + }, + { + "epoch": 1.36, + "learning_rate": 2.4580063335776134e-06, + "logits/chosen": -1.2686532735824585, + "logits/rejected": -1.21626877784729, + "logps/chosen": -65.3438949584961, + "logps/rejected": -63.95881652832031, + "loss": 0.3067, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.284886837005615, + "rewards/margins": 0.1689443588256836, + "rewards/rejected": 4.115942478179932, + "step": 8375 + }, + { + "epoch": 1.36, + "learning_rate": 2.456874688693449e-06, + "logits/chosen": -1.2446010112762451, + "logits/rejected": -1.2512245178222656, + "logps/chosen": -54.907684326171875, + "logps/rejected": -61.92347717285156, + "loss": 0.6402, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4802582263946533, + "rewards/margins": -0.8442423343658447, + "rewards/rejected": 4.324500560760498, + "step": 8376 + }, + { + "epoch": 1.36, + "learning_rate": 2.455743219515182e-06, + "logits/chosen": -1.339656114578247, + "logits/rejected": -1.3536653518676758, + "logps/chosen": -35.01899719238281, + "logps/rejected": -110.34880828857422, + "loss": 0.1133, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1457817554473877, + "rewards/margins": 1.4482582807540894, + "rewards/rejected": -0.3024764955043793, + "step": 8377 + }, + { + "epoch": 1.36, + "learning_rate": 2.4546119261209826e-06, + "logits/chosen": -1.4480406045913696, + "logits/rejected": -1.4812231063842773, + "logps/chosen": -101.0042724609375, + "logps/rejected": -84.28924560546875, + "loss": 1.8956, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3659608364105225, + "rewards/margins": -0.784665584564209, + "rewards/rejected": 3.1506264209747314, + "step": 8378 + }, + { + "epoch": 1.36, + "learning_rate": 2.4534808085890167e-06, + "logits/chosen": -1.2812989950180054, + "logits/rejected": -1.2951794862747192, + "logps/chosen": -62.82666778564453, + "logps/rejected": -56.67543029785156, + "loss": 0.7934, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.135195255279541, + "rewards/margins": -1.355283260345459, + "rewards/rejected": 3.490478515625, + "step": 8379 + }, + { + "epoch": 1.36, + "learning_rate": 2.452349866997429e-06, + "logits/chosen": -1.1810102462768555, + "logits/rejected": -1.1807608604431152, + "logps/chosen": -2.0490951538085938, + "logps/rejected": -9.090164184570312, + "loss": 0.3719, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1965990513563156, + "rewards/margins": -0.062705859541893, + "rewards/rejected": 0.2593049108982086, + "step": 8380 + }, + { + "epoch": 1.36, + "learning_rate": 2.451219101424362e-06, + "logits/chosen": -1.3188321590423584, + "logits/rejected": -1.3543821573257446, + "logps/chosen": -79.1336898803711, + "logps/rejected": -80.89720153808594, + "loss": 1.3196, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1688783168792725, + "rewards/margins": -2.5509965419769287, + "rewards/rejected": 4.719874858856201, + "step": 8381 + }, + { + "epoch": 1.36, + "learning_rate": 2.450088511947936e-06, + "logits/chosen": -1.292404294013977, + "logits/rejected": -1.316908836364746, + "logps/chosen": -12.011372566223145, + "logps/rejected": -55.555484771728516, + "loss": 1.4247, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4199795722961426, + "rewards/margins": -0.6314475536346436, + "rewards/rejected": 1.0514271259307861, + "step": 8382 + }, + { + "epoch": 1.36, + "learning_rate": 2.4489580986462684e-06, + "logits/chosen": -0.9048559665679932, + "logits/rejected": -0.9312268495559692, + "logps/chosen": -68.34835815429688, + "logps/rejected": -54.40369415283203, + "loss": 0.3133, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.150566816329956, + "rewards/margins": 0.2168874740600586, + "rewards/rejected": 2.9336793422698975, + "step": 8383 + }, + { + "epoch": 1.36, + "learning_rate": 2.447827861597456e-06, + "logits/chosen": -1.361350417137146, + "logits/rejected": -1.3502800464630127, + "logps/chosen": -118.65605163574219, + "logps/rejected": -116.79842376708984, + "loss": 0.3015, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.621060371398926, + "rewards/margins": 0.2989344596862793, + "rewards/rejected": 6.3221259117126465, + "step": 8384 + }, + { + "epoch": 1.36, + "learning_rate": 2.446697800879591e-06, + "logits/chosen": -1.0208940505981445, + "logits/rejected": -0.8655126094818115, + "logps/chosen": -101.23737335205078, + "logps/rejected": -0.9948583841323853, + "loss": 0.4726, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.14942626655101776, + "rewards/margins": -0.40290510654449463, + "rewards/rejected": 0.5523313879966736, + "step": 8385 + }, + { + "epoch": 1.36, + "learning_rate": 2.4455679165707473e-06, + "logits/chosen": -1.1174489259719849, + "logits/rejected": -1.2117832899093628, + "logps/chosen": -47.89566421508789, + "logps/rejected": -93.68867492675781, + "loss": 1.2318, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1391537189483643, + "rewards/margins": -2.167116403579712, + "rewards/rejected": 4.306270122528076, + "step": 8386 + }, + { + "epoch": 1.36, + "learning_rate": 2.4444382087489914e-06, + "logits/chosen": -1.3035672903060913, + "logits/rejected": -0.9009597897529602, + "logps/chosen": -106.40028381347656, + "logps/rejected": -54.92706298828125, + "loss": 0.658, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.636009216308594, + "rewards/margins": 0.6236824989318848, + "rewards/rejected": 4.012326717376709, + "step": 8387 + }, + { + "epoch": 1.36, + "learning_rate": 2.443308677492373e-06, + "logits/chosen": -1.4654479026794434, + "logits/rejected": -1.4837510585784912, + "logps/chosen": -119.3446044921875, + "logps/rejected": -165.309326171875, + "loss": 0.1797, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.1995849609375, + "rewards/margins": 1.2375977039337158, + "rewards/rejected": 2.961987257003784, + "step": 8388 + }, + { + "epoch": 1.36, + "learning_rate": 2.4421793228789354e-06, + "logits/chosen": -1.399159550666809, + "logits/rejected": -1.159248948097229, + "logps/chosen": -91.73579406738281, + "logps/rejected": -67.9483871459961, + "loss": 0.0394, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.575131416320801, + "rewards/margins": 2.684206485748291, + "rewards/rejected": 1.8909248113632202, + "step": 8389 + }, + { + "epoch": 1.36, + "learning_rate": 2.4410501449867017e-06, + "logits/chosen": -1.3830623626708984, + "logits/rejected": -1.4065836668014526, + "logps/chosen": -92.31817626953125, + "logps/rejected": -73.28512573242188, + "loss": 0.3932, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.968182563781738, + "rewards/margins": 0.1219635009765625, + "rewards/rejected": 5.846219062805176, + "step": 8390 + }, + { + "epoch": 1.36, + "learning_rate": 2.4399211438936925e-06, + "logits/chosen": -1.1152328252792358, + "logits/rejected": -0.8152098059654236, + "logps/chosen": -82.77433776855469, + "logps/rejected": -26.632709503173828, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.798472881317139, + "rewards/margins": 4.930078029632568, + "rewards/rejected": -0.1316051483154297, + "step": 8391 + }, + { + "epoch": 1.36, + "learning_rate": 2.4387923196779063e-06, + "logits/chosen": -1.279445767402649, + "logits/rejected": -1.1432056427001953, + "logps/chosen": -56.42804718017578, + "logps/rejected": -22.164077758789062, + "loss": 0.1838, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7209625244140625, + "rewards/margins": 2.207702159881592, + "rewards/rejected": 0.5132603049278259, + "step": 8392 + }, + { + "epoch": 1.36, + "learning_rate": 2.4376636724173385e-06, + "logits/chosen": -1.4495936632156372, + "logits/rejected": -1.410051941871643, + "logps/chosen": -41.429222106933594, + "logps/rejected": -46.527732849121094, + "loss": 0.4505, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7865655422210693, + "rewards/margins": 0.12337040901184082, + "rewards/rejected": 2.6631951332092285, + "step": 8393 + }, + { + "epoch": 1.36, + "learning_rate": 2.4365352021899635e-06, + "logits/chosen": -0.9172390103340149, + "logits/rejected": -0.9172390103340149, + "logps/chosen": -65.54066467285156, + "logps/rejected": -65.54066467285156, + "loss": 0.4044, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4740982055664062, + "rewards/margins": 0.0, + "rewards/rejected": 2.4740982055664062, + "step": 8394 + }, + { + "epoch": 1.36, + "learning_rate": 2.4354069090737533e-06, + "logits/chosen": -0.8442531228065491, + "logits/rejected": -0.8442531228065491, + "logps/chosen": -18.110916137695312, + "logps/rejected": -18.110916137695312, + "loss": 1.0229, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3379993438720703, + "rewards/margins": 0.0, + "rewards/rejected": 1.3379993438720703, + "step": 8395 + }, + { + "epoch": 1.36, + "learning_rate": 2.434278793146656e-06, + "logits/chosen": -1.4581104516983032, + "logits/rejected": -1.2875837087631226, + "logps/chosen": -56.898895263671875, + "logps/rejected": -21.076393127441406, + "loss": 0.084, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.266366481781006, + "rewards/margins": 4.107411861419678, + "rewards/rejected": 0.15895462036132812, + "step": 8396 + }, + { + "epoch": 1.36, + "learning_rate": 2.43315085448662e-06, + "logits/chosen": -1.3865320682525635, + "logits/rejected": -1.1621118783950806, + "logps/chosen": -152.65744018554688, + "logps/rejected": -70.87553405761719, + "loss": 0.0945, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.570706367492676, + "rewards/margins": 1.8618669509887695, + "rewards/rejected": 3.7088394165039062, + "step": 8397 + }, + { + "epoch": 1.36, + "learning_rate": 2.43202309317157e-06, + "logits/chosen": -1.117143154144287, + "logits/rejected": -1.0202895402908325, + "logps/chosen": -55.499412536621094, + "logps/rejected": -58.44839859008789, + "loss": 0.9767, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.432025909423828, + "rewards/margins": 1.7647013664245605, + "rewards/rejected": 0.6673244833946228, + "step": 8398 + }, + { + "epoch": 1.36, + "learning_rate": 2.4308955092794273e-06, + "logits/chosen": -0.8494746685028076, + "logits/rejected": -0.7633694410324097, + "logps/chosen": -45.67057800292969, + "logps/rejected": -46.507476806640625, + "loss": 0.4975, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9841041564941406, + "rewards/margins": 0.6720321178436279, + "rewards/rejected": 1.3120720386505127, + "step": 8399 + }, + { + "epoch": 1.36, + "learning_rate": 2.4297681028880943e-06, + "logits/chosen": -1.428055763244629, + "logits/rejected": -1.3419071435928345, + "logps/chosen": -40.495609283447266, + "logps/rejected": -18.51026153564453, + "loss": 0.1988, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1138761043548584, + "rewards/margins": 1.564818263053894, + "rewards/rejected": 1.5490578413009644, + "step": 8400 + }, + { + "epoch": 1.36, + "learning_rate": 2.4286408740754674e-06, + "logits/chosen": -1.2392102479934692, + "logits/rejected": -1.2392102479934692, + "logps/chosen": -48.398170471191406, + "logps/rejected": -48.398170471191406, + "loss": 0.363, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.965874433517456, + "rewards/margins": 0.0, + "rewards/rejected": 3.965874433517456, + "step": 8401 + }, + { + "epoch": 1.36, + "learning_rate": 2.4275138229194238e-06, + "logits/chosen": -1.252498745918274, + "logits/rejected": -1.125548243522644, + "logps/chosen": -60.18607711791992, + "logps/rejected": -24.13967514038086, + "loss": 0.1078, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7646023035049438, + "rewards/margins": 2.0399250984191895, + "rewards/rejected": -0.27532273530960083, + "step": 8402 + }, + { + "epoch": 1.36, + "learning_rate": 2.4263869494978357e-06, + "logits/chosen": -1.4511820077896118, + "logits/rejected": -1.339106559753418, + "logps/chosen": -99.97787475585938, + "logps/rejected": -53.370906829833984, + "loss": 0.2211, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.148232936859131, + "rewards/margins": 1.8198390007019043, + "rewards/rejected": 4.328393936157227, + "step": 8403 + }, + { + "epoch": 1.36, + "learning_rate": 2.4252602538885562e-06, + "logits/chosen": -1.7429006099700928, + "logits/rejected": -1.7954258918762207, + "logps/chosen": -81.42424011230469, + "logps/rejected": -120.32783508300781, + "loss": 1.0209, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.994244337081909, + "rewards/margins": -1.5673110485076904, + "rewards/rejected": 4.5615553855896, + "step": 8404 + }, + { + "epoch": 1.36, + "learning_rate": 2.4241337361694305e-06, + "logits/chosen": -1.336670160293579, + "logits/rejected": -1.3241691589355469, + "logps/chosen": -7.844825744628906, + "logps/rejected": -1.195510983467102, + "loss": 0.3647, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.28788071870803833, + "rewards/margins": -0.06287583708763123, + "rewards/rejected": 0.35075655579566956, + "step": 8405 + }, + { + "epoch": 1.36, + "learning_rate": 2.4230073964182925e-06, + "logits/chosen": -1.3612834215164185, + "logits/rejected": -1.2799911499023438, + "logps/chosen": -75.88021850585938, + "logps/rejected": -26.781105041503906, + "loss": 0.8755, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7210052013397217, + "rewards/margins": 2.344536066055298, + "rewards/rejected": 0.37646904587745667, + "step": 8406 + }, + { + "epoch": 1.36, + "learning_rate": 2.4218812347129578e-06, + "logits/chosen": -1.4254693984985352, + "logits/rejected": -1.2926056385040283, + "logps/chosen": -127.614501953125, + "logps/rejected": -48.43028259277344, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.460626125335693, + "rewards/margins": 3.9656729698181152, + "rewards/rejected": 2.494953155517578, + "step": 8407 + }, + { + "epoch": 1.36, + "learning_rate": 2.420755251131237e-06, + "logits/chosen": -1.4931726455688477, + "logits/rejected": -1.4447780847549438, + "logps/chosen": -246.77120971679688, + "logps/rejected": -61.24785614013672, + "loss": 1.3586, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.058068752288818, + "rewards/margins": 2.443737506866455, + "rewards/rejected": 4.614331245422363, + "step": 8408 + }, + { + "epoch": 1.36, + "learning_rate": 2.419629445750922e-06, + "logits/chosen": -1.0459620952606201, + "logits/rejected": -1.0470781326293945, + "logps/chosen": -102.14695739746094, + "logps/rejected": -90.302734375, + "loss": 0.1251, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.522573947906494, + "rewards/margins": 2.43514347076416, + "rewards/rejected": 0.08743057399988174, + "step": 8409 + }, + { + "epoch": 1.37, + "learning_rate": 2.418503818649798e-06, + "logits/chosen": -1.2776999473571777, + "logits/rejected": -1.2735621929168701, + "logps/chosen": -115.1297836303711, + "logps/rejected": -53.70330047607422, + "loss": 0.2798, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6369667053222656, + "rewards/margins": 0.3185126781463623, + "rewards/rejected": 2.3184540271759033, + "step": 8410 + }, + { + "epoch": 1.37, + "learning_rate": 2.417378369905632e-06, + "logits/chosen": -1.179437279701233, + "logits/rejected": -1.179437279701233, + "logps/chosen": -39.099098205566406, + "logps/rejected": -39.099098205566406, + "loss": 0.8034, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.294088125228882, + "rewards/margins": 0.0, + "rewards/rejected": 3.294088125228882, + "step": 8411 + }, + { + "epoch": 1.37, + "learning_rate": 2.4162530995961853e-06, + "logits/chosen": -1.424111008644104, + "logits/rejected": -1.4873789548873901, + "logps/chosen": -144.43466186523438, + "logps/rejected": -124.60523986816406, + "loss": 0.4636, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.686913967132568, + "rewards/margins": -0.3695392608642578, + "rewards/rejected": 5.056453227996826, + "step": 8412 + }, + { + "epoch": 1.37, + "learning_rate": 2.4151280077992e-06, + "logits/chosen": -1.4741405248641968, + "logits/rejected": -1.4923540353775024, + "logps/chosen": -66.5822982788086, + "logps/rejected": -129.0876007080078, + "loss": 1.3584, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7213845252990723, + "rewards/margins": -0.4291846752166748, + "rewards/rejected": 3.150569200515747, + "step": 8413 + }, + { + "epoch": 1.37, + "learning_rate": 2.4140030945924137e-06, + "logits/chosen": -0.8183466196060181, + "logits/rejected": -0.8353911638259888, + "logps/chosen": -39.86049270629883, + "logps/rejected": -45.362220764160156, + "loss": 0.3107, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3264577388763428, + "rewards/margins": 0.15070772171020508, + "rewards/rejected": 2.1757500171661377, + "step": 8414 + }, + { + "epoch": 1.37, + "learning_rate": 2.4128783600535415e-06, + "logits/chosen": -1.258070945739746, + "logits/rejected": -1.2146129608154297, + "logps/chosen": -47.315792083740234, + "logps/rejected": -56.44981384277344, + "loss": 0.5367, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.877202272415161, + "rewards/margins": 0.7705173492431641, + "rewards/rejected": 2.106684923171997, + "step": 8415 + }, + { + "epoch": 1.37, + "learning_rate": 2.411753804260298e-06, + "logits/chosen": -1.2108618021011353, + "logits/rejected": -1.2706047296524048, + "logps/chosen": -37.912166595458984, + "logps/rejected": -90.46426391601562, + "loss": 1.3511, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5952374935150146, + "rewards/margins": -0.3429758548736572, + "rewards/rejected": 2.938213348388672, + "step": 8416 + }, + { + "epoch": 1.37, + "learning_rate": 2.410629427290374e-06, + "logits/chosen": -0.9808793663978577, + "logits/rejected": -1.0263251066207886, + "logps/chosen": -5.669346809387207, + "logps/rejected": -48.24483871459961, + "loss": 0.5386, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.38608551025390625, + "rewards/margins": -0.5411216616630554, + "rewards/rejected": 0.9272071719169617, + "step": 8417 + }, + { + "epoch": 1.37, + "learning_rate": 2.409505229221458e-06, + "logits/chosen": -1.7019494771957397, + "logits/rejected": -1.523492455482483, + "logps/chosen": -102.6297836303711, + "logps/rejected": -19.924091339111328, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.9945597648620605, + "rewards/margins": 5.285704135894775, + "rewards/rejected": 0.7088556289672852, + "step": 8418 + }, + { + "epoch": 1.37, + "learning_rate": 2.4083812101312163e-06, + "logits/chosen": -1.1511105298995972, + "logits/rejected": -1.0616135597229004, + "logps/chosen": -48.292083740234375, + "logps/rejected": -41.88124084472656, + "loss": 0.144, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.938580274581909, + "rewards/margins": 1.3568453788757324, + "rewards/rejected": 1.5817348957061768, + "step": 8419 + }, + { + "epoch": 1.37, + "learning_rate": 2.407257370097314e-06, + "logits/chosen": -1.2928402423858643, + "logits/rejected": -1.3106766939163208, + "logps/chosen": -65.05992126464844, + "logps/rejected": -90.14173889160156, + "loss": 0.4387, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.366697788238525, + "rewards/margins": -0.2719449996948242, + "rewards/rejected": 4.63864278793335, + "step": 8420 + }, + { + "epoch": 1.37, + "learning_rate": 2.406133709197392e-06, + "logits/chosen": -1.2996972799301147, + "logits/rejected": -1.4199944734573364, + "logps/chosen": -58.47930908203125, + "logps/rejected": -145.90170288085938, + "loss": 2.246, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9061249494552612, + "rewards/margins": -4.167488098144531, + "rewards/rejected": 6.073613166809082, + "step": 8421 + }, + { + "epoch": 1.37, + "learning_rate": 2.40501022750909e-06, + "logits/chosen": -1.1958894729614258, + "logits/rejected": -1.2055214643478394, + "logps/chosen": -55.731964111328125, + "logps/rejected": -92.78024291992188, + "loss": 1.2803, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0455864667892456, + "rewards/margins": -0.22491216659545898, + "rewards/rejected": 1.2704986333847046, + "step": 8422 + }, + { + "epoch": 1.37, + "learning_rate": 2.4038869251100244e-06, + "logits/chosen": -1.1095086336135864, + "logits/rejected": -1.1974756717681885, + "logps/chosen": -105.01641845703125, + "logps/rejected": -89.39053344726562, + "loss": 2.5066, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3352432250976562, + "rewards/margins": -4.965864658355713, + "rewards/rejected": 7.301107883453369, + "step": 8423 + }, + { + "epoch": 1.37, + "learning_rate": 2.40276380207781e-06, + "logits/chosen": -1.6464687585830688, + "logits/rejected": -1.6744636297225952, + "logps/chosen": -107.67891693115234, + "logps/rejected": -53.87970733642578, + "loss": 0.4365, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5356147289276123, + "rewards/margins": 1.860996961593628, + "rewards/rejected": 1.6746177673339844, + "step": 8424 + }, + { + "epoch": 1.37, + "learning_rate": 2.4016408584900397e-06, + "logits/chosen": -1.613795518875122, + "logits/rejected": -1.6214544773101807, + "logps/chosen": -115.13870239257812, + "logps/rejected": -107.2394790649414, + "loss": 0.5483, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.408221483230591, + "rewards/margins": 0.8524391651153564, + "rewards/rejected": 1.5557823181152344, + "step": 8425 + }, + { + "epoch": 1.37, + "learning_rate": 2.4005180944243016e-06, + "logits/chosen": -1.1975882053375244, + "logits/rejected": -0.9633520245552063, + "logps/chosen": -143.21063232421875, + "logps/rejected": -74.22203063964844, + "loss": 0.22, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.989117622375488, + "rewards/margins": 1.8185791969299316, + "rewards/rejected": 3.1705384254455566, + "step": 8426 + }, + { + "epoch": 1.37, + "learning_rate": 2.3993955099581674e-06, + "logits/chosen": -1.2818740606307983, + "logits/rejected": -1.284106969833374, + "logps/chosen": -68.76222229003906, + "logps/rejected": -58.429168701171875, + "loss": 0.3679, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.337027072906494, + "rewards/margins": -0.020879268646240234, + "rewards/rejected": 2.3579063415527344, + "step": 8427 + }, + { + "epoch": 1.37, + "learning_rate": 2.3982731051691937e-06, + "logits/chosen": -1.3842929601669312, + "logits/rejected": -1.3247102499008179, + "logps/chosen": -56.9406623840332, + "logps/rejected": -59.65166091918945, + "loss": 0.3538, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5537761449813843, + "rewards/margins": 0.4796614646911621, + "rewards/rejected": 1.0741146802902222, + "step": 8428 + }, + { + "epoch": 1.37, + "learning_rate": 2.397150880134933e-06, + "logits/chosen": -0.8909707069396973, + "logits/rejected": -0.874226987361908, + "logps/chosen": -13.646761894226074, + "logps/rejected": -16.260873794555664, + "loss": 0.74, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.825031578540802, + "rewards/margins": -0.029858708381652832, + "rewards/rejected": 0.8548902869224548, + "step": 8429 + }, + { + "epoch": 1.37, + "learning_rate": 2.3960288349329164e-06, + "logits/chosen": -1.343473196029663, + "logits/rejected": -1.2588894367218018, + "logps/chosen": -88.01327514648438, + "logps/rejected": -72.43344116210938, + "loss": 0.2772, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6750214099884033, + "rewards/margins": 0.3074219226837158, + "rewards/rejected": 2.3675994873046875, + "step": 8430 + }, + { + "epoch": 1.37, + "learning_rate": 2.39490696964067e-06, + "logits/chosen": -1.3609426021575928, + "logits/rejected": -1.4180058240890503, + "logps/chosen": -106.23954772949219, + "logps/rejected": -121.46444702148438, + "loss": 2.1389, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.647541880607605, + "rewards/margins": -0.11673116683959961, + "rewards/rejected": 1.7642730474472046, + "step": 8431 + }, + { + "epoch": 1.37, + "learning_rate": 2.3937852843357e-06, + "logits/chosen": -1.288947343826294, + "logits/rejected": -1.2532122135162354, + "logps/chosen": -83.0002212524414, + "logps/rejected": -40.912925720214844, + "loss": 0.2933, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0624306201934814, + "rewards/margins": 0.977806806564331, + "rewards/rejected": 2.0846238136291504, + "step": 8432 + }, + { + "epoch": 1.37, + "learning_rate": 2.392663779095509e-06, + "logits/chosen": -1.3251378536224365, + "logits/rejected": -1.2829498052597046, + "logps/chosen": -77.01216125488281, + "logps/rejected": -58.58661651611328, + "loss": 0.1753, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9772628545761108, + "rewards/margins": 0.8714476823806763, + "rewards/rejected": 1.1058151721954346, + "step": 8433 + }, + { + "epoch": 1.37, + "learning_rate": 2.391542453997578e-06, + "logits/chosen": -1.38986074924469, + "logits/rejected": -1.1216965913772583, + "logps/chosen": -161.65003967285156, + "logps/rejected": -22.548419952392578, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.262306213378906, + "rewards/margins": 5.750235557556152, + "rewards/rejected": 0.5120706558227539, + "step": 8434 + }, + { + "epoch": 1.37, + "learning_rate": 2.390421309119384e-06, + "logits/chosen": -1.3135498762130737, + "logits/rejected": -1.1560477018356323, + "logps/chosen": -121.43941497802734, + "logps/rejected": -40.843658447265625, + "loss": 0.096, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.833133697509766, + "rewards/margins": 1.722771167755127, + "rewards/rejected": 4.110362529754639, + "step": 8435 + }, + { + "epoch": 1.37, + "learning_rate": 2.389300344538383e-06, + "logits/chosen": -1.264129400253296, + "logits/rejected": -1.3006906509399414, + "logps/chosen": -54.53933334350586, + "logps/rejected": -81.18450927734375, + "loss": 0.6331, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7497646808624268, + "rewards/margins": -0.36449694633483887, + "rewards/rejected": 2.1142616271972656, + "step": 8436 + }, + { + "epoch": 1.37, + "learning_rate": 2.388179560332028e-06, + "logits/chosen": -0.9400432705879211, + "logits/rejected": -0.949819028377533, + "logps/chosen": -54.19561004638672, + "logps/rejected": -61.85723876953125, + "loss": 0.4389, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4173622131347656, + "rewards/margins": 1.5950629711151123, + "rewards/rejected": 1.8222992420196533, + "step": 8437 + }, + { + "epoch": 1.37, + "learning_rate": 2.3870589565777503e-06, + "logits/chosen": -0.9134849309921265, + "logits/rejected": -0.9134849309921265, + "logps/chosen": -22.706398010253906, + "logps/rejected": -22.706398010253906, + "loss": 0.3466, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.756421685218811, + "rewards/margins": 0.0, + "rewards/rejected": 1.756421685218811, + "step": 8438 + }, + { + "epoch": 1.37, + "learning_rate": 2.3859385333529766e-06, + "logits/chosen": -1.781719446182251, + "logits/rejected": -1.8267364501953125, + "logps/chosen": -59.175132751464844, + "logps/rejected": -108.48085021972656, + "loss": 1.3129, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.235440969467163, + "rewards/margins": -1.7063238620758057, + "rewards/rejected": 3.9417648315429688, + "step": 8439 + }, + { + "epoch": 1.37, + "learning_rate": 2.3848182907351137e-06, + "logits/chosen": -1.5073057413101196, + "logits/rejected": -1.5649182796478271, + "logps/chosen": -57.69069290161133, + "logps/rejected": -215.50279235839844, + "loss": 2.8302, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.9232699871063232, + "rewards/margins": -5.592121124267578, + "rewards/rejected": 9.51539134979248, + "step": 8440 + }, + { + "epoch": 1.37, + "learning_rate": 2.3836982288015634e-06, + "logits/chosen": -1.013553500175476, + "logits/rejected": -1.0953727960586548, + "logps/chosen": -254.91078186035156, + "logps/rejected": -129.72909545898438, + "loss": 0.4923, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.715171813964844, + "rewards/margins": -0.4215989112854004, + "rewards/rejected": 7.136770725250244, + "step": 8441 + }, + { + "epoch": 1.37, + "learning_rate": 2.3825783476297086e-06, + "logits/chosen": -1.239719033241272, + "logits/rejected": -1.3045599460601807, + "logps/chosen": -89.28755187988281, + "logps/rejected": -65.21372985839844, + "loss": 0.8864, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.419300079345703, + "rewards/margins": 0.16968297958374023, + "rewards/rejected": 2.249617099761963, + "step": 8442 + }, + { + "epoch": 1.37, + "learning_rate": 2.3814586472969253e-06, + "logits/chosen": -1.3998665809631348, + "logits/rejected": -1.3998665809631348, + "logps/chosen": -81.1932373046875, + "logps/rejected": -81.1932373046875, + "loss": 0.3985, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.096118450164795, + "rewards/margins": 0.0, + "rewards/rejected": 4.096118450164795, + "step": 8443 + }, + { + "epoch": 1.37, + "learning_rate": 2.380339127880571e-06, + "logits/chosen": -1.1124579906463623, + "logits/rejected": -1.1284561157226562, + "logps/chosen": -32.98490905761719, + "logps/rejected": -48.81555938720703, + "loss": 0.5752, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1009438037872314, + "rewards/margins": 0.8682488203048706, + "rewards/rejected": 1.2326949834823608, + "step": 8444 + }, + { + "epoch": 1.37, + "learning_rate": 2.379219789457997e-06, + "logits/chosen": -0.9919710159301758, + "logits/rejected": -0.9919710159301758, + "logps/chosen": -59.58318328857422, + "logps/rejected": -59.58318328857422, + "loss": 0.6919, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5501747131347656, + "rewards/margins": 0.0, + "rewards/rejected": 1.5501747131347656, + "step": 8445 + }, + { + "epoch": 1.37, + "learning_rate": 2.378100632106536e-06, + "logits/chosen": -0.8225594758987427, + "logits/rejected": -0.8225594758987427, + "logps/chosen": -0.9111135005950928, + "logps/rejected": -0.9111135005950928, + "loss": 0.3652, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.15625524520874023, + "rewards/margins": 0.0, + "rewards/rejected": 0.15625524520874023, + "step": 8446 + }, + { + "epoch": 1.37, + "learning_rate": 2.3769816559035143e-06, + "logits/chosen": -1.2229725122451782, + "logits/rejected": -1.4658291339874268, + "logps/chosen": -38.12403869628906, + "logps/rejected": -128.96493530273438, + "loss": 3.0374, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6867660284042358, + "rewards/margins": -4.093269348144531, + "rewards/rejected": 5.780035495758057, + "step": 8447 + }, + { + "epoch": 1.37, + "learning_rate": 2.375862860926239e-06, + "logits/chosen": -0.9828826785087585, + "logits/rejected": -1.009588599205017, + "logps/chosen": -52.446414947509766, + "logps/rejected": -56.40174865722656, + "loss": 0.753, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3801562786102295, + "rewards/margins": 0.3497912883758545, + "rewards/rejected": 2.030364990234375, + "step": 8448 + }, + { + "epoch": 1.37, + "learning_rate": 2.374744247252012e-06, + "logits/chosen": -1.1884942054748535, + "logits/rejected": -1.1201962232589722, + "logps/chosen": -77.12809753417969, + "logps/rejected": -69.34797668457031, + "loss": 0.4446, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.119274854660034, + "rewards/margins": 0.8029806613922119, + "rewards/rejected": 2.3162941932678223, + "step": 8449 + }, + { + "epoch": 1.37, + "learning_rate": 2.3736258149581152e-06, + "logits/chosen": -1.5342081785202026, + "logits/rejected": -1.385000467300415, + "logps/chosen": -162.95480346679688, + "logps/rejected": -52.19550323486328, + "loss": 0.0736, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.12691068649292, + "rewards/margins": 1.9049921035766602, + "rewards/rejected": 4.22191858291626, + "step": 8450 + }, + { + "epoch": 1.37, + "learning_rate": 2.372507564121826e-06, + "logits/chosen": -1.1213446855545044, + "logits/rejected": -1.154224157333374, + "logps/chosen": -70.2926254272461, + "logps/rejected": -80.08003234863281, + "loss": 0.6011, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.235135793685913, + "rewards/margins": -0.5868628025054932, + "rewards/rejected": 2.8219985961914062, + "step": 8451 + }, + { + "epoch": 1.37, + "learning_rate": 2.3713894948204e-06, + "logits/chosen": -0.8658279180526733, + "logits/rejected": -1.0077356100082397, + "logps/chosen": -46.22247314453125, + "logps/rejected": -86.01092529296875, + "loss": 0.9605, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3893914222717285, + "rewards/margins": -1.143125057220459, + "rewards/rejected": 3.5325164794921875, + "step": 8452 + }, + { + "epoch": 1.37, + "learning_rate": 2.3702716071310906e-06, + "logits/chosen": -1.1685124635696411, + "logits/rejected": -0.9948754906654358, + "logps/chosen": -219.99293518066406, + "logps/rejected": -40.16223907470703, + "loss": 0.6888, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9637832641601562, + "rewards/margins": 0.7249946594238281, + "rewards/rejected": 1.2387886047363281, + "step": 8453 + }, + { + "epoch": 1.37, + "learning_rate": 2.3691539011311276e-06, + "logits/chosen": -1.5864903926849365, + "logits/rejected": -1.5673260688781738, + "logps/chosen": -30.969669342041016, + "logps/rejected": -49.965240478515625, + "loss": 0.2821, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3632893562316895, + "rewards/margins": 0.7424435615539551, + "rewards/rejected": 1.6208457946777344, + "step": 8454 + }, + { + "epoch": 1.37, + "learning_rate": 2.3680363768977387e-06, + "logits/chosen": -1.3781816959381104, + "logits/rejected": -1.3209693431854248, + "logps/chosen": -110.86653137207031, + "logps/rejected": -68.68173217773438, + "loss": 0.1142, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.3512864112854, + "rewards/margins": 1.3622231483459473, + "rewards/rejected": 4.989063262939453, + "step": 8455 + }, + { + "epoch": 1.37, + "learning_rate": 2.366919034508131e-06, + "logits/chosen": -1.337306022644043, + "logits/rejected": -1.0062365531921387, + "logps/chosen": -100.77665710449219, + "logps/rejected": -23.32953453063965, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.436418056488037, + "rewards/margins": 7.541835308074951, + "rewards/rejected": -0.10541706532239914, + "step": 8456 + }, + { + "epoch": 1.37, + "learning_rate": 2.365801874039505e-06, + "logits/chosen": -1.1517728567123413, + "logits/rejected": -1.1041842699050903, + "logps/chosen": -90.24324798583984, + "logps/rejected": -72.1151123046875, + "loss": 0.9259, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5914665460586548, + "rewards/margins": -0.5050727128982544, + "rewards/rejected": 2.096539258956909, + "step": 8457 + }, + { + "epoch": 1.37, + "learning_rate": 2.3646848955690426e-06, + "logits/chosen": -1.2151561975479126, + "logits/rejected": -1.1942278146743774, + "logps/chosen": -63.0369873046875, + "logps/rejected": -45.0018424987793, + "loss": 0.306, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3713722229003906, + "rewards/margins": 0.482313871383667, + "rewards/rejected": 2.8890583515167236, + "step": 8458 + }, + { + "epoch": 1.37, + "learning_rate": 2.36356809917392e-06, + "logits/chosen": -1.0924832820892334, + "logits/rejected": -1.0787001848220825, + "logps/chosen": -33.733558654785156, + "logps/rejected": -33.81991958618164, + "loss": 0.5395, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.442122220993042, + "rewards/margins": -0.5986185073852539, + "rewards/rejected": 3.040740728378296, + "step": 8459 + }, + { + "epoch": 1.37, + "learning_rate": 2.3624514849312945e-06, + "logits/chosen": -1.7803698778152466, + "logits/rejected": -1.7926687002182007, + "logps/chosen": -51.06045913696289, + "logps/rejected": -30.0238094329834, + "loss": 0.519, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2004902362823486, + "rewards/margins": 1.24227774143219, + "rewards/rejected": 0.9582124948501587, + "step": 8460 + }, + { + "epoch": 1.37, + "learning_rate": 2.3613350529183144e-06, + "logits/chosen": -1.2219319343566895, + "logits/rejected": -1.0091211795806885, + "logps/chosen": -54.218528747558594, + "logps/rejected": -24.795013427734375, + "loss": 1.7889, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1270217895507812, + "rewards/margins": 1.8607301712036133, + "rewards/rejected": 0.26629161834716797, + "step": 8461 + }, + { + "epoch": 1.37, + "learning_rate": 2.3602188032121165e-06, + "logits/chosen": -1.2194342613220215, + "logits/rejected": -1.20038902759552, + "logps/chosen": -111.28007507324219, + "logps/rejected": -70.81150817871094, + "loss": 0.0691, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.5586090087890625, + "rewards/margins": 2.495706081390381, + "rewards/rejected": 2.0629029273986816, + "step": 8462 + }, + { + "epoch": 1.37, + "learning_rate": 2.3591027358898194e-06, + "logits/chosen": -1.4509028196334839, + "logits/rejected": -1.4789658784866333, + "logps/chosen": -30.074050903320312, + "logps/rejected": -110.01920318603516, + "loss": 0.5206, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9219932556152344, + "rewards/margins": 1.5562820434570312, + "rewards/rejected": 2.365711212158203, + "step": 8463 + }, + { + "epoch": 1.37, + "learning_rate": 2.357986851028537e-06, + "logits/chosen": -1.0441628694534302, + "logits/rejected": -1.1192487478256226, + "logps/chosen": -76.91331481933594, + "logps/rejected": -84.95492553710938, + "loss": 0.0323, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.33267068862915, + "rewards/margins": 3.0263519287109375, + "rewards/rejected": 2.306318759918213, + "step": 8464 + }, + { + "epoch": 1.37, + "learning_rate": 2.3568711487053627e-06, + "logits/chosen": -1.3264645338058472, + "logits/rejected": -1.163092017173767, + "logps/chosen": -84.88825225830078, + "logps/rejected": -68.05400085449219, + "loss": 0.0405, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.695436954498291, + "rewards/margins": 2.5493996143341064, + "rewards/rejected": 3.1460373401641846, + "step": 8465 + }, + { + "epoch": 1.37, + "learning_rate": 2.3557556289973838e-06, + "logits/chosen": -0.9427955746650696, + "logits/rejected": -0.9427955746650696, + "logps/chosen": -55.471378326416016, + "logps/rejected": -55.471378326416016, + "loss": 1.7, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4732780456542969, + "rewards/margins": 0.0, + "rewards/rejected": 0.4732780456542969, + "step": 8466 + }, + { + "epoch": 1.37, + "learning_rate": 2.3546402919816693e-06, + "logits/chosen": -1.385117530822754, + "logits/rejected": -1.5052270889282227, + "logps/chosen": -40.80147171020508, + "logps/rejected": -67.03919982910156, + "loss": 3.0151, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1477324962615967, + "rewards/margins": -3.195103406906128, + "rewards/rejected": 5.342835903167725, + "step": 8467 + }, + { + "epoch": 1.37, + "learning_rate": 2.3535251377352814e-06, + "logits/chosen": -1.362623691558838, + "logits/rejected": -1.2261381149291992, + "logps/chosen": -197.79278564453125, + "logps/rejected": -31.592439651489258, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.747031211853027, + "rewards/margins": 6.369069576263428, + "rewards/rejected": 2.3779616355895996, + "step": 8468 + }, + { + "epoch": 1.37, + "learning_rate": 2.352410166335264e-06, + "logits/chosen": -1.5100750923156738, + "logits/rejected": -1.4911810159683228, + "logps/chosen": -122.16153717041016, + "logps/rejected": -166.41375732421875, + "loss": 1.1818, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.284484386444092, + "rewards/margins": -0.9711003303527832, + "rewards/rejected": 7.255584716796875, + "step": 8469 + }, + { + "epoch": 1.37, + "learning_rate": 2.3512953778586537e-06, + "logits/chosen": -1.1063164472579956, + "logits/rejected": -1.052873134613037, + "logps/chosen": -66.59666442871094, + "logps/rejected": -45.393497467041016, + "loss": 0.1384, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2714927196502686, + "rewards/margins": 1.1731818914413452, + "rewards/rejected": 1.0983108282089233, + "step": 8470 + }, + { + "epoch": 1.37, + "learning_rate": 2.3501807723824693e-06, + "logits/chosen": -1.3814040422439575, + "logits/rejected": -1.4872708320617676, + "logps/chosen": -72.36957550048828, + "logps/rejected": -128.49496459960938, + "loss": 2.3602, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5092506408691406, + "rewards/margins": -1.4355370998382568, + "rewards/rejected": 3.9447877407073975, + "step": 8471 + }, + { + "epoch": 1.38, + "learning_rate": 2.3490663499837224e-06, + "logits/chosen": -0.9337295889854431, + "logits/rejected": -1.0056495666503906, + "logps/chosen": -59.189388275146484, + "logps/rejected": -129.4891357421875, + "loss": 0.0867, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1210453510284424, + "rewards/margins": 1.7522648572921753, + "rewards/rejected": 1.368780493736267, + "step": 8472 + }, + { + "epoch": 1.38, + "learning_rate": 2.3479521107394055e-06, + "logits/chosen": -1.3010681867599487, + "logits/rejected": -1.2646180391311646, + "logps/chosen": -93.27391052246094, + "logps/rejected": -105.26945495605469, + "loss": 0.2887, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.295323133468628, + "rewards/margins": 0.33597707748413086, + "rewards/rejected": 2.959346055984497, + "step": 8473 + }, + { + "epoch": 1.38, + "learning_rate": 2.346838054726505e-06, + "logits/chosen": -1.3746631145477295, + "logits/rejected": -1.448931336402893, + "logps/chosen": -114.17713928222656, + "logps/rejected": -68.64859008789062, + "loss": 0.2049, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6478469371795654, + "rewards/margins": 1.7005606889724731, + "rewards/rejected": 1.9472862482070923, + "step": 8474 + }, + { + "epoch": 1.38, + "learning_rate": 2.3457241820219895e-06, + "logits/chosen": -1.1615110635757446, + "logits/rejected": -1.2259645462036133, + "logps/chosen": -10.374164581298828, + "logps/rejected": -86.70027923583984, + "loss": 0.6701, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6917055249214172, + "rewards/margins": -0.35776883363723755, + "rewards/rejected": 1.0494743585586548, + "step": 8475 + }, + { + "epoch": 1.38, + "learning_rate": 2.3446104927028196e-06, + "logits/chosen": -1.790661096572876, + "logits/rejected": -1.773329257965088, + "logps/chosen": -67.5526123046875, + "logps/rejected": -136.88412475585938, + "loss": 0.6183, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.413529872894287, + "rewards/margins": -0.5941834449768066, + "rewards/rejected": 6.007713317871094, + "step": 8476 + }, + { + "epoch": 1.38, + "learning_rate": 2.3434969868459373e-06, + "logits/chosen": -1.4432810544967651, + "logits/rejected": -1.529239535331726, + "logps/chosen": -159.76187133789062, + "logps/rejected": -187.03707885742188, + "loss": 0.1457, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.102330207824707, + "rewards/margins": 1.1229662895202637, + "rewards/rejected": 5.979363918304443, + "step": 8477 + }, + { + "epoch": 1.38, + "learning_rate": 2.3423836645282786e-06, + "logits/chosen": -1.224048137664795, + "logits/rejected": -1.1309248208999634, + "logps/chosen": -121.69424438476562, + "logps/rejected": -72.5580825805664, + "loss": 0.06, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6455230712890625, + "rewards/margins": 2.1395576000213623, + "rewards/rejected": 1.5059654712677002, + "step": 8478 + }, + { + "epoch": 1.38, + "learning_rate": 2.3412705258267605e-06, + "logits/chosen": -1.212882161140442, + "logits/rejected": -1.1361279487609863, + "logps/chosen": -101.31192016601562, + "logps/rejected": -41.75321578979492, + "loss": 0.3339, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.079648017883301, + "rewards/margins": 1.1495580673217773, + "rewards/rejected": 3.9300899505615234, + "step": 8479 + }, + { + "epoch": 1.38, + "learning_rate": 2.340157570818294e-06, + "logits/chosen": -0.8391836285591125, + "logits/rejected": -0.830349862575531, + "logps/chosen": -87.87474060058594, + "logps/rejected": -34.03614044189453, + "loss": 0.6722, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8870102167129517, + "rewards/margins": 0.5947411060333252, + "rewards/rejected": 1.2922691106796265, + "step": 8480 + }, + { + "epoch": 1.38, + "learning_rate": 2.3390447995797694e-06, + "logits/chosen": -1.2642760276794434, + "logits/rejected": -1.2642760276794434, + "logps/chosen": -34.09529113769531, + "logps/rejected": -34.09529113769531, + "loss": 0.3539, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.178147554397583, + "rewards/margins": 0.0, + "rewards/rejected": 2.178147554397583, + "step": 8481 + }, + { + "epoch": 1.38, + "learning_rate": 2.337932212188073e-06, + "logits/chosen": -1.3591678142547607, + "logits/rejected": -1.2894221544265747, + "logps/chosen": -147.19920349121094, + "logps/rejected": -29.076662063598633, + "loss": 1.6213, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9403122663497925, + "rewards/margins": 1.6385316848754883, + "rewards/rejected": 0.3017805218696594, + "step": 8482 + }, + { + "epoch": 1.38, + "learning_rate": 2.3368198087200704e-06, + "logits/chosen": -1.1390743255615234, + "logits/rejected": -1.1708614826202393, + "logps/chosen": -30.885578155517578, + "logps/rejected": -21.44573974609375, + "loss": 0.4255, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.137096881866455, + "rewards/margins": -0.2841784954071045, + "rewards/rejected": 2.4212753772735596, + "step": 8483 + }, + { + "epoch": 1.38, + "learning_rate": 2.3357075892526215e-06, + "logits/chosen": -1.2391486167907715, + "logits/rejected": -1.3007389307022095, + "logps/chosen": -56.75874328613281, + "logps/rejected": -67.15562438964844, + "loss": 0.1581, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.176339864730835, + "rewards/margins": 1.1164567470550537, + "rewards/rejected": 2.0598831176757812, + "step": 8484 + }, + { + "epoch": 1.38, + "learning_rate": 2.3345955538625666e-06, + "logits/chosen": -1.2182061672210693, + "logits/rejected": -1.1997464895248413, + "logps/chosen": -94.94860076904297, + "logps/rejected": -48.602962493896484, + "loss": 0.207, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1926331520080566, + "rewards/margins": 0.8673237562179565, + "rewards/rejected": 1.3253093957901, + "step": 8485 + }, + { + "epoch": 1.38, + "learning_rate": 2.3334837026267406e-06, + "logits/chosen": -1.353065013885498, + "logits/rejected": -1.3762547969818115, + "logps/chosen": -227.3896026611328, + "logps/rejected": -99.09508514404297, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.462138652801514, + "rewards/margins": 4.693369388580322, + "rewards/rejected": 1.7687691450119019, + "step": 8486 + }, + { + "epoch": 1.38, + "learning_rate": 2.3323720356219574e-06, + "logits/chosen": -1.1222050189971924, + "logits/rejected": -1.0219272375106812, + "logps/chosen": -63.61053466796875, + "logps/rejected": -38.683929443359375, + "loss": 0.1737, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6271042823791504, + "rewards/margins": 0.8824410438537598, + "rewards/rejected": 1.7446632385253906, + "step": 8487 + }, + { + "epoch": 1.38, + "learning_rate": 2.331260552925028e-06, + "logits/chosen": -1.1862889528274536, + "logits/rejected": -1.2167701721191406, + "logps/chosen": -33.49627685546875, + "logps/rejected": -110.8145980834961, + "loss": 0.5514, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6759006977081299, + "rewards/margins": -0.5852177143096924, + "rewards/rejected": 2.2611184120178223, + "step": 8488 + }, + { + "epoch": 1.38, + "learning_rate": 2.3301492546127403e-06, + "logits/chosen": -1.4090386629104614, + "logits/rejected": -1.1470521688461304, + "logps/chosen": -58.938072204589844, + "logps/rejected": -218.61068725585938, + "loss": 2.2516, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0640900135040283, + "rewards/margins": -4.422717094421387, + "rewards/rejected": 6.486807346343994, + "step": 8489 + }, + { + "epoch": 1.38, + "learning_rate": 2.3290381407618785e-06, + "logits/chosen": -1.5359368324279785, + "logits/rejected": -1.4503746032714844, + "logps/chosen": -78.68011474609375, + "logps/rejected": -53.927978515625, + "loss": 0.5325, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6606476306915283, + "rewards/margins": -0.4468200206756592, + "rewards/rejected": 2.1074676513671875, + "step": 8490 + }, + { + "epoch": 1.38, + "learning_rate": 2.3279272114492065e-06, + "logits/chosen": -1.0150784254074097, + "logits/rejected": -0.9940385222434998, + "logps/chosen": -28.534671783447266, + "logps/rejected": -15.211038589477539, + "loss": 0.5335, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5396869778633118, + "rewards/margins": -0.17668402194976807, + "rewards/rejected": 0.7163709998130798, + "step": 8491 + }, + { + "epoch": 1.38, + "learning_rate": 2.3268164667514827e-06, + "logits/chosen": -1.2147221565246582, + "logits/rejected": -1.2483711242675781, + "logps/chosen": -53.15269470214844, + "logps/rejected": -68.55608367919922, + "loss": 0.8381, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.054791212081909, + "rewards/margins": 0.4008368253707886, + "rewards/rejected": 1.6539543867111206, + "step": 8492 + }, + { + "epoch": 1.38, + "learning_rate": 2.325705906745445e-06, + "logits/chosen": -1.362613558769226, + "logits/rejected": -1.3644083738327026, + "logps/chosen": -82.02883911132812, + "logps/rejected": -110.24281311035156, + "loss": 0.5279, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8173004388809204, + "rewards/margins": -0.6235870122909546, + "rewards/rejected": 2.440887451171875, + "step": 8493 + }, + { + "epoch": 1.38, + "learning_rate": 2.324595531507827e-06, + "logits/chosen": -1.138744831085205, + "logits/rejected": -1.1007118225097656, + "logps/chosen": -76.62569427490234, + "logps/rejected": -82.61204528808594, + "loss": 0.2189, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.564838409423828, + "rewards/margins": 0.6563055515289307, + "rewards/rejected": 2.9085328578948975, + "step": 8494 + }, + { + "epoch": 1.38, + "learning_rate": 2.323485341115341e-06, + "logits/chosen": -1.062848687171936, + "logits/rejected": -1.062848687171936, + "logps/chosen": -38.440799713134766, + "logps/rejected": -38.440799713134766, + "loss": 0.3471, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4470608234405518, + "rewards/margins": 0.0, + "rewards/rejected": 2.4470608234405518, + "step": 8495 + }, + { + "epoch": 1.38, + "learning_rate": 2.322375335644692e-06, + "logits/chosen": -1.312628984451294, + "logits/rejected": -1.3291412591934204, + "logps/chosen": -39.877689361572266, + "logps/rejected": -40.61794662475586, + "loss": 0.9386, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5315914154052734, + "rewards/margins": -0.8188278675079346, + "rewards/rejected": 3.350419282913208, + "step": 8496 + }, + { + "epoch": 1.38, + "learning_rate": 2.3212655151725737e-06, + "logits/chosen": -1.6597442626953125, + "logits/rejected": -1.660559892654419, + "logps/chosen": -10.468252182006836, + "logps/rejected": -20.661409378051758, + "loss": 0.492, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.20340843498706818, + "rewards/margins": -0.07721616327762604, + "rewards/rejected": 0.2806245982646942, + "step": 8497 + }, + { + "epoch": 1.38, + "learning_rate": 2.3201558797756602e-06, + "logits/chosen": -1.2194539308547974, + "logits/rejected": -1.2648561000823975, + "logps/chosen": -108.77081298828125, + "logps/rejected": -68.43437957763672, + "loss": 0.4533, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7550339698791504, + "rewards/margins": 0.12105941772460938, + "rewards/rejected": 3.633974552154541, + "step": 8498 + }, + { + "epoch": 1.38, + "learning_rate": 2.31904642953062e-06, + "logits/chosen": -1.2978595495224, + "logits/rejected": -1.1732288599014282, + "logps/chosen": -77.84353637695312, + "logps/rejected": -28.656845092773438, + "loss": 1.2618, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3127143383026123, + "rewards/margins": 1.6078940629959106, + "rewards/rejected": 1.7048202753067017, + "step": 8499 + }, + { + "epoch": 1.38, + "learning_rate": 2.317937164514102e-06, + "logits/chosen": -1.1008272171020508, + "logits/rejected": -1.1396903991699219, + "logps/chosen": -36.16997146606445, + "logps/rejected": -48.330467224121094, + "loss": 0.8835, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.693522334098816, + "rewards/margins": -1.5663326978683472, + "rewards/rejected": 3.259855031967163, + "step": 8500 + }, + { + "epoch": 1.38, + "learning_rate": 2.3168280848027503e-06, + "logits/chosen": -1.2565889358520508, + "logits/rejected": -1.2145791053771973, + "logps/chosen": -75.78744506835938, + "logps/rejected": -88.56658172607422, + "loss": 0.39, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.00144362449646, + "rewards/margins": 0.20732665061950684, + "rewards/rejected": 2.794116973876953, + "step": 8501 + }, + { + "epoch": 1.38, + "learning_rate": 2.3157191904731874e-06, + "logits/chosen": -1.296949863433838, + "logits/rejected": -1.4053351879119873, + "logps/chosen": -49.22612762451172, + "logps/rejected": -174.343994140625, + "loss": 1.9129, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.3375372886657715, + "rewards/margins": -1.151540756225586, + "rewards/rejected": 5.489078044891357, + "step": 8502 + }, + { + "epoch": 1.38, + "learning_rate": 2.3146104816020317e-06, + "logits/chosen": -1.5389983654022217, + "logits/rejected": -1.5389983654022217, + "logps/chosen": -29.252487182617188, + "logps/rejected": -29.252487182617188, + "loss": 0.3539, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4977951049804688, + "rewards/margins": 0.0, + "rewards/rejected": 3.4977951049804688, + "step": 8503 + }, + { + "epoch": 1.38, + "learning_rate": 2.3135019582658803e-06, + "logits/chosen": -1.1781035661697388, + "logits/rejected": -0.9831138849258423, + "logps/chosen": -167.10238647460938, + "logps/rejected": -34.49574279785156, + "loss": 2.4369, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.134967088699341, + "rewards/margins": 0.8202491998672485, + "rewards/rejected": 1.3147178888320923, + "step": 8504 + }, + { + "epoch": 1.38, + "learning_rate": 2.3123936205413254e-06, + "logits/chosen": -1.3767449855804443, + "logits/rejected": -1.1838823556900024, + "logps/chosen": -95.34963989257812, + "logps/rejected": -77.7615966796875, + "loss": 0.2199, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.41272258758545, + "rewards/margins": 0.9591307640075684, + "rewards/rejected": 7.453591823577881, + "step": 8505 + }, + { + "epoch": 1.38, + "learning_rate": 2.3112854685049397e-06, + "logits/chosen": -1.0273245573043823, + "logits/rejected": -1.0562680959701538, + "logps/chosen": -46.332523345947266, + "logps/rejected": -39.21133041381836, + "loss": 0.4408, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5980418920516968, + "rewards/margins": 1.5733916759490967, + "rewards/rejected": 0.024650191888213158, + "step": 8506 + }, + { + "epoch": 1.38, + "learning_rate": 2.3101775022332884e-06, + "logits/chosen": -1.5479658842086792, + "logits/rejected": -1.6410373449325562, + "logps/chosen": -65.45870971679688, + "logps/rejected": -101.37548828125, + "loss": 0.7239, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.612979412078857, + "rewards/margins": -0.10839366912841797, + "rewards/rejected": 5.721373081207275, + "step": 8507 + }, + { + "epoch": 1.38, + "learning_rate": 2.309069721802919e-06, + "logits/chosen": -1.1186167001724243, + "logits/rejected": 0.03854385018348694, + "logps/chosen": -58.64086151123047, + "logps/rejected": -42.13774871826172, + "loss": 0.38, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.325171709060669, + "rewards/margins": 2.407059907913208, + "rewards/rejected": -0.08188819885253906, + "step": 8508 + }, + { + "epoch": 1.38, + "learning_rate": 2.307962127290372e-06, + "logits/chosen": -1.2863800525665283, + "logits/rejected": -1.2041836977005005, + "logps/chosen": -111.10506439208984, + "logps/rejected": -53.77317810058594, + "loss": 0.1502, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8668205738067627, + "rewards/margins": 1.466132402420044, + "rewards/rejected": 1.4006881713867188, + "step": 8509 + }, + { + "epoch": 1.38, + "learning_rate": 2.3068547187721673e-06, + "logits/chosen": -1.2321745157241821, + "logits/rejected": -0.8655891418457031, + "logps/chosen": -37.551124572753906, + "logps/rejected": -44.956748962402344, + "loss": 1.0222, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3934178352355957, + "rewards/margins": -1.6946725845336914, + "rewards/rejected": 4.088090419769287, + "step": 8510 + }, + { + "epoch": 1.38, + "learning_rate": 2.3057474963248203e-06, + "logits/chosen": -1.2607678174972534, + "logits/rejected": -1.2649898529052734, + "logps/chosen": -53.407081604003906, + "logps/rejected": -90.41409301757812, + "loss": 0.8093, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.963243842124939, + "rewards/margins": 0.0633307695388794, + "rewards/rejected": 1.8999130725860596, + "step": 8511 + }, + { + "epoch": 1.38, + "learning_rate": 2.3046404600248275e-06, + "logits/chosen": -0.6686139106750488, + "logits/rejected": -0.6686139106750488, + "logps/chosen": -2.111802577972412, + "logps/rejected": -2.111802577972412, + "loss": 1.2594, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2551809251308441, + "rewards/margins": 0.0, + "rewards/rejected": 0.2551809251308441, + "step": 8512 + }, + { + "epoch": 1.38, + "learning_rate": 2.303533609948676e-06, + "logits/chosen": -0.9384796023368835, + "logits/rejected": -0.9558244347572327, + "logps/chosen": -41.04977035522461, + "logps/rejected": -76.79023742675781, + "loss": 0.4273, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2792209386825562, + "rewards/margins": 0.10675156116485596, + "rewards/rejected": 1.1724693775177002, + "step": 8513 + }, + { + "epoch": 1.38, + "learning_rate": 2.302426946172836e-06, + "logits/chosen": -1.488624095916748, + "logits/rejected": -1.488624095916748, + "logps/chosen": -40.82998275756836, + "logps/rejected": -40.82998275756836, + "loss": 1.0372, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.07930064201355, + "rewards/margins": 0.0, + "rewards/rejected": 2.07930064201355, + "step": 8514 + }, + { + "epoch": 1.38, + "learning_rate": 2.301320468773772e-06, + "logits/chosen": -1.7895100116729736, + "logits/rejected": -1.758569598197937, + "logps/chosen": -52.08049011230469, + "logps/rejected": -24.616931915283203, + "loss": 0.3401, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8502792119979858, + "rewards/margins": 0.04814481735229492, + "rewards/rejected": 1.802134394645691, + "step": 8515 + }, + { + "epoch": 1.38, + "learning_rate": 2.3002141778279257e-06, + "logits/chosen": -1.3886672258377075, + "logits/rejected": -1.361143946647644, + "logps/chosen": -65.7562484741211, + "logps/rejected": -73.35175323486328, + "loss": 0.0499, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.682961940765381, + "rewards/margins": 2.257939100265503, + "rewards/rejected": 3.425022840499878, + "step": 8516 + }, + { + "epoch": 1.38, + "learning_rate": 2.2991080734117365e-06, + "logits/chosen": -1.0240453481674194, + "logits/rejected": -1.0009723901748657, + "logps/chosen": -65.25126647949219, + "logps/rejected": -68.23197937011719, + "loss": 0.9703, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.75581431388855, + "rewards/margins": 1.6126868724822998, + "rewards/rejected": 1.14312744140625, + "step": 8517 + }, + { + "epoch": 1.38, + "learning_rate": 2.2980021556016214e-06, + "logits/chosen": -1.3264892101287842, + "logits/rejected": -1.109209656715393, + "logps/chosen": -104.12081146240234, + "logps/rejected": -64.72811126708984, + "loss": 0.5156, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.617890357971191, + "rewards/margins": 1.9186546802520752, + "rewards/rejected": 2.699235677719116, + "step": 8518 + }, + { + "epoch": 1.38, + "learning_rate": 2.296896424473992e-06, + "logits/chosen": -1.003623366355896, + "logits/rejected": -0.9531447291374207, + "logps/chosen": -85.17277526855469, + "logps/rejected": -55.65814208984375, + "loss": 1.4781, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8144004344940186, + "rewards/margins": 0.5850973129272461, + "rewards/rejected": 2.2293031215667725, + "step": 8519 + }, + { + "epoch": 1.38, + "learning_rate": 2.2957908801052418e-06, + "logits/chosen": -1.3032130002975464, + "logits/rejected": -1.2160409688949585, + "logps/chosen": -108.08023834228516, + "logps/rejected": -81.27983856201172, + "loss": 1.9448, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.759323835372925, + "rewards/margins": -1.036647081375122, + "rewards/rejected": 3.795970916748047, + "step": 8520 + }, + { + "epoch": 1.38, + "learning_rate": 2.2946855225717557e-06, + "logits/chosen": -1.2029750347137451, + "logits/rejected": -1.203835129737854, + "logps/chosen": -72.18476104736328, + "logps/rejected": -71.89424133300781, + "loss": 0.1388, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.369342088699341, + "rewards/margins": 1.5724258422851562, + "rewards/rejected": 0.7969161868095398, + "step": 8521 + }, + { + "epoch": 1.38, + "learning_rate": 2.2935803519499e-06, + "logits/chosen": -1.3263224363327026, + "logits/rejected": -1.2626384496688843, + "logps/chosen": -60.19961166381836, + "logps/rejected": -34.35591506958008, + "loss": 0.9182, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6201175451278687, + "rewards/margins": -0.48202788829803467, + "rewards/rejected": 2.1021454334259033, + "step": 8522 + }, + { + "epoch": 1.38, + "learning_rate": 2.292475368316036e-06, + "logits/chosen": -1.3942806720733643, + "logits/rejected": -1.3331756591796875, + "logps/chosen": -67.78451538085938, + "logps/rejected": -64.77371215820312, + "loss": 0.4241, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1955583095550537, + "rewards/margins": 0.01027679443359375, + "rewards/rejected": 2.18528151512146, + "step": 8523 + }, + { + "epoch": 1.38, + "learning_rate": 2.2913705717465027e-06, + "logits/chosen": -1.0356426239013672, + "logits/rejected": -1.0326021909713745, + "logps/chosen": -2.5116357803344727, + "logps/rejected": -1.9344309568405151, + "loss": 0.7962, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.24509353935718536, + "rewards/margins": -0.10817278921604156, + "rewards/rejected": 0.35326632857322693, + "step": 8524 + }, + { + "epoch": 1.38, + "learning_rate": 2.2902659623176364e-06, + "logits/chosen": -1.4362614154815674, + "logits/rejected": -1.5226718187332153, + "logps/chosen": -91.79194641113281, + "logps/rejected": -99.68663024902344, + "loss": 1.0116, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.831190586090088, + "rewards/margins": -1.8754334449768066, + "rewards/rejected": 7.7066240310668945, + "step": 8525 + }, + { + "epoch": 1.38, + "learning_rate": 2.28916154010575e-06, + "logits/chosen": -1.19364333152771, + "logits/rejected": -1.1778837442398071, + "logps/chosen": -49.730445861816406, + "logps/rejected": -2.7783608436584473, + "loss": 0.3522, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7349079251289368, + "rewards/margins": 0.24930769205093384, + "rewards/rejected": 0.48560023307800293, + "step": 8526 + }, + { + "epoch": 1.38, + "learning_rate": 2.2880573051871525e-06, + "logits/chosen": -1.4015737771987915, + "logits/rejected": -1.2664344310760498, + "logps/chosen": -80.12886047363281, + "logps/rejected": -41.100494384765625, + "loss": 0.0988, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.980504035949707, + "rewards/margins": 1.5318596363067627, + "rewards/rejected": 3.4486443996429443, + "step": 8527 + }, + { + "epoch": 1.38, + "learning_rate": 2.286953257638133e-06, + "logits/chosen": -1.0452443361282349, + "logits/rejected": -1.0436663627624512, + "logps/chosen": -2.255899667739868, + "logps/rejected": -15.161802291870117, + "loss": 0.4015, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.43178310990333557, + "rewards/margins": -0.2005024254322052, + "rewards/rejected": 0.6322855353355408, + "step": 8528 + }, + { + "epoch": 1.38, + "learning_rate": 2.2858493975349743e-06, + "logits/chosen": -1.4185163974761963, + "logits/rejected": -1.4249122142791748, + "logps/chosen": -89.85430145263672, + "logps/rejected": -72.72447204589844, + "loss": 0.3257, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.793766021728516, + "rewards/margins": 2.06862473487854, + "rewards/rejected": 2.7251412868499756, + "step": 8529 + }, + { + "epoch": 1.38, + "learning_rate": 2.284745724953939e-06, + "logits/chosen": -1.2072243690490723, + "logits/rejected": -1.1346889734268188, + "logps/chosen": -63.21693420410156, + "logps/rejected": -56.20843505859375, + "loss": 0.6157, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9922256469726562, + "rewards/margins": 0.20809698104858398, + "rewards/rejected": 2.7841286659240723, + "step": 8530 + }, + { + "epoch": 1.38, + "learning_rate": 2.283642239971284e-06, + "logits/chosen": -1.3392411470413208, + "logits/rejected": -1.2152879238128662, + "logps/chosen": -70.79039001464844, + "logps/rejected": -24.19867515563965, + "loss": 0.0498, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.951197862625122, + "rewards/margins": 2.424302577972412, + "rewards/rejected": 0.5268953442573547, + "step": 8531 + }, + { + "epoch": 1.38, + "learning_rate": 2.2825389426632448e-06, + "logits/chosen": -0.9947308301925659, + "logits/rejected": -0.9856163263320923, + "logps/chosen": -20.990842819213867, + "logps/rejected": -3.186974048614502, + "loss": 0.749, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.519352912902832, + "rewards/margins": 0.13746923208236694, + "rewards/rejected": 0.3818836808204651, + "step": 8532 + }, + { + "epoch": 1.39, + "learning_rate": 2.2814358331060533e-06, + "logits/chosen": -1.3193739652633667, + "logits/rejected": -1.1448971033096313, + "logps/chosen": -146.403564453125, + "logps/rejected": -44.50637435913086, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.133970737457275, + "rewards/margins": 3.761756658554077, + "rewards/rejected": 2.3722140789031982, + "step": 8533 + }, + { + "epoch": 1.39, + "learning_rate": 2.2803329113759256e-06, + "logits/chosen": -1.1934982538223267, + "logits/rejected": -1.2048195600509644, + "logps/chosen": -68.19017028808594, + "logps/rejected": -98.10165405273438, + "loss": 0.1999, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.29559326171875, + "rewards/margins": 0.8887549638748169, + "rewards/rejected": 1.406838297843933, + "step": 8534 + }, + { + "epoch": 1.39, + "learning_rate": 2.279230177549056e-06, + "logits/chosen": -1.5303497314453125, + "logits/rejected": -1.5392471551895142, + "logps/chosen": -21.332639694213867, + "logps/rejected": -56.56342315673828, + "loss": 1.6403, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7432560324668884, + "rewards/margins": -2.1086535453796387, + "rewards/rejected": 2.851909637451172, + "step": 8535 + }, + { + "epoch": 1.39, + "learning_rate": 2.2781276317016387e-06, + "logits/chosen": -1.1012907028198242, + "logits/rejected": -1.0976685285568237, + "logps/chosen": -3.8472602367401123, + "logps/rejected": -3.0235657691955566, + "loss": 0.4943, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.22752021253108978, + "rewards/margins": -0.058784082531929016, + "rewards/rejected": 0.2863042950630188, + "step": 8536 + }, + { + "epoch": 1.39, + "learning_rate": 2.2770252739098464e-06, + "logits/chosen": -1.2747281789779663, + "logits/rejected": -1.163952112197876, + "logps/chosen": -99.28817749023438, + "logps/rejected": -55.996124267578125, + "loss": 0.3739, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8237113952636719, + "rewards/margins": 1.5694472789764404, + "rewards/rejected": 0.25426408648490906, + "step": 8537 + }, + { + "epoch": 1.39, + "learning_rate": 2.2759231042498434e-06, + "logits/chosen": -0.9829792380332947, + "logits/rejected": -0.9805874228477478, + "logps/chosen": -1.3373894691467285, + "logps/rejected": -2.116746425628662, + "loss": 0.5511, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11620106548070908, + "rewards/margins": 0.032143354415893555, + "rewards/rejected": 0.08405771106481552, + "step": 8538 + }, + { + "epoch": 1.39, + "learning_rate": 2.2748211227977775e-06, + "logits/chosen": -1.0574780702590942, + "logits/rejected": -1.0574780702590942, + "logps/chosen": -46.58674240112305, + "logps/rejected": -46.58674240112305, + "loss": 1.1867, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.375495195388794, + "rewards/margins": 0.0, + "rewards/rejected": 1.375495195388794, + "step": 8539 + }, + { + "epoch": 1.39, + "learning_rate": 2.2737193296297877e-06, + "logits/chosen": -1.4669219255447388, + "logits/rejected": -1.451301097869873, + "logps/chosen": -67.86872100830078, + "logps/rejected": -86.54261779785156, + "loss": 0.1091, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.775444984436035, + "rewards/margins": 1.5213778018951416, + "rewards/rejected": 3.2540671825408936, + "step": 8540 + }, + { + "epoch": 1.39, + "learning_rate": 2.272617724821994e-06, + "logits/chosen": -1.3549630641937256, + "logits/rejected": -1.438592553138733, + "logps/chosen": -40.961605072021484, + "logps/rejected": -96.64812469482422, + "loss": 1.81, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3873326778411865, + "rewards/margins": -2.79660964012146, + "rewards/rejected": 6.1839423179626465, + "step": 8541 + }, + { + "epoch": 1.39, + "learning_rate": 2.271516308450511e-06, + "logits/chosen": -1.6631594896316528, + "logits/rejected": -1.6548717021942139, + "logps/chosen": -119.8618392944336, + "logps/rejected": -180.9224090576172, + "loss": 0.4104, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.5703558921813965, + "rewards/margins": -0.1683664321899414, + "rewards/rejected": 7.738722324371338, + "step": 8542 + }, + { + "epoch": 1.39, + "learning_rate": 2.2704150805914314e-06, + "logits/chosen": -1.3762624263763428, + "logits/rejected": -1.3479188680648804, + "logps/chosen": -65.20858764648438, + "logps/rejected": -93.31111907958984, + "loss": 0.1328, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.442027568817139, + "rewards/margins": 1.199361801147461, + "rewards/rejected": 4.242665767669678, + "step": 8543 + }, + { + "epoch": 1.39, + "learning_rate": 2.2693140413208447e-06, + "logits/chosen": -1.3875731229782104, + "logits/rejected": -1.4481472969055176, + "logps/chosen": -43.477439880371094, + "logps/rejected": -122.4893798828125, + "loss": 0.2632, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.299772024154663, + "rewards/margins": 0.9021371603012085, + "rewards/rejected": 1.3976348638534546, + "step": 8544 + }, + { + "epoch": 1.39, + "learning_rate": 2.2682131907148174e-06, + "logits/chosen": -1.2879998683929443, + "logits/rejected": -1.0567831993103027, + "logps/chosen": -57.22828674316406, + "logps/rejected": -17.153934478759766, + "loss": 0.1508, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3908944129943848, + "rewards/margins": 1.173316240310669, + "rewards/rejected": 1.2175781726837158, + "step": 8545 + }, + { + "epoch": 1.39, + "learning_rate": 2.2671125288494123e-06, + "logits/chosen": -1.2912918329238892, + "logits/rejected": -1.2912918329238892, + "logps/chosen": -20.190614700317383, + "logps/rejected": -20.190614700317383, + "loss": 0.5111, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.48703479766845703, + "rewards/margins": 0.0, + "rewards/rejected": 0.48703479766845703, + "step": 8546 + }, + { + "epoch": 1.39, + "learning_rate": 2.266012055800671e-06, + "logits/chosen": -1.3359287977218628, + "logits/rejected": -1.2547646760940552, + "logps/chosen": -56.119659423828125, + "logps/rejected": -10.93673038482666, + "loss": 0.3765, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.071950674057007, + "rewards/margins": 1.9175279140472412, + "rewards/rejected": 0.15442276000976562, + "step": 8547 + }, + { + "epoch": 1.39, + "learning_rate": 2.264911771644629e-06, + "logits/chosen": -1.35637366771698, + "logits/rejected": -1.2405784130096436, + "logps/chosen": -70.69147491455078, + "logps/rejected": -32.359989166259766, + "loss": 0.0369, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.215783596038818, + "rewards/margins": 2.597456932067871, + "rewards/rejected": 1.6183265447616577, + "step": 8548 + }, + { + "epoch": 1.39, + "learning_rate": 2.263811676457302e-06, + "logits/chosen": -0.8265113830566406, + "logits/rejected": -0.8649028539657593, + "logps/chosen": -28.808971405029297, + "logps/rejected": -52.43939208984375, + "loss": 0.5474, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4893040657043457, + "rewards/margins": -0.3327159881591797, + "rewards/rejected": 2.8220200538635254, + "step": 8549 + }, + { + "epoch": 1.39, + "learning_rate": 2.2627117703147e-06, + "logits/chosen": -1.4388959407806396, + "logits/rejected": -1.4142301082611084, + "logps/chosen": -103.17911529541016, + "logps/rejected": -78.61338806152344, + "loss": 0.1989, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.695934295654297, + "rewards/margins": 1.1252418756484985, + "rewards/rejected": 1.5706924200057983, + "step": 8550 + }, + { + "epoch": 1.39, + "learning_rate": 2.2616120532928126e-06, + "logits/chosen": -1.1417652368545532, + "logits/rejected": -1.2319424152374268, + "logps/chosen": -51.55159378051758, + "logps/rejected": -193.87155151367188, + "loss": 0.6026, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.351999282836914, + "rewards/margins": 0.2208881378173828, + "rewards/rejected": 1.1311111450195312, + "step": 8551 + }, + { + "epoch": 1.39, + "learning_rate": 2.260512525467621e-06, + "logits/chosen": -1.3636894226074219, + "logits/rejected": -1.385464072227478, + "logps/chosen": -69.97186279296875, + "logps/rejected": -57.39262390136719, + "loss": 1.2694, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5775718688964844, + "rewards/margins": -1.923774242401123, + "rewards/rejected": 4.501346111297607, + "step": 8552 + }, + { + "epoch": 1.39, + "learning_rate": 2.2594131869150947e-06, + "logits/chosen": -0.8281123042106628, + "logits/rejected": -0.8260846734046936, + "logps/chosen": -82.63058471679688, + "logps/rejected": -92.40455627441406, + "loss": 0.5245, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5810402631759644, + "rewards/margins": 0.4127250909805298, + "rewards/rejected": 1.1683151721954346, + "step": 8553 + }, + { + "epoch": 1.39, + "learning_rate": 2.258314037711184e-06, + "logits/chosen": -1.0142155885696411, + "logits/rejected": -1.0142155885696411, + "logps/chosen": -72.94285583496094, + "logps/rejected": -72.94285583496094, + "loss": 0.5189, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.950641632080078, + "rewards/margins": 0.0, + "rewards/rejected": 4.950641632080078, + "step": 8554 + }, + { + "epoch": 1.39, + "learning_rate": 2.2572150779318325e-06, + "logits/chosen": -1.5417518615722656, + "logits/rejected": -1.4373478889465332, + "logps/chosen": -87.24411010742188, + "logps/rejected": -21.17365264892578, + "loss": 0.1815, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.970121145248413, + "rewards/margins": 3.26949405670166, + "rewards/rejected": 0.7006271481513977, + "step": 8555 + }, + { + "epoch": 1.39, + "learning_rate": 2.2561163076529653e-06, + "logits/chosen": -1.3021955490112305, + "logits/rejected": -1.3038760423660278, + "logps/chosen": -66.24635314941406, + "logps/rejected": -67.39812469482422, + "loss": 0.105, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3700103759765625, + "rewards/margins": 1.4673042297363281, + "rewards/rejected": 0.9027061462402344, + "step": 8556 + }, + { + "epoch": 1.39, + "learning_rate": 2.2550177269505e-06, + "logits/chosen": -1.3187450170516968, + "logits/rejected": -1.3690861463546753, + "logps/chosen": -65.04827117919922, + "logps/rejected": -87.35515594482422, + "loss": 0.0964, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5763604640960693, + "rewards/margins": 1.7257341146469116, + "rewards/rejected": 1.8506263494491577, + "step": 8557 + }, + { + "epoch": 1.39, + "learning_rate": 2.2539193359003343e-06, + "logits/chosen": -1.107463002204895, + "logits/rejected": -1.0495589971542358, + "logps/chosen": -104.12850952148438, + "logps/rejected": -44.56500244140625, + "loss": 0.173, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8178162574768066, + "rewards/margins": 1.479823350906372, + "rewards/rejected": 2.3379929065704346, + "step": 8558 + }, + { + "epoch": 1.39, + "learning_rate": 2.2528211345783614e-06, + "logits/chosen": -1.0720701217651367, + "logits/rejected": -1.0720701217651367, + "logps/chosen": -42.085044860839844, + "logps/rejected": -42.085044860839844, + "loss": 0.5188, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0293068885803223, + "rewards/margins": 0.0, + "rewards/rejected": 3.0293068885803223, + "step": 8559 + }, + { + "epoch": 1.39, + "learning_rate": 2.2517231230604516e-06, + "logits/chosen": -1.3546690940856934, + "logits/rejected": -1.3546690940856934, + "logps/chosen": -91.39266967773438, + "logps/rejected": -91.39266967773438, + "loss": 0.353, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.224450588226318, + "rewards/margins": 0.0, + "rewards/rejected": 4.224450588226318, + "step": 8560 + }, + { + "epoch": 1.39, + "learning_rate": 2.2506253014224715e-06, + "logits/chosen": -1.111265778541565, + "logits/rejected": -1.1106479167938232, + "logps/chosen": -72.9261703491211, + "logps/rejected": -56.7210693359375, + "loss": 1.0408, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8079986572265625, + "rewards/margins": -0.01620328426361084, + "rewards/rejected": 1.8242019414901733, + "step": 8561 + }, + { + "epoch": 1.39, + "learning_rate": 2.2495276697402663e-06, + "logits/chosen": -1.3815467357635498, + "logits/rejected": -1.433166265487671, + "logps/chosen": -94.47016143798828, + "logps/rejected": -72.30528259277344, + "loss": 0.4203, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.945704698562622, + "rewards/margins": 0.41864848136901855, + "rewards/rejected": 2.5270562171936035, + "step": 8562 + }, + { + "epoch": 1.39, + "learning_rate": 2.2484302280896765e-06, + "logits/chosen": -0.9873601198196411, + "logits/rejected": -0.9743943810462952, + "logps/chosen": -73.43318939208984, + "logps/rejected": -68.92366027832031, + "loss": 0.33, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0773627758026123, + "rewards/margins": 0.1940985918045044, + "rewards/rejected": 1.883264183998108, + "step": 8563 + }, + { + "epoch": 1.39, + "learning_rate": 2.2473329765465195e-06, + "logits/chosen": -1.234168291091919, + "logits/rejected": -1.1091413497924805, + "logps/chosen": -53.32423400878906, + "logps/rejected": -16.106069564819336, + "loss": 0.5299, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9551193118095398, + "rewards/margins": -0.23274296522140503, + "rewards/rejected": 1.1878622770309448, + "step": 8564 + }, + { + "epoch": 1.39, + "learning_rate": 2.2462359151866104e-06, + "logits/chosen": -1.2619118690490723, + "logits/rejected": -1.1874337196350098, + "logps/chosen": -67.79621887207031, + "logps/rejected": -58.7980842590332, + "loss": 0.5526, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9256293773651123, + "rewards/margins": 0.6734004020690918, + "rewards/rejected": 2.2522289752960205, + "step": 8565 + }, + { + "epoch": 1.39, + "learning_rate": 2.2451390440857406e-06, + "logits/chosen": -1.2992033958435059, + "logits/rejected": -1.2183501720428467, + "logps/chosen": -78.96412658691406, + "logps/rejected": -79.2030029296875, + "loss": 0.2841, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.243992567062378, + "rewards/margins": 0.43010401725769043, + "rewards/rejected": 1.8138885498046875, + "step": 8566 + }, + { + "epoch": 1.39, + "learning_rate": 2.244042363319699e-06, + "logits/chosen": -1.4724299907684326, + "logits/rejected": -1.523711919784546, + "logps/chosen": -41.6338005065918, + "logps/rejected": -69.27925109863281, + "loss": 2.833, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5415546894073486, + "rewards/margins": -0.44603991508483887, + "rewards/rejected": 2.9875946044921875, + "step": 8567 + }, + { + "epoch": 1.39, + "learning_rate": 2.2429458729642506e-06, + "logits/chosen": -1.0117835998535156, + "logits/rejected": -1.0409592390060425, + "logps/chosen": -0.8918878436088562, + "logps/rejected": -26.48333168029785, + "loss": 0.348, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5025641322135925, + "rewards/margins": 0.03480422496795654, + "rewards/rejected": 0.467759907245636, + "step": 8568 + }, + { + "epoch": 1.39, + "learning_rate": 2.2418495730951565e-06, + "logits/chosen": -1.3144590854644775, + "logits/rejected": -1.395334005355835, + "logps/chosen": -62.104713439941406, + "logps/rejected": -85.55428314208984, + "loss": 0.6039, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.320592403411865, + "rewards/margins": -0.8372607231140137, + "rewards/rejected": 7.157853126525879, + "step": 8569 + }, + { + "epoch": 1.39, + "learning_rate": 2.2407534637881574e-06, + "logits/chosen": -1.1021955013275146, + "logits/rejected": -1.132716178894043, + "logps/chosen": -60.64630889892578, + "logps/rejected": -81.27552795410156, + "loss": 0.721, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.32354736328125, + "rewards/margins": -0.8835556507110596, + "rewards/rejected": 3.2071030139923096, + "step": 8570 + }, + { + "epoch": 1.39, + "learning_rate": 2.2396575451189878e-06, + "logits/chosen": -1.54263436794281, + "logits/rejected": -1.5324490070343018, + "logps/chosen": -53.753387451171875, + "logps/rejected": -52.94952392578125, + "loss": 0.425, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8255394101142883, + "rewards/margins": -0.25010257959365845, + "rewards/rejected": 1.0756419897079468, + "step": 8571 + }, + { + "epoch": 1.39, + "learning_rate": 2.2385618171633612e-06, + "logits/chosen": -1.4828983545303345, + "logits/rejected": -1.468063235282898, + "logps/chosen": -91.22688293457031, + "logps/rejected": -50.171348571777344, + "loss": 0.1451, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.276007890701294, + "rewards/margins": 1.1109557151794434, + "rewards/rejected": 2.1650521755218506, + "step": 8572 + }, + { + "epoch": 1.39, + "learning_rate": 2.237466279996986e-06, + "logits/chosen": -1.3500865697860718, + "logits/rejected": -1.469844937324524, + "logps/chosen": -103.76826477050781, + "logps/rejected": -142.80984497070312, + "loss": 2.4667, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.594020128250122, + "rewards/margins": -3.2389237880706787, + "rewards/rejected": 6.832943916320801, + "step": 8573 + }, + { + "epoch": 1.39, + "learning_rate": 2.236370933695549e-06, + "logits/chosen": -1.4063825607299805, + "logits/rejected": -1.3968901634216309, + "logps/chosen": -85.43065643310547, + "logps/rejected": -84.51527404785156, + "loss": 0.5822, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.310736060142517, + "rewards/margins": -0.7772887945175171, + "rewards/rejected": 2.088024854660034, + "step": 8574 + }, + { + "epoch": 1.39, + "learning_rate": 2.2352757783347335e-06, + "logits/chosen": -1.301706075668335, + "logits/rejected": -1.3247807025909424, + "logps/chosen": -78.85794830322266, + "logps/rejected": -83.82145690917969, + "loss": 0.1744, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.243898868560791, + "rewards/margins": 1.3002526760101318, + "rewards/rejected": 3.943646192550659, + "step": 8575 + }, + { + "epoch": 1.39, + "learning_rate": 2.234180813990199e-06, + "logits/chosen": -0.8203490972518921, + "logits/rejected": -0.8089275360107422, + "logps/chosen": -0.8449994921684265, + "logps/rejected": -3.5278468132019043, + "loss": 0.3398, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28336963057518005, + "rewards/margins": 0.11108015477657318, + "rewards/rejected": 0.17228947579860687, + "step": 8576 + }, + { + "epoch": 1.39, + "learning_rate": 2.2330860407376027e-06, + "logits/chosen": -1.1549794673919678, + "logits/rejected": -1.1524693965911865, + "logps/chosen": -52.013275146484375, + "logps/rejected": -56.14583969116211, + "loss": 0.6763, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.226680040359497, + "rewards/margins": 2.149290084838867, + "rewards/rejected": 1.0773899555206299, + "step": 8577 + }, + { + "epoch": 1.39, + "learning_rate": 2.2319914586525776e-06, + "logits/chosen": -0.9809325933456421, + "logits/rejected": -1.1118921041488647, + "logps/chosen": -20.137636184692383, + "logps/rejected": -52.693504333496094, + "loss": 2.0845, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7698206305503845, + "rewards/margins": -4.15308141708374, + "rewards/rejected": 4.9229021072387695, + "step": 8578 + }, + { + "epoch": 1.39, + "learning_rate": 2.230897067810754e-06, + "logits/chosen": -1.2042174339294434, + "logits/rejected": -1.191887617111206, + "logps/chosen": -69.79847717285156, + "logps/rejected": -73.82899475097656, + "loss": 0.3139, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.114565372467041, + "rewards/margins": 0.8425843715667725, + "rewards/rejected": 2.2719810009002686, + "step": 8579 + }, + { + "epoch": 1.39, + "learning_rate": 2.2298028682877393e-06, + "logits/chosen": -1.272809624671936, + "logits/rejected": -1.270265817642212, + "logps/chosen": -114.60603332519531, + "logps/rejected": -96.46600341796875, + "loss": 1.2958, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.9788269996643066, + "rewards/margins": -2.5090150833129883, + "rewards/rejected": 6.487842082977295, + "step": 8580 + }, + { + "epoch": 1.39, + "learning_rate": 2.228708860159136e-06, + "logits/chosen": -1.5487998723983765, + "logits/rejected": -1.3950355052947998, + "logps/chosen": -155.14718627929688, + "logps/rejected": -42.10991668701172, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.640301704406738, + "rewards/margins": 4.645228385925293, + "rewards/rejected": 2.995073080062866, + "step": 8581 + }, + { + "epoch": 1.39, + "learning_rate": 2.227615043500527e-06, + "logits/chosen": -1.4000744819641113, + "logits/rejected": -1.1308172941207886, + "logps/chosen": -140.42417907714844, + "logps/rejected": -19.819990158081055, + "loss": 0.0718, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.898889064788818, + "rewards/margins": 7.880463123321533, + "rewards/rejected": 0.018425941467285156, + "step": 8582 + }, + { + "epoch": 1.39, + "learning_rate": 2.2265214183874877e-06, + "logits/chosen": -1.278316617012024, + "logits/rejected": -1.2376048564910889, + "logps/chosen": -72.97601318359375, + "logps/rejected": -64.69929504394531, + "loss": 2.4694, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4035751819610596, + "rewards/margins": 0.12587130069732666, + "rewards/rejected": 1.277703881263733, + "step": 8583 + }, + { + "epoch": 1.39, + "learning_rate": 2.2254279848955736e-06, + "logits/chosen": -1.0107285976409912, + "logits/rejected": -0.9284160137176514, + "logps/chosen": -27.244915008544922, + "logps/rejected": -44.494056701660156, + "loss": 1.1064, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.533552646636963, + "rewards/margins": 0.6580685377120972, + "rewards/rejected": 1.8754841089248657, + "step": 8584 + }, + { + "epoch": 1.39, + "learning_rate": 2.2243347431003344e-06, + "logits/chosen": -1.698399543762207, + "logits/rejected": -1.7581278085708618, + "logps/chosen": -116.12863159179688, + "logps/rejected": -87.2020492553711, + "loss": 0.2139, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.999890327453613, + "rewards/margins": 1.7539877891540527, + "rewards/rejected": 4.2459025382995605, + "step": 8585 + }, + { + "epoch": 1.39, + "learning_rate": 2.2232416930772986e-06, + "logits/chosen": -0.9188557863235474, + "logits/rejected": -0.9298407435417175, + "logps/chosen": -1.1322556734085083, + "logps/rejected": -23.01472282409668, + "loss": 0.4343, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37408193945884705, + "rewards/margins": 0.12544474005699158, + "rewards/rejected": 0.24863719940185547, + "step": 8586 + }, + { + "epoch": 1.39, + "learning_rate": 2.2221488349019903e-06, + "logits/chosen": -0.9596850872039795, + "logits/rejected": -0.9253052473068237, + "logps/chosen": -114.16740417480469, + "logps/rejected": -80.31073760986328, + "loss": 0.4038, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.953723192214966, + "rewards/margins": 0.6774849891662598, + "rewards/rejected": 2.276238203048706, + "step": 8587 + }, + { + "epoch": 1.39, + "learning_rate": 2.221056168649911e-06, + "logits/chosen": -1.3492566347122192, + "logits/rejected": -1.3800331354141235, + "logps/chosen": -53.77410125732422, + "logps/rejected": -83.62432098388672, + "loss": 1.8329, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.311260223388672, + "rewards/margins": -3.2269396781921387, + "rewards/rejected": 6.5381999015808105, + "step": 8588 + }, + { + "epoch": 1.39, + "learning_rate": 2.2199636943965564e-06, + "logits/chosen": -1.3805713653564453, + "logits/rejected": -1.3060121536254883, + "logps/chosen": -86.38954162597656, + "logps/rejected": -84.39566802978516, + "loss": 0.3877, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.937901496887207, + "rewards/margins": 3.284587860107422, + "rewards/rejected": 4.653313636779785, + "step": 8589 + }, + { + "epoch": 1.39, + "learning_rate": 2.2188714122174064e-06, + "logits/chosen": -1.1732754707336426, + "logits/rejected": -1.174329161643982, + "logps/chosen": -1.6821115016937256, + "logps/rejected": -1.5701415538787842, + "loss": 0.4294, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28150713443756104, + "rewards/margins": 0.08099044859409332, + "rewards/rejected": 0.2005166858434677, + "step": 8590 + }, + { + "epoch": 1.39, + "learning_rate": 2.2177793221879256e-06, + "logits/chosen": -1.4257813692092896, + "logits/rejected": -1.357883334159851, + "logps/chosen": -97.37359619140625, + "logps/rejected": -35.54216766357422, + "loss": 0.4567, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2574737071990967, + "rewards/margins": -0.35129785537719727, + "rewards/rejected": 2.608771562576294, + "step": 8591 + }, + { + "epoch": 1.39, + "learning_rate": 2.2166874243835696e-06, + "logits/chosen": -1.098618507385254, + "logits/rejected": -1.1699110269546509, + "logps/chosen": -51.63063049316406, + "logps/rejected": -62.85342788696289, + "loss": 0.8677, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2522568702697754, + "rewards/margins": 1.2524632215499878, + "rewards/rejected": 1.9997936487197876, + "step": 8592 + }, + { + "epoch": 1.39, + "learning_rate": 2.2155957188797745e-06, + "logits/chosen": -1.3655705451965332, + "logits/rejected": -1.3781912326812744, + "logps/chosen": -63.0052604675293, + "logps/rejected": -74.57272338867188, + "loss": 0.3132, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.227064847946167, + "rewards/margins": 1.9154101610183716, + "rewards/rejected": 0.311654657125473, + "step": 8593 + }, + { + "epoch": 1.39, + "learning_rate": 2.214504205751971e-06, + "logits/chosen": -1.3028794527053833, + "logits/rejected": -1.2731341123580933, + "logps/chosen": -85.85006713867188, + "logps/rejected": -91.02978515625, + "loss": 1.2222, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.837014079093933, + "rewards/margins": -2.2196431159973145, + "rewards/rejected": 4.056657314300537, + "step": 8594 + }, + { + "epoch": 1.4, + "learning_rate": 2.2134128850755688e-06, + "logits/chosen": -1.2738149166107178, + "logits/rejected": -1.29172682762146, + "logps/chosen": -53.38656234741211, + "logps/rejected": -55.27143096923828, + "loss": 0.5974, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.9543819427490234, + "rewards/margins": -0.3019289970397949, + "rewards/rejected": 4.256310939788818, + "step": 8595 + }, + { + "epoch": 1.4, + "learning_rate": 2.212321756925971e-06, + "logits/chosen": -1.0072189569473267, + "logits/rejected": -1.0353574752807617, + "logps/chosen": -3.117649793624878, + "logps/rejected": -29.767467498779297, + "loss": 0.8812, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.29474207758903503, + "rewards/margins": -0.6620019674301147, + "rewards/rejected": 0.9567440152168274, + "step": 8596 + }, + { + "epoch": 1.4, + "learning_rate": 2.2112308213785598e-06, + "logits/chosen": -1.186597466468811, + "logits/rejected": -1.2225780487060547, + "logps/chosen": -23.36034393310547, + "logps/rejected": -42.80512619018555, + "loss": 1.0596, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3313912153244019, + "rewards/margins": -1.1779261827468872, + "rewards/rejected": 2.509317398071289, + "step": 8597 + }, + { + "epoch": 1.4, + "learning_rate": 2.210140078508714e-06, + "logits/chosen": -1.4563547372817993, + "logits/rejected": -1.4203550815582275, + "logps/chosen": -85.0569076538086, + "logps/rejected": -221.8984375, + "loss": 1.0511, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.432718753814697, + "rewards/margins": -1.8230032920837402, + "rewards/rejected": 9.255722045898438, + "step": 8598 + }, + { + "epoch": 1.4, + "learning_rate": 2.209049528391789e-06, + "logits/chosen": -0.8224910497665405, + "logits/rejected": -0.8736858367919922, + "logps/chosen": -86.87188720703125, + "logps/rejected": -60.450714111328125, + "loss": 0.7438, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4411605596542358, + "rewards/margins": -0.39297640323638916, + "rewards/rejected": 1.834136962890625, + "step": 8599 + }, + { + "epoch": 1.4, + "learning_rate": 2.2079591711031353e-06, + "logits/chosen": -0.9942559003829956, + "logits/rejected": -1.0593717098236084, + "logps/chosen": -64.90707397460938, + "logps/rejected": -140.72213745117188, + "loss": 0.7423, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3839691877365112, + "rewards/margins": -0.44133293628692627, + "rewards/rejected": 1.8253021240234375, + "step": 8600 + }, + { + "epoch": 1.4, + "learning_rate": 2.206869006718082e-06, + "logits/chosen": -1.1666338443756104, + "logits/rejected": -1.1105775833129883, + "logps/chosen": -106.1951904296875, + "logps/rejected": -109.22602081298828, + "loss": 0.1443, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.966929912567139, + "rewards/margins": 2.537774085998535, + "rewards/rejected": 3.4291558265686035, + "step": 8601 + }, + { + "epoch": 1.4, + "learning_rate": 2.2057790353119533e-06, + "logits/chosen": -1.0132689476013184, + "logits/rejected": -1.0229759216308594, + "logps/chosen": -14.82843017578125, + "logps/rejected": -1.4817919731140137, + "loss": 0.4084, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2984790802001953, + "rewards/margins": -0.1686493456363678, + "rewards/rejected": 0.4671284258365631, + "step": 8602 + }, + { + "epoch": 1.4, + "learning_rate": 2.2046892569600532e-06, + "logits/chosen": -1.5252496004104614, + "logits/rejected": -1.575453281402588, + "logps/chosen": -54.743812561035156, + "logps/rejected": -152.20452880859375, + "loss": 0.9506, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.948362112045288, + "rewards/margins": -1.6997487545013428, + "rewards/rejected": 4.648110866546631, + "step": 8603 + }, + { + "epoch": 1.4, + "learning_rate": 2.203599671737677e-06, + "logits/chosen": -1.4505199193954468, + "logits/rejected": -1.200670599937439, + "logps/chosen": -68.43986511230469, + "logps/rejected": -28.963645935058594, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.359901428222656, + "rewards/margins": 3.705707550048828, + "rewards/rejected": 0.6541938781738281, + "step": 8604 + }, + { + "epoch": 1.4, + "learning_rate": 2.202510279720102e-06, + "logits/chosen": -1.3765945434570312, + "logits/rejected": -1.3664969205856323, + "logps/chosen": -51.51264572143555, + "logps/rejected": -43.84800338745117, + "loss": 0.4734, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.50982403755188, + "rewards/margins": -0.4509453773498535, + "rewards/rejected": 3.9607694149017334, + "step": 8605 + }, + { + "epoch": 1.4, + "learning_rate": 2.201421080982598e-06, + "logits/chosen": -1.1072880029678345, + "logits/rejected": -1.1196078062057495, + "logps/chosen": -42.322853088378906, + "logps/rejected": -67.86526489257812, + "loss": 3.9244, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6678627729415894, + "rewards/margins": -2.302950382232666, + "rewards/rejected": 3.970813035964966, + "step": 8606 + }, + { + "epoch": 1.4, + "learning_rate": 2.2003320756004154e-06, + "logits/chosen": -1.2615996599197388, + "logits/rejected": -1.238476037979126, + "logps/chosen": -61.585689544677734, + "logps/rejected": -40.61202621459961, + "loss": 0.3101, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.334890365600586, + "rewards/margins": 0.99053955078125, + "rewards/rejected": 1.344350814819336, + "step": 8607 + }, + { + "epoch": 1.4, + "learning_rate": 2.1992432636487976e-06, + "logits/chosen": -1.0655523538589478, + "logits/rejected": -1.041306495666504, + "logps/chosen": -57.077877044677734, + "logps/rejected": -62.44285583496094, + "loss": 0.3563, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.106821060180664, + "rewards/margins": 1.0283863544464111, + "rewards/rejected": 2.078434705734253, + "step": 8608 + }, + { + "epoch": 1.4, + "learning_rate": 2.198154645202968e-06, + "logits/chosen": -1.9045608043670654, + "logits/rejected": -1.9693715572357178, + "logps/chosen": -42.35226058959961, + "logps/rejected": -115.70225524902344, + "loss": 1.0721, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8488426208496094, + "rewards/margins": -2.000673770904541, + "rewards/rejected": 3.8495163917541504, + "step": 8609 + }, + { + "epoch": 1.4, + "learning_rate": 2.1970662203381423e-06, + "logits/chosen": -1.0744379758834839, + "logits/rejected": -1.1038280725479126, + "logps/chosen": -50.47624206542969, + "logps/rejected": -67.19548034667969, + "loss": 0.7804, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.897350311279297, + "rewards/margins": 0.9431478977203369, + "rewards/rejected": 2.95420241355896, + "step": 8610 + }, + { + "epoch": 1.4, + "learning_rate": 2.1959779891295167e-06, + "logits/chosen": -1.1203376054763794, + "logits/rejected": -1.071714997291565, + "logps/chosen": -52.69076919555664, + "logps/rejected": -26.28271484375, + "loss": 0.6563, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1161205768585205, + "rewards/margins": -0.6291217803955078, + "rewards/rejected": 2.7452423572540283, + "step": 8611 + }, + { + "epoch": 1.4, + "learning_rate": 2.194889951652283e-06, + "logits/chosen": -1.2880581617355347, + "logits/rejected": -1.2057124376296997, + "logps/chosen": -139.35391235351562, + "logps/rejected": -83.66183471679688, + "loss": 0.2149, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1755340099334717, + "rewards/margins": 0.6393507719039917, + "rewards/rejected": 1.53618323802948, + "step": 8612 + }, + { + "epoch": 1.4, + "learning_rate": 2.1938021079816097e-06, + "logits/chosen": -1.3168010711669922, + "logits/rejected": -1.231926679611206, + "logps/chosen": -179.01805114746094, + "logps/rejected": -141.8370361328125, + "loss": 0.2071, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.450664043426514, + "rewards/margins": 0.6854815483093262, + "rewards/rejected": 5.7651824951171875, + "step": 8613 + }, + { + "epoch": 1.4, + "learning_rate": 2.1927144581926597e-06, + "logits/chosen": -1.3785203695297241, + "logits/rejected": -1.3316078186035156, + "logps/chosen": -40.543357849121094, + "logps/rejected": -60.40032958984375, + "loss": 1.0134, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3218421936035156, + "rewards/margins": -1.7170038223266602, + "rewards/rejected": 5.038846015930176, + "step": 8614 + }, + { + "epoch": 1.4, + "learning_rate": 2.191627002360576e-06, + "logits/chosen": -1.458222508430481, + "logits/rejected": -1.4567509889602661, + "logps/chosen": -113.00205993652344, + "logps/rejected": -70.0921630859375, + "loss": 0.1973, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0559966564178467, + "rewards/margins": 1.3386962413787842, + "rewards/rejected": 1.7173004150390625, + "step": 8615 + }, + { + "epoch": 1.4, + "learning_rate": 2.190539740560495e-06, + "logits/chosen": -1.2791129350662231, + "logits/rejected": -1.1604140996932983, + "logps/chosen": -47.917091369628906, + "logps/rejected": -27.32832145690918, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.627437591552734, + "rewards/margins": 4.080185890197754, + "rewards/rejected": 0.5472517013549805, + "step": 8616 + }, + { + "epoch": 1.4, + "learning_rate": 2.189452672867533e-06, + "logits/chosen": -1.0012372732162476, + "logits/rejected": -0.9585055708885193, + "logps/chosen": -101.96336364746094, + "logps/rejected": -64.31315612792969, + "loss": 0.4092, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5815627574920654, + "rewards/margins": 1.4494261741638184, + "rewards/rejected": 2.132136583328247, + "step": 8617 + }, + { + "epoch": 1.4, + "learning_rate": 2.1883657993567993e-06, + "logits/chosen": -1.4831736087799072, + "logits/rejected": -1.4740123748779297, + "logps/chosen": -85.80029296875, + "logps/rejected": -51.832557678222656, + "loss": 0.6787, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9842820167541504, + "rewards/margins": -1.0458197593688965, + "rewards/rejected": 4.030101776123047, + "step": 8618 + }, + { + "epoch": 1.4, + "learning_rate": 2.187279120103383e-06, + "logits/chosen": -1.513129472732544, + "logits/rejected": -1.4894293546676636, + "logps/chosen": -198.18592834472656, + "logps/rejected": -45.03059387207031, + "loss": 0.0675, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.943455696105957, + "rewards/margins": 9.797101974487305, + "rewards/rejected": 0.14635352790355682, + "step": 8619 + }, + { + "epoch": 1.4, + "learning_rate": 2.1861926351823675e-06, + "logits/chosen": -1.2903908491134644, + "logits/rejected": -1.2060924768447876, + "logps/chosen": -151.20932006835938, + "logps/rejected": -84.81895446777344, + "loss": 0.2517, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.827972412109375, + "rewards/margins": 1.012751817703247, + "rewards/rejected": 3.815220594406128, + "step": 8620 + }, + { + "epoch": 1.4, + "learning_rate": 2.185106344668814e-06, + "logits/chosen": -0.7620283961296082, + "logits/rejected": -0.7620283961296082, + "logps/chosen": -41.46860122680664, + "logps/rejected": -41.46860122680664, + "loss": 0.7732, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.325892210006714, + "rewards/margins": 0.0, + "rewards/rejected": 2.325892210006714, + "step": 8621 + }, + { + "epoch": 1.4, + "learning_rate": 2.1840202486377797e-06, + "logits/chosen": -1.2956925630569458, + "logits/rejected": -1.3264098167419434, + "logps/chosen": -49.69157791137695, + "logps/rejected": -61.455810546875, + "loss": 0.7472, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9761295318603516, + "rewards/margins": -1.2276928424835205, + "rewards/rejected": 3.203822374343872, + "step": 8622 + }, + { + "epoch": 1.4, + "learning_rate": 2.1829343471642995e-06, + "logits/chosen": -0.9282343983650208, + "logits/rejected": -0.8796514272689819, + "logps/chosen": -99.13011169433594, + "logps/rejected": -117.649169921875, + "loss": 0.2878, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9737510681152344, + "rewards/margins": 0.34723734855651855, + "rewards/rejected": 1.6265137195587158, + "step": 8623 + }, + { + "epoch": 1.4, + "learning_rate": 2.1818486403234e-06, + "logits/chosen": -1.2212626934051514, + "logits/rejected": -1.149125576019287, + "logps/chosen": -65.87065887451172, + "logps/rejected": -38.387733459472656, + "loss": 0.7704, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3072259426116943, + "rewards/margins": -1.2458784580230713, + "rewards/rejected": 3.5531044006347656, + "step": 8624 + }, + { + "epoch": 1.4, + "learning_rate": 2.1807631281900967e-06, + "logits/chosen": -1.3900020122528076, + "logits/rejected": -1.408016324043274, + "logps/chosen": -60.49781799316406, + "logps/rejected": -58.50261306762695, + "loss": 0.7201, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9956573247909546, + "rewards/margins": -1.0470823049545288, + "rewards/rejected": 3.0427396297454834, + "step": 8625 + }, + { + "epoch": 1.4, + "learning_rate": 2.1796778108393824e-06, + "logits/chosen": -0.9971950650215149, + "logits/rejected": -1.018662929534912, + "logps/chosen": -52.17908477783203, + "logps/rejected": -77.40196228027344, + "loss": 0.525, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8708336353302, + "rewards/margins": 0.7605705261230469, + "rewards/rejected": 2.1102631092071533, + "step": 8626 + }, + { + "epoch": 1.4, + "learning_rate": 2.1785926883462475e-06, + "logits/chosen": -0.7492368817329407, + "logits/rejected": -0.7492368817329407, + "logps/chosen": -16.057552337646484, + "logps/rejected": -16.057552337646484, + "loss": 0.3776, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6790760159492493, + "rewards/margins": 0.0, + "rewards/rejected": 0.6790760159492493, + "step": 8627 + }, + { + "epoch": 1.4, + "learning_rate": 2.17750776078566e-06, + "logits/chosen": -1.2774343490600586, + "logits/rejected": -1.2581335306167603, + "logps/chosen": -55.007755279541016, + "logps/rejected": -57.735252380371094, + "loss": 1.1464, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7031140327453613, + "rewards/margins": -1.5082011222839355, + "rewards/rejected": 4.211315155029297, + "step": 8628 + }, + { + "epoch": 1.4, + "learning_rate": 2.1764230282325806e-06, + "logits/chosen": -1.2714998722076416, + "logits/rejected": -1.5002646446228027, + "logps/chosen": -42.737953186035156, + "logps/rejected": -62.501976013183594, + "loss": 2.8077, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.978630781173706, + "rewards/margins": -5.251607894897461, + "rewards/rejected": 8.230238914489746, + "step": 8629 + }, + { + "epoch": 1.4, + "learning_rate": 2.1753384907619517e-06, + "logits/chosen": -0.977735161781311, + "logits/rejected": -0.9782204627990723, + "logps/chosen": -32.99131774902344, + "logps/rejected": -41.79081726074219, + "loss": 1.5678, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3410625457763672, + "rewards/margins": -2.3657803535461426, + "rewards/rejected": 3.7068428993225098, + "step": 8630 + }, + { + "epoch": 1.4, + "learning_rate": 2.174254148448708e-06, + "logits/chosen": -1.0888643264770508, + "logits/rejected": -1.2403324842453003, + "logps/chosen": -68.96112823486328, + "logps/rejected": -121.31963348388672, + "loss": 1.5618, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2849128246307373, + "rewards/margins": -2.3437716960906982, + "rewards/rejected": 4.6286845207214355, + "step": 8631 + }, + { + "epoch": 1.4, + "learning_rate": 2.1731700013677628e-06, + "logits/chosen": -1.1851236820220947, + "logits/rejected": -1.09781014919281, + "logps/chosen": -85.08415985107422, + "logps/rejected": -114.43948364257812, + "loss": 0.7838, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.755544185638428, + "rewards/margins": 1.0478200912475586, + "rewards/rejected": 3.707724094390869, + "step": 8632 + }, + { + "epoch": 1.4, + "learning_rate": 2.1720860495940242e-06, + "logits/chosen": -0.808449923992157, + "logits/rejected": -0.808449923992157, + "logps/chosen": -3.8876776695251465, + "logps/rejected": -3.8876776695251465, + "loss": 0.462, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8522998690605164, + "rewards/margins": 0.0, + "rewards/rejected": 0.8522998690605164, + "step": 8633 + }, + { + "epoch": 1.4, + "learning_rate": 2.1710022932023805e-06, + "logits/chosen": -1.4723023176193237, + "logits/rejected": -1.4679933786392212, + "logps/chosen": -140.66481018066406, + "logps/rejected": -70.62237548828125, + "loss": 0.2336, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3485000133514404, + "rewards/margins": 0.7053709030151367, + "rewards/rejected": 2.6431291103363037, + "step": 8634 + }, + { + "epoch": 1.4, + "learning_rate": 2.169918732267712e-06, + "logits/chosen": -0.9519844651222229, + "logits/rejected": -0.9519844651222229, + "logps/chosen": -87.77313232421875, + "logps/rejected": -87.77313232421875, + "loss": 2.0473, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.604156494140625, + "rewards/margins": 0.0, + "rewards/rejected": 1.604156494140625, + "step": 8635 + }, + { + "epoch": 1.4, + "learning_rate": 2.1688353668648787e-06, + "logits/chosen": -1.2102867364883423, + "logits/rejected": -1.1561089754104614, + "logps/chosen": -59.7043571472168, + "logps/rejected": -15.627673149108887, + "loss": 0.0761, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.188342809677124, + "rewards/margins": 1.8319977521896362, + "rewards/rejected": 0.3563450872898102, + "step": 8636 + }, + { + "epoch": 1.4, + "learning_rate": 2.167752197068734e-06, + "logits/chosen": -1.279937744140625, + "logits/rejected": -1.2620104551315308, + "logps/chosen": -39.998451232910156, + "logps/rejected": -35.34757995605469, + "loss": 0.3315, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0510536432266235, + "rewards/margins": 0.3534232974052429, + "rewards/rejected": 0.6976303458213806, + "step": 8637 + }, + { + "epoch": 1.4, + "learning_rate": 2.1666692229541126e-06, + "logits/chosen": -1.2230451107025146, + "logits/rejected": -1.2230451107025146, + "logps/chosen": -0.05608098953962326, + "logps/rejected": -0.05608098953962326, + "loss": 0.3648, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.08226317167282104, + "rewards/margins": 0.0, + "rewards/rejected": 0.08226317167282104, + "step": 8638 + }, + { + "epoch": 1.4, + "learning_rate": 2.1655864445958403e-06, + "logits/chosen": -0.9057059288024902, + "logits/rejected": -0.9639232754707336, + "logps/chosen": -13.966913223266602, + "logps/rejected": -29.441654205322266, + "loss": 0.4757, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9353694915771484, + "rewards/margins": 0.062255859375, + "rewards/rejected": 0.8731136322021484, + "step": 8639 + }, + { + "epoch": 1.4, + "learning_rate": 2.164503862068723e-06, + "logits/chosen": -1.4309959411621094, + "logits/rejected": -1.4309959411621094, + "logps/chosen": -61.803367614746094, + "logps/rejected": -61.803367614746094, + "loss": 0.9039, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.136878967285156, + "rewards/margins": 0.0, + "rewards/rejected": 4.136878967285156, + "step": 8640 + }, + { + "epoch": 1.4, + "learning_rate": 2.163421475447563e-06, + "logits/chosen": -1.4190644025802612, + "logits/rejected": -1.343201756477356, + "logps/chosen": -74.14263916015625, + "logps/rejected": -44.83902359008789, + "loss": 0.3376, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.210019111633301, + "rewards/margins": 2.4017796516418457, + "rewards/rejected": 2.808239459991455, + "step": 8641 + }, + { + "epoch": 1.4, + "learning_rate": 2.162339284807136e-06, + "logits/chosen": -1.5024940967559814, + "logits/rejected": -1.502721905708313, + "logps/chosen": -215.74081420898438, + "logps/rejected": -23.77243995666504, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.726492404937744, + "rewards/margins": 6.0771484375, + "rewards/rejected": 1.6493440866470337, + "step": 8642 + }, + { + "epoch": 1.4, + "learning_rate": 2.161257290222213e-06, + "logits/chosen": -1.0147862434387207, + "logits/rejected": -1.0397464036941528, + "logps/chosen": -37.00907897949219, + "logps/rejected": -59.072113037109375, + "loss": 0.3417, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4291934967041016, + "rewards/margins": 0.12238574028015137, + "rewards/rejected": 2.30680775642395, + "step": 8643 + }, + { + "epoch": 1.4, + "learning_rate": 2.160175491767553e-06, + "logits/chosen": -1.4287443161010742, + "logits/rejected": -1.4090219736099243, + "logps/chosen": -59.9008674621582, + "logps/rejected": -78.57872009277344, + "loss": 0.0615, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.005457878112793, + "rewards/margins": 3.241539478302002, + "rewards/rejected": 4.763918399810791, + "step": 8644 + }, + { + "epoch": 1.4, + "learning_rate": 2.159093889517895e-06, + "logits/chosen": -1.3519469499588013, + "logits/rejected": -1.2825764417648315, + "logps/chosen": -42.15746307373047, + "logps/rejected": -14.445327758789062, + "loss": 0.4916, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.914777398109436, + "rewards/margins": 1.5300496816635132, + "rewards/rejected": 0.38472768664360046, + "step": 8645 + }, + { + "epoch": 1.4, + "learning_rate": 2.158012483547969e-06, + "logits/chosen": -0.8546649217605591, + "logits/rejected": -0.8624438047409058, + "logps/chosen": -29.420536041259766, + "logps/rejected": -33.61548614501953, + "loss": 1.9498, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.01288375910371542, + "rewards/margins": -0.10989532619714737, + "rewards/rejected": 0.09701156616210938, + "step": 8646 + }, + { + "epoch": 1.4, + "learning_rate": 2.156931273932488e-06, + "logits/chosen": -1.3222190141677856, + "logits/rejected": -1.3890188932418823, + "logps/chosen": -62.48999786376953, + "logps/rejected": -187.70330810546875, + "loss": 0.4633, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.505469560623169, + "rewards/margins": -0.2789757251739502, + "rewards/rejected": 2.784445285797119, + "step": 8647 + }, + { + "epoch": 1.4, + "learning_rate": 2.1558502607461564e-06, + "logits/chosen": -1.0532900094985962, + "logits/rejected": -1.1154632568359375, + "logps/chosen": -1.1294071674346924, + "logps/rejected": -46.521461486816406, + "loss": 0.4116, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.42139387130737305, + "rewards/margins": -0.05367976427078247, + "rewards/rejected": 0.4750736355781555, + "step": 8648 + }, + { + "epoch": 1.4, + "learning_rate": 2.154769444063658e-06, + "logits/chosen": -1.030868411064148, + "logits/rejected": -0.9042487740516663, + "logps/chosen": -36.14785385131836, + "logps/rejected": -16.122011184692383, + "loss": 1.2999, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.139172077178955, + "rewards/margins": 1.198021411895752, + "rewards/rejected": 0.9411506652832031, + "step": 8649 + }, + { + "epoch": 1.4, + "learning_rate": 2.1536888239596714e-06, + "logits/chosen": -1.3195267915725708, + "logits/rejected": -1.2946819067001343, + "logps/chosen": -67.69633483886719, + "logps/rejected": -71.17964172363281, + "loss": 1.0283, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1713013648986816, + "rewards/margins": 0.48127055168151855, + "rewards/rejected": 2.690030813217163, + "step": 8650 + }, + { + "epoch": 1.4, + "learning_rate": 2.1526084005088533e-06, + "logits/chosen": -1.4575220346450806, + "logits/rejected": -1.347659707069397, + "logps/chosen": -44.82250213623047, + "logps/rejected": -19.354490280151367, + "loss": 0.6161, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4248398542404175, + "rewards/margins": 0.9462301731109619, + "rewards/rejected": 0.4786096513271332, + "step": 8651 + }, + { + "epoch": 1.4, + "learning_rate": 2.151528173785854e-06, + "logits/chosen": -1.2220882177352905, + "logits/rejected": -1.0075616836547852, + "logps/chosen": -115.0806655883789, + "logps/rejected": -46.08357620239258, + "loss": 0.0586, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.81013298034668, + "rewards/margins": 5.2261505126953125, + "rewards/rejected": 3.583982467651367, + "step": 8652 + }, + { + "epoch": 1.4, + "learning_rate": 2.1504481438653035e-06, + "logits/chosen": -1.3976259231567383, + "logits/rejected": -1.4122463464736938, + "logps/chosen": -204.95069885253906, + "logps/rejected": -79.31324768066406, + "loss": 0.0317, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.42629861831665, + "rewards/margins": 2.888861894607544, + "rewards/rejected": 3.5374367237091064, + "step": 8653 + }, + { + "epoch": 1.4, + "learning_rate": 2.1493683108218254e-06, + "logits/chosen": -1.249586820602417, + "logits/rejected": -1.3051002025604248, + "logps/chosen": -71.08576965332031, + "logps/rejected": -54.21413803100586, + "loss": 1.242, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4938080310821533, + "rewards/margins": -0.8550007343292236, + "rewards/rejected": 2.348808765411377, + "step": 8654 + }, + { + "epoch": 1.4, + "learning_rate": 2.1482886747300226e-06, + "logits/chosen": -0.7262271046638489, + "logits/rejected": -0.7292015552520752, + "logps/chosen": -3.31150484085083, + "logps/rejected": -2.102938413619995, + "loss": 0.3338, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3254673480987549, + "rewards/margins": 0.052383214235305786, + "rewards/rejected": 0.2730841338634491, + "step": 8655 + }, + { + "epoch": 1.4, + "learning_rate": 2.1472092356644907e-06, + "logits/chosen": -1.0479613542556763, + "logits/rejected": -0.9867220520973206, + "logps/chosen": -44.691184997558594, + "logps/rejected": -37.471561431884766, + "loss": 1.9267, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.378725528717041, + "rewards/margins": 0.4297901391983032, + "rewards/rejected": 1.9489353895187378, + "step": 8656 + }, + { + "epoch": 1.41, + "learning_rate": 2.1461299936998054e-06, + "logits/chosen": -1.419453501701355, + "logits/rejected": -1.178174614906311, + "logps/chosen": -157.43643188476562, + "logps/rejected": -81.09419250488281, + "loss": 0.0325, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.310951232910156, + "rewards/margins": 2.7116880416870117, + "rewards/rejected": 4.5992631912231445, + "step": 8657 + }, + { + "epoch": 1.41, + "learning_rate": 2.145050948910536e-06, + "logits/chosen": -1.4221343994140625, + "logits/rejected": -1.3845769166946411, + "logps/chosen": -111.86528015136719, + "logps/rejected": -87.28022766113281, + "loss": 0.3357, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2668426036834717, + "rewards/margins": 0.04399704933166504, + "rewards/rejected": 2.2228455543518066, + "step": 8658 + }, + { + "epoch": 1.41, + "learning_rate": 2.1439721013712305e-06, + "logits/chosen": -0.9684154391288757, + "logits/rejected": -0.8954199552536011, + "logps/chosen": -97.3268814086914, + "logps/rejected": -37.15147399902344, + "loss": 0.6167, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3672478199005127, + "rewards/margins": -0.8282508850097656, + "rewards/rejected": 2.1954987049102783, + "step": 8659 + }, + { + "epoch": 1.41, + "learning_rate": 2.1428934511564304e-06, + "logits/chosen": -1.649106740951538, + "logits/rejected": -1.7292695045471191, + "logps/chosen": -105.05210876464844, + "logps/rejected": -70.72470092773438, + "loss": 0.2766, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.0568695068359375, + "rewards/margins": 1.265345811843872, + "rewards/rejected": 3.7915236949920654, + "step": 8660 + }, + { + "epoch": 1.41, + "learning_rate": 2.141814998340657e-06, + "logits/chosen": -1.46160888671875, + "logits/rejected": -1.3194876909255981, + "logps/chosen": -60.32801818847656, + "logps/rejected": -57.36848831176758, + "loss": 0.6362, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.303776741027832, + "rewards/margins": 1.644611120223999, + "rewards/rejected": 2.659165620803833, + "step": 8661 + }, + { + "epoch": 1.41, + "learning_rate": 2.140736742998424e-06, + "logits/chosen": -1.3262277841567993, + "logits/rejected": -1.3631045818328857, + "logps/chosen": -84.7755126953125, + "logps/rejected": -120.0123519897461, + "loss": 0.3507, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.666802883148193, + "rewards/margins": 0.25597286224365234, + "rewards/rejected": 6.410830020904541, + "step": 8662 + }, + { + "epoch": 1.41, + "learning_rate": 2.139658685204227e-06, + "logits/chosen": -1.0003703832626343, + "logits/rejected": -1.060200810432434, + "logps/chosen": -80.86161041259766, + "logps/rejected": -85.97542572021484, + "loss": 0.5137, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8339935541152954, + "rewards/margins": 0.053269147872924805, + "rewards/rejected": 1.7807244062423706, + "step": 8663 + }, + { + "epoch": 1.41, + "learning_rate": 2.1385808250325506e-06, + "logits/chosen": -1.121311068534851, + "logits/rejected": -1.134200930595398, + "logps/chosen": -35.95969009399414, + "logps/rejected": -63.805301666259766, + "loss": 0.7957, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.227450370788574, + "rewards/margins": -0.9775519371032715, + "rewards/rejected": 5.205002307891846, + "step": 8664 + }, + { + "epoch": 1.41, + "learning_rate": 2.1375031625578628e-06, + "logits/chosen": -1.3153703212738037, + "logits/rejected": -1.2724605798721313, + "logps/chosen": -48.1706657409668, + "logps/rejected": -14.051872253417969, + "loss": 0.4929, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7380935549736023, + "rewards/margins": -0.03104841709136963, + "rewards/rejected": 0.7691419720649719, + "step": 8665 + }, + { + "epoch": 1.41, + "learning_rate": 2.136425697854623e-06, + "logits/chosen": -0.9944869875907898, + "logits/rejected": -1.0225714445114136, + "logps/chosen": -77.65486145019531, + "logps/rejected": -65.97022247314453, + "loss": 0.1173, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3004043102264404, + "rewards/margins": 1.3961127996444702, + "rewards/rejected": 1.9042915105819702, + "step": 8666 + }, + { + "epoch": 1.41, + "learning_rate": 2.135348430997271e-06, + "logits/chosen": -1.4924631118774414, + "logits/rejected": -1.586276888847351, + "logps/chosen": -81.90327453613281, + "logps/rejected": -193.22213745117188, + "loss": 1.3516, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7857345342636108, + "rewards/margins": -2.612473964691162, + "rewards/rejected": 4.3982086181640625, + "step": 8667 + }, + { + "epoch": 1.41, + "learning_rate": 2.1342713620602377e-06, + "logits/chosen": -1.2943263053894043, + "logits/rejected": -1.2803508043289185, + "logps/chosen": -57.565101623535156, + "logps/rejected": -47.56414031982422, + "loss": 1.342, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2640061378479004, + "rewards/margins": -0.0802910327911377, + "rewards/rejected": 2.344297170639038, + "step": 8668 + }, + { + "epoch": 1.41, + "learning_rate": 2.1331944911179364e-06, + "logits/chosen": -1.0811432600021362, + "logits/rejected": -0.9664000868797302, + "logps/chosen": -64.64789581298828, + "logps/rejected": -16.176694869995117, + "loss": 0.3035, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9320846796035767, + "rewards/margins": 0.3450283408164978, + "rewards/rejected": 0.5870563387870789, + "step": 8669 + }, + { + "epoch": 1.41, + "learning_rate": 2.132117818244771e-06, + "logits/chosen": -1.5091884136199951, + "logits/rejected": -1.4971842765808105, + "logps/chosen": -110.08484649658203, + "logps/rejected": -55.28676986694336, + "loss": 1.2791, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8722022771835327, + "rewards/margins": -2.162156105041504, + "rewards/rejected": 4.034358501434326, + "step": 8670 + }, + { + "epoch": 1.41, + "learning_rate": 2.1310413435151266e-06, + "logits/chosen": -1.3561397790908813, + "logits/rejected": -1.3613189458847046, + "logps/chosen": -11.397560119628906, + "logps/rejected": -32.950965881347656, + "loss": 2.3792, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.271052598953247, + "rewards/margins": -1.5932297706604004, + "rewards/rejected": 3.8642823696136475, + "step": 8671 + }, + { + "epoch": 1.41, + "learning_rate": 2.1299650670033813e-06, + "logits/chosen": -0.9050146341323853, + "logits/rejected": -0.9232717752456665, + "logps/chosen": -67.97803497314453, + "logps/rejected": -69.39270782470703, + "loss": 0.2471, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8156325817108154, + "rewards/margins": 0.5033698081970215, + "rewards/rejected": 2.312262773513794, + "step": 8672 + }, + { + "epoch": 1.41, + "learning_rate": 2.128888988783891e-06, + "logits/chosen": -0.8932093381881714, + "logits/rejected": -0.8932093381881714, + "logps/chosen": -1.3865854740142822, + "logps/rejected": -1.3865854740142822, + "loss": 0.6905, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.22829802334308624, + "rewards/margins": 0.0, + "rewards/rejected": 0.22829802334308624, + "step": 8673 + }, + { + "epoch": 1.41, + "learning_rate": 2.127813108931007e-06, + "logits/chosen": -1.0782618522644043, + "logits/rejected": -1.1972004175186157, + "logps/chosen": -94.1916732788086, + "logps/rejected": -108.80479431152344, + "loss": 1.1802, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2054543495178223, + "rewards/margins": -2.0923194885253906, + "rewards/rejected": 5.297773838043213, + "step": 8674 + }, + { + "epoch": 1.41, + "learning_rate": 2.126737427519058e-06, + "logits/chosen": -0.8225682377815247, + "logits/rejected": -0.7413575053215027, + "logps/chosen": -47.26885986328125, + "logps/rejected": -38.535858154296875, + "loss": 0.3398, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.259947299957275, + "rewards/margins": 1.3648979663848877, + "rewards/rejected": 2.8950493335723877, + "step": 8675 + }, + { + "epoch": 1.41, + "learning_rate": 2.1256619446223675e-06, + "logits/chosen": -1.201291561126709, + "logits/rejected": -1.2990546226501465, + "logps/chosen": -49.53440856933594, + "logps/rejected": -71.57638549804688, + "loss": 1.0753, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7487343549728394, + "rewards/margins": -2.023967742919922, + "rewards/rejected": 3.7727019786834717, + "step": 8676 + }, + { + "epoch": 1.41, + "learning_rate": 2.1245866603152384e-06, + "logits/chosen": -1.3498826026916504, + "logits/rejected": -1.3316913843154907, + "logps/chosen": -73.27342224121094, + "logps/rejected": -59.91866683959961, + "loss": 0.5801, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9204262495040894, + "rewards/margins": 0.16182446479797363, + "rewards/rejected": 1.7586017847061157, + "step": 8677 + }, + { + "epoch": 1.41, + "learning_rate": 2.1235115746719647e-06, + "logits/chosen": -1.17475426197052, + "logits/rejected": -1.17475426197052, + "logps/chosen": -42.897361755371094, + "logps/rejected": -42.897361755371094, + "loss": 0.4035, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1754944324493408, + "rewards/margins": 0.0, + "rewards/rejected": 1.1754944324493408, + "step": 8678 + }, + { + "epoch": 1.41, + "learning_rate": 2.122436687766822e-06, + "logits/chosen": -1.031924843788147, + "logits/rejected": -0.9749001264572144, + "logps/chosen": -149.40008544921875, + "logps/rejected": -27.264917373657227, + "loss": 0.0927, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.742563247680664, + "rewards/margins": 4.9640398025512695, + "rewards/rejected": 3.7785232067108154, + "step": 8679 + }, + { + "epoch": 1.41, + "learning_rate": 2.1213619996740765e-06, + "logits/chosen": -1.3144209384918213, + "logits/rejected": -1.358307123184204, + "logps/chosen": -59.46824645996094, + "logps/rejected": -57.57822799682617, + "loss": 0.7965, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.956798553466797, + "rewards/margins": -0.011323213577270508, + "rewards/rejected": 2.9681217670440674, + "step": 8680 + }, + { + "epoch": 1.41, + "learning_rate": 2.1202875104679803e-06, + "logits/chosen": -0.9401268362998962, + "logits/rejected": -0.9956191182136536, + "logps/chosen": -52.17076110839844, + "logps/rejected": -66.953125, + "loss": 0.5449, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6084084510803223, + "rewards/margins": 0.11425256729125977, + "rewards/rejected": 2.4941558837890625, + "step": 8681 + }, + { + "epoch": 1.41, + "learning_rate": 2.1192132202227676e-06, + "logits/chosen": -1.271891474723816, + "logits/rejected": -1.384944200515747, + "logps/chosen": -74.90103912353516, + "logps/rejected": -115.16636657714844, + "loss": 1.2355, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2256126403808594, + "rewards/margins": -2.3142294883728027, + "rewards/rejected": 4.539842128753662, + "step": 8682 + }, + { + "epoch": 1.41, + "learning_rate": 2.1181391290126653e-06, + "logits/chosen": -1.074223518371582, + "logits/rejected": -1.0539357662200928, + "logps/chosen": -62.811763763427734, + "logps/rejected": -103.03797912597656, + "loss": 1.0839, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0569493770599365, + "rewards/margins": 0.43728387355804443, + "rewards/rejected": 1.619665503501892, + "step": 8683 + }, + { + "epoch": 1.41, + "learning_rate": 2.117065236911878e-06, + "logits/chosen": -1.4054166078567505, + "logits/rejected": -1.37531578540802, + "logps/chosen": -105.8642578125, + "logps/rejected": -40.36387252807617, + "loss": 0.5641, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8888320922851562, + "rewards/margins": -0.7345371246337891, + "rewards/rejected": 2.6233692169189453, + "step": 8684 + }, + { + "epoch": 1.41, + "learning_rate": 2.1159915439946073e-06, + "logits/chosen": -1.4438072443008423, + "logits/rejected": -1.3807059526443481, + "logps/chosen": -102.96269226074219, + "logps/rejected": -135.57936096191406, + "loss": 0.5162, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.689296245574951, + "rewards/margins": 0.7536106109619141, + "rewards/rejected": 5.935685634613037, + "step": 8685 + }, + { + "epoch": 1.41, + "learning_rate": 2.114918050335029e-06, + "logits/chosen": -0.837546706199646, + "logits/rejected": -0.8390042185783386, + "logps/chosen": -5.098857402801514, + "logps/rejected": -2.690356969833374, + "loss": 0.377, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4495762288570404, + "rewards/margins": 0.17160311341285706, + "rewards/rejected": 0.27797311544418335, + "step": 8686 + }, + { + "epoch": 1.41, + "learning_rate": 2.1138447560073172e-06, + "logits/chosen": -1.3059508800506592, + "logits/rejected": -1.2578550577163696, + "logps/chosen": -76.38328552246094, + "logps/rejected": -57.42716979980469, + "loss": 0.8769, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1773147583007812, + "rewards/margins": -1.562962532043457, + "rewards/rejected": 4.740277290344238, + "step": 8687 + }, + { + "epoch": 1.41, + "learning_rate": 2.112771661085622e-06, + "logits/chosen": -1.430652141571045, + "logits/rejected": -1.401654839515686, + "logps/chosen": -136.41419982910156, + "logps/rejected": -105.62367248535156, + "loss": 0.2627, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.515946865081787, + "rewards/margins": 0.6741452217102051, + "rewards/rejected": 6.841801643371582, + "step": 8688 + }, + { + "epoch": 1.41, + "learning_rate": 2.1116987656440874e-06, + "logits/chosen": -1.2081642150878906, + "logits/rejected": -1.0738346576690674, + "logps/chosen": -89.664306640625, + "logps/rejected": -56.03435516357422, + "loss": 0.5239, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.966059923171997, + "rewards/margins": 3.1786513328552246, + "rewards/rejected": 0.7874084711074829, + "step": 8689 + }, + { + "epoch": 1.41, + "learning_rate": 2.1106260697568364e-06, + "logits/chosen": -1.108709454536438, + "logits/rejected": -1.0554484128952026, + "logps/chosen": -42.768341064453125, + "logps/rejected": -34.58403778076172, + "loss": 3.8216, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6650627851486206, + "rewards/margins": -2.3054704666137695, + "rewards/rejected": 3.9705331325531006, + "step": 8690 + }, + { + "epoch": 1.41, + "learning_rate": 2.1095535734979873e-06, + "logits/chosen": -1.4544161558151245, + "logits/rejected": -1.4544161558151245, + "logps/chosen": -93.14265441894531, + "logps/rejected": -93.14265441894531, + "loss": 0.6583, + "rewards/accuracies": 0.0, + "rewards/chosen": 8.894706726074219, + "rewards/margins": 0.0, + "rewards/rejected": 8.894706726074219, + "step": 8691 + }, + { + "epoch": 1.41, + "learning_rate": 2.108481276941634e-06, + "logits/chosen": -1.3137611150741577, + "logits/rejected": -1.2985519170761108, + "logps/chosen": -97.0633316040039, + "logps/rejected": -75.77914428710938, + "loss": 1.6675, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.221067190170288, + "rewards/margins": 1.4205422401428223, + "rewards/rejected": 1.8005249500274658, + "step": 8692 + }, + { + "epoch": 1.41, + "learning_rate": 2.107409180161867e-06, + "logits/chosen": -1.0937464237213135, + "logits/rejected": -1.0871092081069946, + "logps/chosen": -14.104085922241211, + "logps/rejected": -5.912739276885986, + "loss": 0.3561, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.597084641456604, + "rewards/margins": 0.37124955654144287, + "rewards/rejected": 0.22583508491516113, + "step": 8693 + }, + { + "epoch": 1.41, + "learning_rate": 2.1063372832327535e-06, + "logits/chosen": -1.4362666606903076, + "logits/rejected": -1.4549437761306763, + "logps/chosen": -23.791467666625977, + "logps/rejected": -58.1171875, + "loss": 1.9139, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9646399021148682, + "rewards/margins": -3.014862298965454, + "rewards/rejected": 4.979502201080322, + "step": 8694 + }, + { + "epoch": 1.41, + "learning_rate": 2.105265586228355e-06, + "logits/chosen": -1.1961709260940552, + "logits/rejected": -0.8014983534812927, + "logps/chosen": -101.03407287597656, + "logps/rejected": -49.908203125, + "loss": 0.0285, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.6776628494262695, + "rewards/margins": 2.8375511169433594, + "rewards/rejected": 4.84011173248291, + "step": 8695 + }, + { + "epoch": 1.41, + "learning_rate": 2.104194089222713e-06, + "logits/chosen": -1.3634427785873413, + "logits/rejected": -1.226272463798523, + "logps/chosen": -112.79309844970703, + "logps/rejected": -42.46570587158203, + "loss": 0.3984, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.944588661193848, + "rewards/margins": 1.1126666069030762, + "rewards/rejected": 4.8319220542907715, + "step": 8696 + }, + { + "epoch": 1.41, + "learning_rate": 2.1031227922898614e-06, + "logits/chosen": -0.8359425067901611, + "logits/rejected": -0.8751626014709473, + "logps/chosen": -36.91133117675781, + "logps/rejected": -47.14643478393555, + "loss": 1.2155, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1909635066986084, + "rewards/margins": -0.07453060150146484, + "rewards/rejected": 2.2654941082000732, + "step": 8697 + }, + { + "epoch": 1.41, + "learning_rate": 2.1020516955038118e-06, + "logits/chosen": -0.7882602214813232, + "logits/rejected": -0.7882602214813232, + "logps/chosen": -0.8084120750427246, + "logps/rejected": -0.8084120750427246, + "loss": 0.7688, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5148455500602722, + "rewards/margins": 0.0, + "rewards/rejected": 0.5148455500602722, + "step": 8698 + }, + { + "epoch": 1.41, + "learning_rate": 2.1009807989385713e-06, + "logits/chosen": -1.1494604349136353, + "logits/rejected": -1.10197114944458, + "logps/chosen": -76.31268310546875, + "logps/rejected": -38.91204071044922, + "loss": 0.2523, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.563999891281128, + "rewards/margins": 1.626014232635498, + "rewards/rejected": 1.9379856586456299, + "step": 8699 + }, + { + "epoch": 1.41, + "learning_rate": 2.099910102668125e-06, + "logits/chosen": -1.2628952264785767, + "logits/rejected": -1.2496424913406372, + "logps/chosen": -92.90986633300781, + "logps/rejected": -79.48824310302734, + "loss": 0.2051, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.143357992172241, + "rewards/margins": 1.201492428779602, + "rewards/rejected": 1.9418655633926392, + "step": 8700 + }, + { + "epoch": 1.41, + "learning_rate": 2.0988396067664518e-06, + "logits/chosen": -1.2616957426071167, + "logits/rejected": -1.1525026559829712, + "logps/chosen": -86.07706451416016, + "logps/rejected": -21.781253814697266, + "loss": 0.4738, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7306183576583862, + "rewards/margins": 1.6187243461608887, + "rewards/rejected": 0.11189403384923935, + "step": 8701 + }, + { + "epoch": 1.41, + "learning_rate": 2.0977693113075088e-06, + "logits/chosen": -1.1470587253570557, + "logits/rejected": -1.206261157989502, + "logps/chosen": -65.28968048095703, + "logps/rejected": -27.979581832885742, + "loss": 0.2207, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4436593055725098, + "rewards/margins": 3.087512731552124, + "rewards/rejected": 0.3561466336250305, + "step": 8702 + }, + { + "epoch": 1.41, + "learning_rate": 2.096699216365247e-06, + "logits/chosen": -1.2832152843475342, + "logits/rejected": -1.2801889181137085, + "logps/chosen": -8.620877265930176, + "logps/rejected": -1.3534015417099, + "loss": 0.5928, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.269947350025177, + "rewards/margins": -0.14836210012435913, + "rewards/rejected": 0.41830945014953613, + "step": 8703 + }, + { + "epoch": 1.41, + "learning_rate": 2.095629322013596e-06, + "logits/chosen": -1.5294910669326782, + "logits/rejected": -1.5294910669326782, + "logps/chosen": -46.32575988769531, + "logps/rejected": -46.32575988769531, + "loss": 0.4031, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3459999561309814, + "rewards/margins": 0.0, + "rewards/rejected": 3.3459999561309814, + "step": 8704 + }, + { + "epoch": 1.41, + "learning_rate": 2.0945596283264794e-06, + "logits/chosen": -0.8712963461875916, + "logits/rejected": -0.856310248374939, + "logps/chosen": -15.016694068908691, + "logps/rejected": -7.172593116760254, + "loss": 0.7293, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0144823789596558, + "rewards/margins": 0.6647958755493164, + "rewards/rejected": 0.34968653321266174, + "step": 8705 + }, + { + "epoch": 1.41, + "learning_rate": 2.0934901353777994e-06, + "logits/chosen": -1.460700511932373, + "logits/rejected": -1.5403568744659424, + "logps/chosen": -13.731809616088867, + "logps/rejected": -79.64546203613281, + "loss": 1.8216, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9934911727905273, + "rewards/margins": -3.587881088256836, + "rewards/rejected": 4.581372261047363, + "step": 8706 + }, + { + "epoch": 1.41, + "learning_rate": 2.092420843241451e-06, + "logits/chosen": -1.1625944375991821, + "logits/rejected": -1.2511872053146362, + "logps/chosen": -93.52677917480469, + "logps/rejected": -94.24899291992188, + "loss": 1.2166, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.968730449676514, + "rewards/margins": -2.063210964202881, + "rewards/rejected": 8.031941413879395, + "step": 8707 + }, + { + "epoch": 1.41, + "learning_rate": 2.091351751991309e-06, + "logits/chosen": -1.4013168811798096, + "logits/rejected": -1.3711518049240112, + "logps/chosen": -41.73019027709961, + "logps/rejected": -47.578468322753906, + "loss": 0.2577, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5226504802703857, + "rewards/margins": 1.3913718461990356, + "rewards/rejected": 1.13127863407135, + "step": 8708 + }, + { + "epoch": 1.41, + "learning_rate": 2.0902828617012405e-06, + "logits/chosen": -0.954395055770874, + "logits/rejected": -0.9521094560623169, + "logps/chosen": -42.85308837890625, + "logps/rejected": -76.51934051513672, + "loss": 0.2831, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8246269226074219, + "rewards/margins": 0.6304092407226562, + "rewards/rejected": 1.1942176818847656, + "step": 8709 + }, + { + "epoch": 1.41, + "learning_rate": 2.0892141724450925e-06, + "logits/chosen": -1.3303823471069336, + "logits/rejected": -1.2128791809082031, + "logps/chosen": -183.35916137695312, + "logps/rejected": -147.41162109375, + "loss": 0.1867, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.516467094421387, + "rewards/margins": 0.8631329536437988, + "rewards/rejected": 7.653334140777588, + "step": 8710 + }, + { + "epoch": 1.41, + "learning_rate": 2.088145684296705e-06, + "logits/chosen": -1.3115514516830444, + "logits/rejected": -1.2822215557098389, + "logps/chosen": -62.528709411621094, + "logps/rejected": -89.04145050048828, + "loss": 0.0634, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.052987098693848, + "rewards/margins": 2.5051605701446533, + "rewards/rejected": 2.5478265285491943, + "step": 8711 + }, + { + "epoch": 1.41, + "learning_rate": 2.087077397329897e-06, + "logits/chosen": -1.287536859512329, + "logits/rejected": -1.2546594142913818, + "logps/chosen": -40.00091552734375, + "logps/rejected": -65.19254302978516, + "loss": 0.0536, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3010146617889404, + "rewards/margins": 2.354200601577759, + "rewards/rejected": 0.9468140006065369, + "step": 8712 + }, + { + "epoch": 1.41, + "learning_rate": 2.0860093116184798e-06, + "logits/chosen": -1.4846163988113403, + "logits/rejected": -1.0649888515472412, + "logps/chosen": -160.08047485351562, + "logps/rejected": -22.633544921875, + "loss": 0.0339, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.590405464172363, + "rewards/margins": 7.3278937339782715, + "rewards/rejected": 0.26251181960105896, + "step": 8713 + }, + { + "epoch": 1.41, + "learning_rate": 2.084941427236245e-06, + "logits/chosen": -1.3623605966567993, + "logits/rejected": -1.2473714351654053, + "logps/chosen": -47.07521057128906, + "logps/rejected": -70.59623718261719, + "loss": 0.0666, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3985824584960938, + "rewards/margins": 2.539433240890503, + "rewards/rejected": 0.859149158000946, + "step": 8714 + }, + { + "epoch": 1.41, + "learning_rate": 2.083873744256975e-06, + "logits/chosen": -1.3643262386322021, + "logits/rejected": -1.3606784343719482, + "logps/chosen": -47.09051513671875, + "logps/rejected": -50.724700927734375, + "loss": 1.0325, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.431626081466675, + "rewards/margins": 0.0006568431854248047, + "rewards/rejected": 2.43096923828125, + "step": 8715 + }, + { + "epoch": 1.41, + "learning_rate": 2.082806262754439e-06, + "logits/chosen": -1.660652995109558, + "logits/rejected": -1.6627495288848877, + "logps/chosen": -128.5432891845703, + "logps/rejected": -48.627830505371094, + "loss": 0.1089, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4357833862304688, + "rewards/margins": 1.7391624450683594, + "rewards/rejected": 1.6966209411621094, + "step": 8716 + }, + { + "epoch": 1.41, + "learning_rate": 2.0817389828023846e-06, + "logits/chosen": -1.0657100677490234, + "logits/rejected": -1.0731927156448364, + "logps/chosen": -1.8867906332015991, + "logps/rejected": -13.781522750854492, + "loss": 0.5797, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5813469290733337, + "rewards/margins": -0.20673388242721558, + "rewards/rejected": 0.7880808115005493, + "step": 8717 + }, + { + "epoch": 1.42, + "learning_rate": 2.0806719044745565e-06, + "logits/chosen": -0.9110334515571594, + "logits/rejected": -0.9098845720291138, + "logps/chosen": -3.1857454776763916, + "logps/rejected": -5.001706123352051, + "loss": 0.3595, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3555079698562622, + "rewards/margins": 0.11264373362064362, + "rewards/rejected": 0.2428642362356186, + "step": 8718 + }, + { + "epoch": 1.42, + "learning_rate": 2.079605027844675e-06, + "logits/chosen": -1.186780571937561, + "logits/rejected": -1.2155095338821411, + "logps/chosen": -56.40679168701172, + "logps/rejected": -70.17001342773438, + "loss": 0.9033, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1373963356018066, + "rewards/margins": 0.3632103204727173, + "rewards/rejected": 1.7741860151290894, + "step": 8719 + }, + { + "epoch": 1.42, + "learning_rate": 2.0785383529864544e-06, + "logits/chosen": -1.390842080116272, + "logits/rejected": -1.3992797136306763, + "logps/chosen": -146.9309844970703, + "logps/rejected": -104.80265808105469, + "loss": 0.1886, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9587693214416504, + "rewards/margins": 0.9788804054260254, + "rewards/rejected": 1.979888916015625, + "step": 8720 + }, + { + "epoch": 1.42, + "learning_rate": 2.0774718799735887e-06, + "logits/chosen": -1.508683443069458, + "logits/rejected": -1.4683362245559692, + "logps/chosen": -42.295738220214844, + "logps/rejected": -66.79747009277344, + "loss": 0.3544, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.444155216217041, + "rewards/margins": 0.24150776863098145, + "rewards/rejected": 3.2026474475860596, + "step": 8721 + }, + { + "epoch": 1.42, + "learning_rate": 2.0764056088797646e-06, + "logits/chosen": -0.8894620537757874, + "logits/rejected": -0.8911713361740112, + "logps/chosen": -35.67052459716797, + "logps/rejected": -37.421417236328125, + "loss": 1.6334, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2603763341903687, + "rewards/margins": 0.26396632194519043, + "rewards/rejected": 0.9964100122451782, + "step": 8722 + }, + { + "epoch": 1.42, + "learning_rate": 2.075339539778648e-06, + "logits/chosen": -1.2812275886535645, + "logits/rejected": -1.3094465732574463, + "logps/chosen": -166.8688201904297, + "logps/rejected": -68.90287780761719, + "loss": 0.5182, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.553004741668701, + "rewards/margins": 2.787705421447754, + "rewards/rejected": 2.7652993202209473, + "step": 8723 + }, + { + "epoch": 1.42, + "learning_rate": 2.0742736727438978e-06, + "logits/chosen": -1.5798755884170532, + "logits/rejected": -1.4775917530059814, + "logps/chosen": -92.39037322998047, + "logps/rejected": -111.09932708740234, + "loss": 0.337, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.917893409729004, + "rewards/margins": 0.04294013977050781, + "rewards/rejected": 8.874953269958496, + "step": 8724 + }, + { + "epoch": 1.42, + "learning_rate": 2.073208007849151e-06, + "logits/chosen": -1.1856876611709595, + "logits/rejected": -1.1074166297912598, + "logps/chosen": -52.660545349121094, + "logps/rejected": -63.33964538574219, + "loss": 0.3737, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.577248573303223, + "rewards/margins": -0.0722508430480957, + "rewards/rejected": 5.649499416351318, + "step": 8725 + }, + { + "epoch": 1.42, + "learning_rate": 2.07214254516804e-06, + "logits/chosen": -1.0165365934371948, + "logits/rejected": -1.0165365934371948, + "logps/chosen": -41.6688117980957, + "logps/rejected": -41.6688117980957, + "loss": 0.9142, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4901107549667358, + "rewards/margins": 0.0, + "rewards/rejected": 1.4901107549667358, + "step": 8726 + }, + { + "epoch": 1.42, + "learning_rate": 2.071077284774173e-06, + "logits/chosen": -1.395641803741455, + "logits/rejected": -1.3262715339660645, + "logps/chosen": -88.65306854248047, + "logps/rejected": -92.80203247070312, + "loss": 0.3522, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9138290882110596, + "rewards/margins": -0.004311323165893555, + "rewards/rejected": 2.918140411376953, + "step": 8727 + }, + { + "epoch": 1.42, + "learning_rate": 2.0700122267411536e-06, + "logits/chosen": -1.255555510520935, + "logits/rejected": -1.1228291988372803, + "logps/chosen": -115.18489074707031, + "logps/rejected": -67.65235900878906, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.2326078414917, + "rewards/margins": 5.005033493041992, + "rewards/rejected": 3.227574110031128, + "step": 8728 + }, + { + "epoch": 1.42, + "learning_rate": 2.068947371142564e-06, + "logits/chosen": -1.259255051612854, + "logits/rejected": -1.2791740894317627, + "logps/chosen": -89.31005859375, + "logps/rejected": -77.0586166381836, + "loss": 0.606, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.289747714996338, + "rewards/margins": -0.5268776416778564, + "rewards/rejected": 3.8166253566741943, + "step": 8729 + }, + { + "epoch": 1.42, + "learning_rate": 2.067882718051979e-06, + "logits/chosen": -1.2093058824539185, + "logits/rejected": -1.2332097291946411, + "logps/chosen": -84.73548889160156, + "logps/rejected": -110.04202270507812, + "loss": 1.4881, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.520796298980713, + "rewards/margins": -0.41126394271850586, + "rewards/rejected": 3.9320602416992188, + "step": 8730 + }, + { + "epoch": 1.42, + "learning_rate": 2.0668182675429525e-06, + "logits/chosen": -1.045116662979126, + "logits/rejected": -0.9555915594100952, + "logps/chosen": -63.51233673095703, + "logps/rejected": -54.91093444824219, + "loss": 0.3619, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.695472002029419, + "rewards/margins": 2.112452983856201, + "rewards/rejected": 0.5830188989639282, + "step": 8731 + }, + { + "epoch": 1.42, + "learning_rate": 2.065754019689031e-06, + "logits/chosen": -1.4186897277832031, + "logits/rejected": -1.4118627309799194, + "logps/chosen": -144.50082397460938, + "logps/rejected": -59.24205780029297, + "loss": 0.1522, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.750282287597656, + "rewards/margins": 1.4166100025177002, + "rewards/rejected": 3.333672285079956, + "step": 8732 + }, + { + "epoch": 1.42, + "learning_rate": 2.0646899745637417e-06, + "logits/chosen": -1.102824330329895, + "logits/rejected": -1.0272562503814697, + "logps/chosen": -46.9173698425293, + "logps/rejected": -51.15411376953125, + "loss": 0.0619, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.220395565032959, + "rewards/margins": 2.0363523960113525, + "rewards/rejected": 2.1840431690216064, + "step": 8733 + }, + { + "epoch": 1.42, + "learning_rate": 2.063626132240602e-06, + "logits/chosen": -1.6061500310897827, + "logits/rejected": -1.4172080755233765, + "logps/chosen": -127.52359008789062, + "logps/rejected": -137.47607421875, + "loss": 0.0932, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.787759780883789, + "rewards/margins": 1.9503026008605957, + "rewards/rejected": 6.837457180023193, + "step": 8734 + }, + { + "epoch": 1.42, + "learning_rate": 2.062562492793111e-06, + "logits/chosen": -1.4645227193832397, + "logits/rejected": -1.4027681350708008, + "logps/chosen": -62.2689208984375, + "logps/rejected": -47.2197265625, + "loss": 0.7613, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4838013648986816, + "rewards/margins": 0.14533400535583496, + "rewards/rejected": 2.3384673595428467, + "step": 8735 + }, + { + "epoch": 1.42, + "learning_rate": 2.061499056294759e-06, + "logits/chosen": -1.0478577613830566, + "logits/rejected": -1.0574692487716675, + "logps/chosen": -64.15322875976562, + "logps/rejected": -101.257080078125, + "loss": 0.9035, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.936476945877075, + "rewards/margins": -0.051609039306640625, + "rewards/rejected": 2.988085985183716, + "step": 8736 + }, + { + "epoch": 1.42, + "learning_rate": 2.0604358228190164e-06, + "logits/chosen": -1.0427355766296387, + "logits/rejected": -0.9724690318107605, + "logps/chosen": -20.40142059326172, + "logps/rejected": -9.47407054901123, + "loss": 0.1251, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.576263189315796, + "rewards/margins": 1.8117119073867798, + "rewards/rejected": 0.7645512819290161, + "step": 8737 + }, + { + "epoch": 1.42, + "learning_rate": 2.059372792439345e-06, + "logits/chosen": -1.8780980110168457, + "logits/rejected": -1.0844241380691528, + "logps/chosen": -101.04743194580078, + "logps/rejected": -117.87528991699219, + "loss": 0.807, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0178773403167725, + "rewards/margins": -0.7574400901794434, + "rewards/rejected": 2.775317430496216, + "step": 8738 + }, + { + "epoch": 1.42, + "learning_rate": 2.0583099652291884e-06, + "logits/chosen": -1.4264713525772095, + "logits/rejected": -1.4264713525772095, + "logps/chosen": -27.4219970703125, + "logps/rejected": -27.4219970703125, + "loss": 0.9895, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.6125359535217285, + "rewards/margins": 0.0, + "rewards/rejected": 4.6125359535217285, + "step": 8739 + }, + { + "epoch": 1.42, + "learning_rate": 2.0572473412619797e-06, + "logits/chosen": -1.1954628229141235, + "logits/rejected": -1.2351720333099365, + "logps/chosen": -65.07672119140625, + "logps/rejected": -43.81806182861328, + "loss": 0.4809, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3119919300079346, + "rewards/margins": 0.5541114807128906, + "rewards/rejected": 1.757880449295044, + "step": 8740 + }, + { + "epoch": 1.42, + "learning_rate": 2.056184920611134e-06, + "logits/chosen": -1.3955674171447754, + "logits/rejected": -1.3932971954345703, + "logps/chosen": -59.30375289916992, + "logps/rejected": -74.56729125976562, + "loss": 0.8796, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.942899703979492, + "rewards/margins": 0.5426783561706543, + "rewards/rejected": 2.400221347808838, + "step": 8741 + }, + { + "epoch": 1.42, + "learning_rate": 2.055122703350057e-06, + "logits/chosen": -1.3847237825393677, + "logits/rejected": -1.3483455181121826, + "logps/chosen": -59.065650939941406, + "logps/rejected": -68.90226745605469, + "loss": 1.0325, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.338249921798706, + "rewards/margins": -1.7519919872283936, + "rewards/rejected": 4.0902419090271, + "step": 8742 + }, + { + "epoch": 1.42, + "learning_rate": 2.0540606895521346e-06, + "logits/chosen": -1.3131824731826782, + "logits/rejected": -1.365479588508606, + "logps/chosen": -47.43095397949219, + "logps/rejected": -91.37750244140625, + "loss": 1.9724, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.331195116043091, + "rewards/margins": -2.3548829555511475, + "rewards/rejected": 5.686078071594238, + "step": 8743 + }, + { + "epoch": 1.42, + "learning_rate": 2.0529988792907457e-06, + "logits/chosen": -1.3561654090881348, + "logits/rejected": -1.4385030269622803, + "logps/chosen": -48.264610290527344, + "logps/rejected": -132.38941955566406, + "loss": 1.1511, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.755275011062622, + "rewards/margins": -2.1156554222106934, + "rewards/rejected": 3.8709304332733154, + "step": 8744 + }, + { + "epoch": 1.42, + "learning_rate": 2.0519372726392476e-06, + "logits/chosen": -1.535549283027649, + "logits/rejected": -1.4684964418411255, + "logps/chosen": -104.2370834350586, + "logps/rejected": -75.56645965576172, + "loss": 0.0538, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.238808631896973, + "rewards/margins": 3.49406361579895, + "rewards/rejected": 3.7447450160980225, + "step": 8745 + }, + { + "epoch": 1.42, + "learning_rate": 2.0508758696709913e-06, + "logits/chosen": -0.7102737426757812, + "logits/rejected": -0.7102737426757812, + "logps/chosen": -33.49040222167969, + "logps/rejected": -33.49040222167969, + "loss": 1.5797, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.087454319000244, + "rewards/margins": 0.0, + "rewards/rejected": 2.087454319000244, + "step": 8746 + }, + { + "epoch": 1.42, + "learning_rate": 2.0498146704593056e-06, + "logits/chosen": -1.4088435173034668, + "logits/rejected": -1.4088435173034668, + "logps/chosen": -42.486480712890625, + "logps/rejected": -42.486480712890625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.068077802658081, + "rewards/margins": 0.0, + "rewards/rejected": 3.068077802658081, + "step": 8747 + }, + { + "epoch": 1.42, + "learning_rate": 2.0487536750775132e-06, + "logits/chosen": -1.228240966796875, + "logits/rejected": -1.2536935806274414, + "logps/chosen": -53.623268127441406, + "logps/rejected": -69.60893249511719, + "loss": 0.3296, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8321739435195923, + "rewards/margins": 0.1214606761932373, + "rewards/rejected": 1.710713267326355, + "step": 8748 + }, + { + "epoch": 1.42, + "learning_rate": 2.0476928835989167e-06, + "logits/chosen": -1.1586592197418213, + "logits/rejected": -1.205166220664978, + "logps/chosen": -42.88330078125, + "logps/rejected": -41.068115234375, + "loss": 0.6473, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.784312129020691, + "rewards/margins": -0.9015873670578003, + "rewards/rejected": 2.685899496078491, + "step": 8749 + }, + { + "epoch": 1.42, + "learning_rate": 2.0466322960968047e-06, + "logits/chosen": -1.3363693952560425, + "logits/rejected": -1.3334805965423584, + "logps/chosen": -136.4554443359375, + "logps/rejected": -108.97347259521484, + "loss": 0.2869, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8286163806915283, + "rewards/margins": 0.29877543449401855, + "rewards/rejected": 3.5298409461975098, + "step": 8750 + }, + { + "epoch": 1.42, + "learning_rate": 2.0455719126444584e-06, + "logits/chosen": -0.8868644833564758, + "logits/rejected": -0.8868644833564758, + "logps/chosen": -26.787193298339844, + "logps/rejected": -26.787193298339844, + "loss": 1.1576, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1598191261291504, + "rewards/margins": 0.0, + "rewards/rejected": 2.1598191261291504, + "step": 8751 + }, + { + "epoch": 1.42, + "learning_rate": 2.044511733315136e-06, + "logits/chosen": -1.1436446905136108, + "logits/rejected": -1.1372698545455933, + "logps/chosen": -68.21601867675781, + "logps/rejected": -76.94624328613281, + "loss": 0.4003, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4417572021484375, + "rewards/margins": 0.013721466064453125, + "rewards/rejected": 2.4280357360839844, + "step": 8752 + }, + { + "epoch": 1.42, + "learning_rate": 2.0434517581820893e-06, + "logits/chosen": -1.3759706020355225, + "logits/rejected": -1.263441801071167, + "logps/chosen": -29.66437530517578, + "logps/rejected": -5.254724502563477, + "loss": 0.0382, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5966782569885254, + "rewards/margins": 2.544264793395996, + "rewards/rejected": 1.0524133443832397, + "step": 8753 + }, + { + "epoch": 1.42, + "learning_rate": 2.04239198731855e-06, + "logits/chosen": -1.2559243440628052, + "logits/rejected": -1.2559243440628052, + "logps/chosen": -66.46104431152344, + "logps/rejected": -66.46104431152344, + "loss": 0.5835, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9351929426193237, + "rewards/margins": 0.0, + "rewards/rejected": 1.9351929426193237, + "step": 8754 + }, + { + "epoch": 1.42, + "learning_rate": 2.0413324207977404e-06, + "logits/chosen": -1.308387041091919, + "logits/rejected": -1.3336647748947144, + "logps/chosen": -46.85628128051758, + "logps/rejected": -46.51531219482422, + "loss": 0.38, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8615254163742065, + "rewards/margins": -0.1287517547607422, + "rewards/rejected": 1.9902771711349487, + "step": 8755 + }, + { + "epoch": 1.42, + "learning_rate": 2.0402730586928633e-06, + "logits/chosen": -1.0740394592285156, + "logits/rejected": -1.107643723487854, + "logps/chosen": -88.32820129394531, + "logps/rejected": -104.85936737060547, + "loss": 0.3272, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4164596796035767, + "rewards/margins": 0.1128547191619873, + "rewards/rejected": 1.3036049604415894, + "step": 8756 + }, + { + "epoch": 1.42, + "learning_rate": 2.039213901077115e-06, + "logits/chosen": -1.6156563758850098, + "logits/rejected": -1.6381877660751343, + "logps/chosen": -61.96620178222656, + "logps/rejected": -86.14100646972656, + "loss": 0.4759, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4954895973205566, + "rewards/margins": -0.37839341163635254, + "rewards/rejected": 3.873883008956909, + "step": 8757 + }, + { + "epoch": 1.42, + "learning_rate": 2.0381549480236685e-06, + "logits/chosen": -1.052019476890564, + "logits/rejected": -1.052019476890564, + "logps/chosen": -72.18771362304688, + "logps/rejected": -72.18771362304688, + "loss": 0.3659, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9110618829727173, + "rewards/margins": 0.0, + "rewards/rejected": 1.9110618829727173, + "step": 8758 + }, + { + "epoch": 1.42, + "learning_rate": 2.037096199605692e-06, + "logits/chosen": -1.1665420532226562, + "logits/rejected": -1.1434388160705566, + "logps/chosen": -21.18210220336914, + "logps/rejected": -45.046607971191406, + "loss": 0.4325, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2476966381073, + "rewards/margins": -0.016443729400634766, + "rewards/rejected": 2.2641403675079346, + "step": 8759 + }, + { + "epoch": 1.42, + "learning_rate": 2.036037655896331e-06, + "logits/chosen": -1.4015581607818604, + "logits/rejected": -1.4505176544189453, + "logps/chosen": -112.45233917236328, + "logps/rejected": -96.6997299194336, + "loss": 0.6528, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.8714897632598877, + "rewards/margins": -0.9248278141021729, + "rewards/rejected": 4.7963175773620605, + "step": 8760 + }, + { + "epoch": 1.42, + "learning_rate": 2.034979316968725e-06, + "logits/chosen": -1.358654260635376, + "logits/rejected": -1.2207071781158447, + "logps/chosen": -94.6025619506836, + "logps/rejected": -76.04844665527344, + "loss": 0.1951, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.273839473724365, + "rewards/margins": 3.123789072036743, + "rewards/rejected": 3.150050401687622, + "step": 8761 + }, + { + "epoch": 1.42, + "learning_rate": 2.0339211828959903e-06, + "logits/chosen": -1.2211971282958984, + "logits/rejected": -1.389998197555542, + "logps/chosen": -73.36934661865234, + "logps/rejected": -110.619384765625, + "loss": 2.374, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.7714059352874756, + "rewards/margins": -4.6450395584106445, + "rewards/rejected": 8.4164457321167, + "step": 8762 + }, + { + "epoch": 1.42, + "learning_rate": 2.0328632537512387e-06, + "logits/chosen": -1.372110366821289, + "logits/rejected": -1.1623469591140747, + "logps/chosen": -123.61309051513672, + "logps/rejected": -47.553009033203125, + "loss": 0.0843, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.321537017822266, + "rewards/margins": 2.0659193992614746, + "rewards/rejected": 3.255617618560791, + "step": 8763 + }, + { + "epoch": 1.42, + "learning_rate": 2.0318055296075587e-06, + "logits/chosen": -1.3125510215759277, + "logits/rejected": -1.3888280391693115, + "logps/chosen": -88.78327941894531, + "logps/rejected": -100.83061218261719, + "loss": 1.2394, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.811560153961182, + "rewards/margins": -1.7849836349487305, + "rewards/rejected": 7.596543788909912, + "step": 8764 + }, + { + "epoch": 1.42, + "learning_rate": 2.030748010538034e-06, + "logits/chosen": -0.859264612197876, + "logits/rejected": -0.8523440361022949, + "logps/chosen": -6.839081764221191, + "logps/rejected": -4.120029449462891, + "loss": 1.0478, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6897894740104675, + "rewards/margins": -0.24892866611480713, + "rewards/rejected": 0.9387181401252747, + "step": 8765 + }, + { + "epoch": 1.42, + "learning_rate": 2.0296906966157243e-06, + "logits/chosen": -1.1828832626342773, + "logits/rejected": -1.2019566297531128, + "logps/chosen": -45.871482849121094, + "logps/rejected": -88.57183074951172, + "loss": 0.2735, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9663909673690796, + "rewards/margins": 0.48644018173217773, + "rewards/rejected": 1.4799507856369019, + "step": 8766 + }, + { + "epoch": 1.42, + "learning_rate": 2.0286335879136838e-06, + "logits/chosen": -1.291839838027954, + "logits/rejected": -1.2306710481643677, + "logps/chosen": -52.84548568725586, + "logps/rejected": -23.62904167175293, + "loss": 0.2595, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.343088150024414, + "rewards/margins": 0.3948490619659424, + "rewards/rejected": 2.9482390880584717, + "step": 8767 + }, + { + "epoch": 1.42, + "learning_rate": 2.027576684504945e-06, + "logits/chosen": -0.6136491298675537, + "logits/rejected": -0.6150074601173401, + "logps/chosen": -3.812483549118042, + "logps/rejected": -3.8735103607177734, + "loss": 0.7042, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5167168974876404, + "rewards/margins": -0.13880211114883423, + "rewards/rejected": 0.6555190086364746, + "step": 8768 + }, + { + "epoch": 1.42, + "learning_rate": 2.026519986462534e-06, + "logits/chosen": -0.9290123581886292, + "logits/rejected": -0.9370008707046509, + "logps/chosen": -2.9135167598724365, + "logps/rejected": -0.5374143123626709, + "loss": 0.5316, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27959200739860535, + "rewards/margins": 0.045487821102142334, + "rewards/rejected": 0.234104186296463, + "step": 8769 + }, + { + "epoch": 1.42, + "learning_rate": 2.0254634938594555e-06, + "logits/chosen": -1.3319728374481201, + "logits/rejected": -1.204759120941162, + "logps/chosen": -122.95817565917969, + "logps/rejected": -60.93534469604492, + "loss": 0.6471, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.4625396728515625, + "rewards/margins": 1.482067346572876, + "rewards/rejected": 2.9804723262786865, + "step": 8770 + }, + { + "epoch": 1.42, + "learning_rate": 2.0244072067687033e-06, + "logits/chosen": -1.5109119415283203, + "logits/rejected": -1.4209024906158447, + "logps/chosen": -159.38674926757812, + "logps/rejected": -18.948654174804688, + "loss": 0.0381, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.310122966766357, + "rewards/margins": 5.5149054527282715, + "rewards/rejected": 0.7952175140380859, + "step": 8771 + }, + { + "epoch": 1.42, + "learning_rate": 2.02335112526326e-06, + "logits/chosen": -0.8956133723258972, + "logits/rejected": -0.8992447257041931, + "logps/chosen": -2.1850521564483643, + "logps/rejected": -6.224921703338623, + "loss": 0.268, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2388555258512497, + "rewards/margins": 0.3468877971172333, + "rewards/rejected": -0.10803227871656418, + "step": 8772 + }, + { + "epoch": 1.42, + "learning_rate": 2.0222952494160864e-06, + "logits/chosen": -1.1754858493804932, + "logits/rejected": -1.1668405532836914, + "logps/chosen": -81.30815124511719, + "logps/rejected": -61.73945236206055, + "loss": 0.7052, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.567007541656494, + "rewards/margins": 0.3871586322784424, + "rewards/rejected": 2.1798489093780518, + "step": 8773 + }, + { + "epoch": 1.42, + "learning_rate": 2.0212395793001384e-06, + "logits/chosen": -1.3652963638305664, + "logits/rejected": -1.3945268392562866, + "logps/chosen": -69.48629760742188, + "logps/rejected": -85.26978302001953, + "loss": 0.5809, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.393644094467163, + "rewards/margins": -0.2546234130859375, + "rewards/rejected": 2.6482675075531006, + "step": 8774 + }, + { + "epoch": 1.42, + "learning_rate": 2.020184114988347e-06, + "logits/chosen": -1.5634187459945679, + "logits/rejected": -1.5680073499679565, + "logps/chosen": -61.146026611328125, + "logps/rejected": -79.25012969970703, + "loss": 0.6793, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7995376586914062, + "rewards/margins": 0.02081608772277832, + "rewards/rejected": 2.778721570968628, + "step": 8775 + }, + { + "epoch": 1.42, + "learning_rate": 2.019128856553641e-06, + "logits/chosen": -1.2486706972122192, + "logits/rejected": -1.1280407905578613, + "logps/chosen": -140.70738220214844, + "logps/rejected": -75.36885070800781, + "loss": 0.3198, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.532940626144409, + "rewards/margins": 0.18196702003479004, + "rewards/rejected": 2.350973606109619, + "step": 8776 + }, + { + "epoch": 1.42, + "learning_rate": 2.0180738040689235e-06, + "logits/chosen": -1.3612902164459229, + "logits/rejected": -1.214125394821167, + "logps/chosen": -112.12202453613281, + "logps/rejected": -50.7185173034668, + "loss": 0.297, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3055877685546875, + "rewards/margins": 0.21429100632667542, + "rewards/rejected": 0.09129676967859268, + "step": 8777 + }, + { + "epoch": 1.42, + "learning_rate": 2.017018957607093e-06, + "logits/chosen": -0.9267003536224365, + "logits/rejected": -0.9220967888832092, + "logps/chosen": -6.340344429016113, + "logps/rejected": -2.4137918949127197, + "loss": 0.4318, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.19504375755786896, + "rewards/margins": -0.29395198822021484, + "rewards/rejected": 0.488995760679245, + "step": 8778 + }, + { + "epoch": 1.42, + "learning_rate": 2.0159643172410252e-06, + "logits/chosen": -0.8668325543403625, + "logits/rejected": -0.8668325543403625, + "logps/chosen": -74.34678649902344, + "logps/rejected": -74.34678649902344, + "loss": 0.3494, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.015446424484253, + "rewards/margins": 0.0, + "rewards/rejected": 2.015446424484253, + "step": 8779 + }, + { + "epoch": 1.43, + "learning_rate": 2.01490988304359e-06, + "logits/chosen": -1.3370410203933716, + "logits/rejected": -1.362457275390625, + "logps/chosen": -53.571197509765625, + "logps/rejected": -98.8470458984375, + "loss": 1.4339, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.39227294921875, + "rewards/margins": 0.69258713722229, + "rewards/rejected": 2.69968581199646, + "step": 8780 + }, + { + "epoch": 1.43, + "learning_rate": 2.013855655087634e-06, + "logits/chosen": -1.0850155353546143, + "logits/rejected": -1.0888569355010986, + "logps/chosen": -41.001731872558594, + "logps/rejected": -92.20460510253906, + "loss": 0.7737, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6334855556488037, + "rewards/margins": -1.1865813732147217, + "rewards/rejected": 4.820066928863525, + "step": 8781 + }, + { + "epoch": 1.43, + "learning_rate": 2.012801633446e-06, + "logits/chosen": -1.3687541484832764, + "logits/rejected": -1.1865229606628418, + "logps/chosen": -89.82745361328125, + "logps/rejected": -33.224674224853516, + "loss": 0.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.917802333831787, + "rewards/margins": 4.697347164154053, + "rewards/rejected": 2.2204551696777344, + "step": 8782 + }, + { + "epoch": 1.43, + "learning_rate": 2.011747818191506e-06, + "logits/chosen": -1.504044771194458, + "logits/rejected": -1.4950319528579712, + "logps/chosen": -93.77348327636719, + "logps/rejected": -89.60716247558594, + "loss": 0.2611, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6238694190979004, + "rewards/margins": 0.3820769786834717, + "rewards/rejected": 3.2417924404144287, + "step": 8783 + }, + { + "epoch": 1.43, + "learning_rate": 2.010694209396964e-06, + "logits/chosen": -1.2240480184555054, + "logits/rejected": -1.152625560760498, + "logps/chosen": -98.82312774658203, + "logps/rejected": -44.215248107910156, + "loss": 0.2054, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.2238640785217285, + "rewards/margins": 3.67917799949646, + "rewards/rejected": 2.5446860790252686, + "step": 8784 + }, + { + "epoch": 1.43, + "learning_rate": 2.0096408071351654e-06, + "logits/chosen": -1.014271855354309, + "logits/rejected": -1.0185946226119995, + "logps/chosen": -3.159299373626709, + "logps/rejected": -1.5676683187484741, + "loss": 0.4564, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.25144609808921814, + "rewards/margins": -0.153085857629776, + "rewards/rejected": 0.40453195571899414, + "step": 8785 + }, + { + "epoch": 1.43, + "learning_rate": 2.008587611478894e-06, + "logits/chosen": -1.4483364820480347, + "logits/rejected": -1.1862763166427612, + "logps/chosen": -118.72911834716797, + "logps/rejected": -13.056453704833984, + "loss": 0.5285, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.884675025939941, + "rewards/margins": 3.9185032844543457, + "rewards/rejected": 0.9661718606948853, + "step": 8786 + }, + { + "epoch": 1.43, + "learning_rate": 2.0075346225009114e-06, + "logits/chosen": -0.9096080660820007, + "logits/rejected": -0.9096080660820007, + "logps/chosen": -57.993202209472656, + "logps/rejected": -57.993202209472656, + "loss": 0.347, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0547072887420654, + "rewards/margins": 0.0, + "rewards/rejected": 2.0547072887420654, + "step": 8787 + }, + { + "epoch": 1.43, + "learning_rate": 2.006481840273973e-06, + "logits/chosen": -1.5770288705825806, + "logits/rejected": -1.5794469118118286, + "logps/chosen": -127.67288208007812, + "logps/rejected": -102.57594299316406, + "loss": 0.2235, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.79632568359375, + "rewards/margins": 1.9596495628356934, + "rewards/rejected": 0.8366760611534119, + "step": 8788 + }, + { + "epoch": 1.43, + "learning_rate": 2.0054292648708136e-06, + "logits/chosen": -1.2674537897109985, + "logits/rejected": -1.2785987854003906, + "logps/chosen": -43.48483657836914, + "logps/rejected": -107.03988647460938, + "loss": 0.3355, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5476391315460205, + "rewards/margins": 0.7560482025146484, + "rewards/rejected": 1.791590929031372, + "step": 8789 + }, + { + "epoch": 1.43, + "learning_rate": 2.0043768963641584e-06, + "logits/chosen": -1.3767650127410889, + "logits/rejected": -1.3767650127410889, + "logps/chosen": -60.466426849365234, + "logps/rejected": -60.466426849365234, + "loss": 0.3467, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.188533306121826, + "rewards/margins": 0.0, + "rewards/rejected": 4.188533306121826, + "step": 8790 + }, + { + "epoch": 1.43, + "learning_rate": 2.003324734826713e-06, + "logits/chosen": -1.4597893953323364, + "logits/rejected": -1.285032868385315, + "logps/chosen": -191.7359161376953, + "logps/rejected": -38.82334899902344, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.188725471496582, + "rewards/margins": 4.460121154785156, + "rewards/rejected": 2.7286040782928467, + "step": 8791 + }, + { + "epoch": 1.43, + "learning_rate": 2.0022727803311757e-06, + "logits/chosen": -1.4311519861221313, + "logits/rejected": -1.4744104146957397, + "logps/chosen": -148.07101440429688, + "logps/rejected": -76.88111877441406, + "loss": 0.6245, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.853537082672119, + "rewards/margins": 3.887986898422241, + "rewards/rejected": 2.965550184249878, + "step": 8792 + }, + { + "epoch": 1.43, + "learning_rate": 2.0012210329502224e-06, + "logits/chosen": -0.9892064929008484, + "logits/rejected": -0.9884436726570129, + "logps/chosen": -2.801133155822754, + "logps/rejected": -0.6983261704444885, + "loss": 1.9271, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26975247263908386, + "rewards/margins": 0.011130154132843018, + "rewards/rejected": 0.25862231850624084, + "step": 8793 + }, + { + "epoch": 1.43, + "learning_rate": 2.000169492756523e-06, + "logits/chosen": -1.2526979446411133, + "logits/rejected": -1.2892261743545532, + "logps/chosen": -88.15018463134766, + "logps/rejected": -176.80516052246094, + "loss": 0.0922, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9952361583709717, + "rewards/margins": 1.9260696172714233, + "rewards/rejected": 1.0691665410995483, + "step": 8794 + }, + { + "epoch": 1.43, + "learning_rate": 1.9991181598227248e-06, + "logits/chosen": -1.1656399965286255, + "logits/rejected": -1.176975965499878, + "logps/chosen": -7.836030960083008, + "logps/rejected": -21.807861328125, + "loss": 0.2539, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0427812337875366, + "rewards/margins": 0.42623841762542725, + "rewards/rejected": 0.6165428161621094, + "step": 8795 + }, + { + "epoch": 1.43, + "learning_rate": 1.99806703422147e-06, + "logits/chosen": -1.439154028892517, + "logits/rejected": -1.4345895051956177, + "logps/chosen": -135.76954650878906, + "logps/rejected": -101.50666809082031, + "loss": 0.5392, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5372588634490967, + "rewards/margins": -0.6607925891876221, + "rewards/rejected": 3.1980514526367188, + "step": 8796 + }, + { + "epoch": 1.43, + "learning_rate": 1.997016116025376e-06, + "logits/chosen": -1.4516650438308716, + "logits/rejected": -1.3492062091827393, + "logps/chosen": -86.36613464355469, + "logps/rejected": -63.62154006958008, + "loss": 1.2036, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6655113697052, + "rewards/margins": 1.2551730871200562, + "rewards/rejected": 1.410338282585144, + "step": 8797 + }, + { + "epoch": 1.43, + "learning_rate": 1.9959654053070556e-06, + "logits/chosen": -1.079892635345459, + "logits/rejected": -1.1476337909698486, + "logps/chosen": -52.33027267456055, + "logps/rejected": -100.7962646484375, + "loss": 0.8071, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4231021404266357, + "rewards/margins": -0.7216641902923584, + "rewards/rejected": 4.144766330718994, + "step": 8798 + }, + { + "epoch": 1.43, + "learning_rate": 1.9949149021390995e-06, + "logits/chosen": -1.263792872428894, + "logits/rejected": -1.1258264780044556, + "logps/chosen": -43.969459533691406, + "logps/rejected": -29.276935577392578, + "loss": 1.299, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9808297157287598, + "rewards/margins": 0.5247292518615723, + "rewards/rejected": 2.4561004638671875, + "step": 8799 + }, + { + "epoch": 1.43, + "learning_rate": 1.9938646065940914e-06, + "logits/chosen": -1.2727075815200806, + "logits/rejected": -1.1881489753723145, + "logps/chosen": -61.68278503417969, + "logps/rejected": -61.98078155517578, + "loss": 1.7722, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.278917074203491, + "rewards/margins": 0.7372283935546875, + "rewards/rejected": 2.5416886806488037, + "step": 8800 + }, + { + "epoch": 1.43, + "learning_rate": 1.9928145187445925e-06, + "logits/chosen": -1.1631879806518555, + "logits/rejected": -1.094735860824585, + "logps/chosen": -38.272911071777344, + "logps/rejected": -16.671642303466797, + "loss": 0.1684, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2786216735839844, + "rewards/margins": 2.4825453758239746, + "rewards/rejected": 0.7960764169692993, + "step": 8801 + }, + { + "epoch": 1.43, + "learning_rate": 1.9917646386631577e-06, + "logits/chosen": -1.1939634084701538, + "logits/rejected": -1.1939634084701538, + "logps/chosen": -86.2247314453125, + "logps/rejected": -86.2247314453125, + "loss": 0.3891, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3390259742736816, + "rewards/margins": 0.0, + "rewards/rejected": 3.3390259742736816, + "step": 8802 + }, + { + "epoch": 1.43, + "learning_rate": 1.9907149664223206e-06, + "logits/chosen": -0.9546816349029541, + "logits/rejected": -0.9546816349029541, + "logps/chosen": -97.28508758544922, + "logps/rejected": -97.28508758544922, + "loss": 0.6766, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.7513039112091064, + "rewards/margins": 0.0, + "rewards/rejected": 3.7513039112091064, + "step": 8803 + }, + { + "epoch": 1.43, + "learning_rate": 1.9896655020946074e-06, + "logits/chosen": -1.4418818950653076, + "logits/rejected": -1.3295491933822632, + "logps/chosen": -78.1473159790039, + "logps/rejected": -30.3476619720459, + "loss": 1.3493, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.77128529548645, + "rewards/margins": -0.008249521255493164, + "rewards/rejected": 2.7795348167419434, + "step": 8804 + }, + { + "epoch": 1.43, + "learning_rate": 1.988616245752522e-06, + "logits/chosen": -1.2644370794296265, + "logits/rejected": -1.162691593170166, + "logps/chosen": -173.4603729248047, + "logps/rejected": -64.46002197265625, + "loss": 0.3211, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.789903163909912, + "rewards/margins": 1.4483473300933838, + "rewards/rejected": 3.3415558338165283, + "step": 8805 + }, + { + "epoch": 1.43, + "learning_rate": 1.98756719746856e-06, + "logits/chosen": -1.300257682800293, + "logits/rejected": -1.2686145305633545, + "logps/chosen": -62.8917121887207, + "logps/rejected": -111.06575775146484, + "loss": 0.5828, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0806682109832764, + "rewards/margins": 0.6093325614929199, + "rewards/rejected": 1.4713356494903564, + "step": 8806 + }, + { + "epoch": 1.43, + "learning_rate": 1.9865183573152025e-06, + "logits/chosen": -1.514732837677002, + "logits/rejected": -1.507917881011963, + "logps/chosen": -102.25703430175781, + "logps/rejected": -125.48565673828125, + "loss": 0.8693, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.0381669998168945, + "rewards/margins": -1.5401077270507812, + "rewards/rejected": 6.578274726867676, + "step": 8807 + }, + { + "epoch": 1.43, + "learning_rate": 1.9854697253649108e-06, + "logits/chosen": -1.2825833559036255, + "logits/rejected": -1.249216079711914, + "logps/chosen": -111.55859375, + "logps/rejected": -90.7114028930664, + "loss": 0.2061, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6691651344299316, + "rewards/margins": 0.7051025629043579, + "rewards/rejected": 1.9640625715255737, + "step": 8808 + }, + { + "epoch": 1.43, + "learning_rate": 1.9844213016901392e-06, + "logits/chosen": -1.2617154121398926, + "logits/rejected": -1.2738829851150513, + "logps/chosen": -63.708099365234375, + "logps/rejected": -60.588890075683594, + "loss": 0.448, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.184626817703247, + "rewards/margins": 0.31294703483581543, + "rewards/rejected": 2.8716797828674316, + "step": 8809 + }, + { + "epoch": 1.43, + "learning_rate": 1.98337308636332e-06, + "logits/chosen": -1.1515650749206543, + "logits/rejected": -1.0139323472976685, + "logps/chosen": -150.0828857421875, + "logps/rejected": -72.22638702392578, + "loss": 0.07, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.774781703948975, + "rewards/margins": 1.9383411407470703, + "rewards/rejected": 4.836440563201904, + "step": 8810 + }, + { + "epoch": 1.43, + "learning_rate": 1.9823250794568795e-06, + "logits/chosen": -1.543717622756958, + "logits/rejected": -1.2873972654342651, + "logps/chosen": -77.70999145507812, + "logps/rejected": -63.183570861816406, + "loss": 0.4576, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8996552228927612, + "rewards/margins": -0.25871360301971436, + "rewards/rejected": 2.1583688259124756, + "step": 8811 + }, + { + "epoch": 1.43, + "learning_rate": 1.9812772810432195e-06, + "logits/chosen": -1.0895347595214844, + "logits/rejected": -1.1267441511154175, + "logps/chosen": -58.78965759277344, + "logps/rejected": -112.18843078613281, + "loss": 0.9494, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3955901861190796, + "rewards/margins": -0.15694284439086914, + "rewards/rejected": 1.5525330305099487, + "step": 8812 + }, + { + "epoch": 1.43, + "learning_rate": 1.9802296911947387e-06, + "logits/chosen": -1.3636008501052856, + "logits/rejected": -1.383716106414795, + "logps/chosen": -54.57913589477539, + "logps/rejected": -51.392051696777344, + "loss": 1.9917, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7290172576904297, + "rewards/margins": -1.6492290496826172, + "rewards/rejected": 3.378246307373047, + "step": 8813 + }, + { + "epoch": 1.43, + "learning_rate": 1.9791823099838107e-06, + "logits/chosen": -1.4586399793624878, + "logits/rejected": -1.4447848796844482, + "logps/chosen": -115.56976318359375, + "logps/rejected": -108.01410675048828, + "loss": 0.2421, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.529484510421753, + "rewards/margins": 1.5052435398101807, + "rewards/rejected": 2.0242409706115723, + "step": 8814 + }, + { + "epoch": 1.43, + "learning_rate": 1.9781351374828037e-06, + "logits/chosen": -1.4569091796875, + "logits/rejected": -1.4590380191802979, + "logps/chosen": -106.496826171875, + "logps/rejected": -183.10992431640625, + "loss": 0.4032, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.289258003234863, + "rewards/margins": -0.21439790725708008, + "rewards/rejected": 7.503655910491943, + "step": 8815 + }, + { + "epoch": 1.43, + "learning_rate": 1.9770881737640642e-06, + "logits/chosen": -1.130380392074585, + "logits/rejected": -1.090190052986145, + "logps/chosen": -66.32164001464844, + "logps/rejected": -128.03118896484375, + "loss": 0.4793, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9975318908691406, + "rewards/margins": 0.10195386409759521, + "rewards/rejected": 1.8955780267715454, + "step": 8816 + }, + { + "epoch": 1.43, + "learning_rate": 1.97604141889993e-06, + "logits/chosen": -1.6983200311660767, + "logits/rejected": -1.708966851234436, + "logps/chosen": -70.4220962524414, + "logps/rejected": -128.94012451171875, + "loss": 0.3906, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9137489795684814, + "rewards/margins": 1.0605324506759644, + "rewards/rejected": 1.853216528892517, + "step": 8817 + }, + { + "epoch": 1.43, + "learning_rate": 1.9749948729627188e-06, + "logits/chosen": -1.2081245183944702, + "logits/rejected": -1.1630111932754517, + "logps/chosen": -31.750076293945312, + "logps/rejected": -12.892560958862305, + "loss": 0.6941, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5639862418174744, + "rewards/margins": -0.2394578456878662, + "rewards/rejected": 0.8034440875053406, + "step": 8818 + }, + { + "epoch": 1.43, + "learning_rate": 1.973948536024741e-06, + "logits/chosen": -1.1885316371917725, + "logits/rejected": -1.2040178775787354, + "logps/chosen": -86.77618408203125, + "logps/rejected": -68.06855010986328, + "loss": 0.4126, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.548814535140991, + "rewards/margins": 1.958590030670166, + "rewards/rejected": 0.5902244448661804, + "step": 8819 + }, + { + "epoch": 1.43, + "learning_rate": 1.9729024081582855e-06, + "logits/chosen": -1.3857132196426392, + "logits/rejected": -1.2729212045669556, + "logps/chosen": -65.51714324951172, + "logps/rejected": -17.478042602539062, + "loss": 0.2341, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9651939868927002, + "rewards/margins": 1.4841642379760742, + "rewards/rejected": 0.4810297191143036, + "step": 8820 + }, + { + "epoch": 1.43, + "learning_rate": 1.971856489435632e-06, + "logits/chosen": -1.6130096912384033, + "logits/rejected": -1.5471031665802002, + "logps/chosen": -87.55418395996094, + "logps/rejected": -26.217655181884766, + "loss": 0.1532, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5911331176757812, + "rewards/margins": 2.9681358337402344, + "rewards/rejected": -0.3770027160644531, + "step": 8821 + }, + { + "epoch": 1.43, + "learning_rate": 1.970810779929041e-06, + "logits/chosen": -1.394997239112854, + "logits/rejected": -1.3747050762176514, + "logps/chosen": -58.25358581542969, + "logps/rejected": -63.26116180419922, + "loss": 0.2512, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.115041494369507, + "rewards/margins": 0.5834367275238037, + "rewards/rejected": 1.5316047668457031, + "step": 8822 + }, + { + "epoch": 1.43, + "learning_rate": 1.9697652797107645e-06, + "logits/chosen": -1.1041837930679321, + "logits/rejected": -1.1416186094284058, + "logps/chosen": -63.82008743286133, + "logps/rejected": -64.70893859863281, + "loss": 1.0512, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2238521575927734, + "rewards/margins": -0.4022960662841797, + "rewards/rejected": 1.6261482238769531, + "step": 8823 + }, + { + "epoch": 1.43, + "learning_rate": 1.968719988853033e-06, + "logits/chosen": -1.7374242544174194, + "logits/rejected": -1.6773496866226196, + "logps/chosen": -65.77220916748047, + "logps/rejected": -22.245328903198242, + "loss": 1.1859, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.265015602111816, + "rewards/margins": 6.64909553527832, + "rewards/rejected": 0.6159200668334961, + "step": 8824 + }, + { + "epoch": 1.43, + "learning_rate": 1.96767490742807e-06, + "logits/chosen": -1.09757399559021, + "logits/rejected": -1.076662540435791, + "logps/chosen": -47.321556091308594, + "logps/rejected": -126.66651916503906, + "loss": 0.1736, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7041534185409546, + "rewards/margins": 1.3207015991210938, + "rewards/rejected": 0.3834518492221832, + "step": 8825 + }, + { + "epoch": 1.43, + "learning_rate": 1.9666300355080764e-06, + "logits/chosen": -1.1265177726745605, + "logits/rejected": -1.1265177726745605, + "logps/chosen": -24.194656372070312, + "logps/rejected": -24.194656372070312, + "loss": 0.3965, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.873608112335205, + "rewards/margins": 0.0, + "rewards/rejected": 3.873608112335205, + "step": 8826 + }, + { + "epoch": 1.43, + "learning_rate": 1.9655853731652473e-06, + "logits/chosen": -1.348975419998169, + "logits/rejected": -1.3761497735977173, + "logps/chosen": -58.33879470825195, + "logps/rejected": -98.79763793945312, + "loss": 1.197, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5872387886047363, + "rewards/margins": -0.42003822326660156, + "rewards/rejected": 4.007277011871338, + "step": 8827 + }, + { + "epoch": 1.43, + "learning_rate": 1.9645409204717556e-06, + "logits/chosen": -1.681664228439331, + "logits/rejected": -1.7217581272125244, + "logps/chosen": -137.3194580078125, + "logps/rejected": -154.67037963867188, + "loss": 0.553, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.896800518035889, + "rewards/margins": -0.4736800193786621, + "rewards/rejected": 7.370480537414551, + "step": 8828 + }, + { + "epoch": 1.43, + "learning_rate": 1.963496677499766e-06, + "logits/chosen": -1.176194667816162, + "logits/rejected": -1.177351951599121, + "logps/chosen": -25.36556053161621, + "logps/rejected": -79.2806396484375, + "loss": 1.1688, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.686174154281616, + "rewards/margins": -2.2046525478363037, + "rewards/rejected": 4.89082670211792, + "step": 8829 + }, + { + "epoch": 1.43, + "learning_rate": 1.9624526443214228e-06, + "logits/chosen": -1.4873247146606445, + "logits/rejected": -1.3656885623931885, + "logps/chosen": -85.50283813476562, + "logps/rejected": -33.414024353027344, + "loss": 0.0973, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.891522169113159, + "rewards/margins": 2.2682013511657715, + "rewards/rejected": 0.6233207583427429, + "step": 8830 + }, + { + "epoch": 1.43, + "learning_rate": 1.961408821008862e-06, + "logits/chosen": -1.5512961149215698, + "logits/rejected": -1.5321842432022095, + "logps/chosen": -228.10794067382812, + "logps/rejected": -78.75448608398438, + "loss": 0.0864, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.2764253616333, + "rewards/margins": 1.6831450462341309, + "rewards/rejected": 6.59328031539917, + "step": 8831 + }, + { + "epoch": 1.43, + "learning_rate": 1.9603652076341986e-06, + "logits/chosen": -1.0276180505752563, + "logits/rejected": -1.006662130355835, + "logps/chosen": -30.346248626708984, + "logps/rejected": -22.318511962890625, + "loss": 0.4131, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.39562150835990906, + "rewards/margins": -0.18190059065818787, + "rewards/rejected": 0.5775220990180969, + "step": 8832 + }, + { + "epoch": 1.43, + "learning_rate": 1.9593218042695394e-06, + "logits/chosen": -1.5100377798080444, + "logits/rejected": -1.5550345182418823, + "logps/chosen": -100.65973663330078, + "logps/rejected": -165.48104858398438, + "loss": 2.4453, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.270143985748291, + "rewards/margins": -4.883036136627197, + "rewards/rejected": 12.153180122375488, + "step": 8833 + }, + { + "epoch": 1.43, + "learning_rate": 1.9582786109869713e-06, + "logits/chosen": -1.106007695198059, + "logits/rejected": -1.0687586069107056, + "logps/chosen": -61.943695068359375, + "logps/rejected": -65.54059600830078, + "loss": 0.5915, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5557831525802612, + "rewards/margins": -0.3545631170272827, + "rewards/rejected": 1.910346269607544, + "step": 8834 + }, + { + "epoch": 1.43, + "learning_rate": 1.9572356278585715e-06, + "logits/chosen": -0.9624773859977722, + "logits/rejected": -0.9791085124015808, + "logps/chosen": -50.314308166503906, + "logps/rejected": -63.666648864746094, + "loss": 0.5224, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.791954755783081, + "rewards/margins": -0.32059192657470703, + "rewards/rejected": 3.112546682357788, + "step": 8835 + }, + { + "epoch": 1.43, + "learning_rate": 1.956192854956397e-06, + "logits/chosen": -1.25398850440979, + "logits/rejected": -1.2363721132278442, + "logps/chosen": -80.61652374267578, + "logps/rejected": -145.31802368164062, + "loss": 0.2541, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1866118907928467, + "rewards/margins": 0.6895750761032104, + "rewards/rejected": 1.4970368146896362, + "step": 8836 + }, + { + "epoch": 1.43, + "learning_rate": 1.9551502923524975e-06, + "logits/chosen": -0.935932457447052, + "logits/rejected": -0.8950984477996826, + "logps/chosen": -26.48175621032715, + "logps/rejected": -18.128623962402344, + "loss": 0.2528, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.954615592956543, + "rewards/margins": 0.47488346695899963, + "rewards/rejected": 0.47973212599754333, + "step": 8837 + }, + { + "epoch": 1.43, + "learning_rate": 1.9541079401189e-06, + "logits/chosen": -1.4394804239273071, + "logits/rejected": -1.3220810890197754, + "logps/chosen": -79.06197357177734, + "logps/rejected": -54.085758209228516, + "loss": 0.4034, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.317179203033447, + "rewards/margins": -0.2142200469970703, + "rewards/rejected": 4.531399250030518, + "step": 8838 + }, + { + "epoch": 1.43, + "learning_rate": 1.9530657983276254e-06, + "logits/chosen": -1.1570466756820679, + "logits/rejected": -1.174492359161377, + "logps/chosen": -38.21155548095703, + "logps/rejected": -66.23160552978516, + "loss": 0.4539, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9335120916366577, + "rewards/margins": -0.3799546957015991, + "rewards/rejected": 2.313466787338257, + "step": 8839 + }, + { + "epoch": 1.43, + "learning_rate": 1.952023867050672e-06, + "logits/chosen": -1.2734695672988892, + "logits/rejected": -1.1504851579666138, + "logps/chosen": -100.44221496582031, + "logps/rejected": -125.9027099609375, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.402549743652344, + "rewards/margins": 3.614462375640869, + "rewards/rejected": 1.7880874872207642, + "step": 8840 + }, + { + "epoch": 1.43, + "learning_rate": 1.9509821463600304e-06, + "logits/chosen": -1.160808801651001, + "logits/rejected": -0.9348616003990173, + "logps/chosen": -58.72876739501953, + "logps/rejected": -15.314939498901367, + "loss": 0.1415, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.568878173828125, + "rewards/margins": 2.3097119331359863, + "rewards/rejected": 0.25916633009910583, + "step": 8841 + }, + { + "epoch": 1.44, + "learning_rate": 1.949940636327671e-06, + "logits/chosen": -1.304236650466919, + "logits/rejected": -1.265505075454712, + "logps/chosen": -57.91120147705078, + "logps/rejected": -56.28028106689453, + "loss": 0.5491, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8446877002716064, + "rewards/margins": -0.4864325523376465, + "rewards/rejected": 2.331120252609253, + "step": 8842 + }, + { + "epoch": 1.44, + "learning_rate": 1.9488993370255543e-06, + "logits/chosen": -1.3092368841171265, + "logits/rejected": -1.3000231981277466, + "logps/chosen": -72.17023468017578, + "logps/rejected": -88.36689758300781, + "loss": 0.1481, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7441346645355225, + "rewards/margins": 1.1429193019866943, + "rewards/rejected": 2.601215362548828, + "step": 8843 + }, + { + "epoch": 1.44, + "learning_rate": 1.9478582485256248e-06, + "logits/chosen": -1.2352336645126343, + "logits/rejected": -1.1603680849075317, + "logps/chosen": -54.903656005859375, + "logps/rejected": -45.14617156982422, + "loss": 1.1377, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.055326223373413, + "rewards/margins": -0.8965620994567871, + "rewards/rejected": 2.9518883228302, + "step": 8844 + }, + { + "epoch": 1.44, + "learning_rate": 1.946817370899809e-06, + "logits/chosen": -0.8160433769226074, + "logits/rejected": -0.8416690826416016, + "logps/chosen": -86.01992797851562, + "logps/rejected": -60.649139404296875, + "loss": 2.4508, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3226242065429688, + "rewards/margins": 0.13441920280456543, + "rewards/rejected": 1.1882050037384033, + "step": 8845 + }, + { + "epoch": 1.44, + "learning_rate": 1.9457767042200255e-06, + "logits/chosen": -1.2348182201385498, + "logits/rejected": -1.1579605340957642, + "logps/chosen": -100.73702239990234, + "logps/rejected": -51.36663818359375, + "loss": 0.1869, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.128101348876953, + "rewards/margins": 0.7950828075408936, + "rewards/rejected": 2.3330185413360596, + "step": 8846 + }, + { + "epoch": 1.44, + "learning_rate": 1.94473624855817e-06, + "logits/chosen": -1.4404217004776, + "logits/rejected": -1.37933349609375, + "logps/chosen": -69.198486328125, + "logps/rejected": -58.95054626464844, + "loss": 3.2534, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4543144702911377, + "rewards/margins": -0.29289770126342773, + "rewards/rejected": 3.7472121715545654, + "step": 8847 + }, + { + "epoch": 1.44, + "learning_rate": 1.9436960039861324e-06, + "logits/chosen": -1.3737283945083618, + "logits/rejected": -1.2140719890594482, + "logps/chosen": -64.151123046875, + "logps/rejected": -31.71883201599121, + "loss": 0.9069, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8680312633514404, + "rewards/margins": 0.7039964199066162, + "rewards/rejected": 3.164034843444824, + "step": 8848 + }, + { + "epoch": 1.44, + "learning_rate": 1.942655970575781e-06, + "logits/chosen": -1.0909767150878906, + "logits/rejected": -1.1174083948135376, + "logps/chosen": -31.36107063293457, + "logps/rejected": -56.713417053222656, + "loss": 0.3923, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.187820315361023, + "rewards/margins": -0.0986490249633789, + "rewards/rejected": 1.2864693403244019, + "step": 8849 + }, + { + "epoch": 1.44, + "learning_rate": 1.9416161483989736e-06, + "logits/chosen": -1.4410216808319092, + "logits/rejected": -1.4406434297561646, + "logps/chosen": -17.86347198486328, + "logps/rejected": -14.480228424072266, + "loss": 0.5875, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7371253967285156, + "rewards/margins": 0.47934699058532715, + "rewards/rejected": 1.2577784061431885, + "step": 8850 + }, + { + "epoch": 1.44, + "learning_rate": 1.9405765375275508e-06, + "logits/chosen": -1.6204110383987427, + "logits/rejected": -1.649838924407959, + "logps/chosen": -47.761131286621094, + "logps/rejected": -103.23314666748047, + "loss": 0.8777, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.002530574798584, + "rewards/margins": -1.56233549118042, + "rewards/rejected": 5.564866065979004, + "step": 8851 + }, + { + "epoch": 1.44, + "learning_rate": 1.9395371380333427e-06, + "logits/chosen": -1.0162469148635864, + "logits/rejected": -1.1602245569229126, + "logps/chosen": -44.696109771728516, + "logps/rejected": -95.52467346191406, + "loss": 1.9879, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5474880933761597, + "rewards/margins": 0.15822112560272217, + "rewards/rejected": 1.3892669677734375, + "step": 8852 + }, + { + "epoch": 1.44, + "learning_rate": 1.9384979499881578e-06, + "logits/chosen": -0.9629931449890137, + "logits/rejected": -0.9629931449890137, + "logps/chosen": -40.404541015625, + "logps/rejected": -40.404541015625, + "loss": 0.3473, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.971519112586975, + "rewards/margins": 0.0, + "rewards/rejected": 1.971519112586975, + "step": 8853 + }, + { + "epoch": 1.44, + "learning_rate": 1.9374589734638e-06, + "logits/chosen": -1.4627174139022827, + "logits/rejected": -1.4823311567306519, + "logps/chosen": -99.06779479980469, + "logps/rejected": -154.554931640625, + "loss": 1.2235, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.688633680343628, + "rewards/margins": -2.323298692703247, + "rewards/rejected": 6.011932373046875, + "step": 8854 + }, + { + "epoch": 1.44, + "learning_rate": 1.9364202085320454e-06, + "logits/chosen": -1.371376395225525, + "logits/rejected": -1.3588281869888306, + "logps/chosen": -57.10704803466797, + "logps/rejected": -50.388240814208984, + "loss": 0.2571, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.297221660614014, + "rewards/margins": 0.40309596061706543, + "rewards/rejected": 3.8941256999969482, + "step": 8855 + }, + { + "epoch": 1.44, + "learning_rate": 1.9353816552646683e-06, + "logits/chosen": -1.385180950164795, + "logits/rejected": -1.407230019569397, + "logps/chosen": -53.57392501831055, + "logps/rejected": -58.70025634765625, + "loss": 0.5568, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.080087661743164, + "rewards/margins": -0.5665555000305176, + "rewards/rejected": 3.6466431617736816, + "step": 8856 + }, + { + "epoch": 1.44, + "learning_rate": 1.9343433137334193e-06, + "logits/chosen": -0.8812065124511719, + "logits/rejected": -0.8916141986846924, + "logps/chosen": -20.6059627532959, + "logps/rejected": -19.320934295654297, + "loss": 0.4931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4832575023174286, + "rewards/margins": -0.13298681378364563, + "rewards/rejected": 0.6162443161010742, + "step": 8857 + }, + { + "epoch": 1.44, + "learning_rate": 1.933305184010042e-06, + "logits/chosen": -1.2005058526992798, + "logits/rejected": -1.1681629419326782, + "logps/chosen": -34.05522537231445, + "logps/rejected": -12.097750663757324, + "loss": 0.1898, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7041641473770142, + "rewards/margins": 0.8958186507225037, + "rewards/rejected": 0.8083454966545105, + "step": 8858 + }, + { + "epoch": 1.44, + "learning_rate": 1.932267266166257e-06, + "logits/chosen": -1.5782313346862793, + "logits/rejected": -1.5170588493347168, + "logps/chosen": -112.45976257324219, + "logps/rejected": -92.49098205566406, + "loss": 0.3245, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.568783760070801, + "rewards/margins": 0.15524005889892578, + "rewards/rejected": 7.413543701171875, + "step": 8859 + }, + { + "epoch": 1.44, + "learning_rate": 1.9312295602737783e-06, + "logits/chosen": -1.6469544172286987, + "logits/rejected": -1.6537530422210693, + "logps/chosen": -71.44607543945312, + "logps/rejected": -68.29258728027344, + "loss": 0.6457, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0336501598358154, + "rewards/margins": 0.1998732089996338, + "rewards/rejected": 2.8337769508361816, + "step": 8860 + }, + { + "epoch": 1.44, + "learning_rate": 1.930192066404299e-06, + "logits/chosen": -0.6654231548309326, + "logits/rejected": -0.6371787190437317, + "logps/chosen": -20.243051528930664, + "logps/rejected": -18.489444732666016, + "loss": 0.5781, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5152506232261658, + "rewards/margins": 0.08949339389801025, + "rewards/rejected": 0.4257572293281555, + "step": 8861 + }, + { + "epoch": 1.44, + "learning_rate": 1.9291547846295005e-06, + "logits/chosen": -1.0897469520568848, + "logits/rejected": -1.1416692733764648, + "logps/chosen": -26.707761764526367, + "logps/rejected": -43.650962829589844, + "loss": 0.618, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0970308780670166, + "rewards/margins": -0.2591344118118286, + "rewards/rejected": 1.3561652898788452, + "step": 8862 + }, + { + "epoch": 1.44, + "learning_rate": 1.9281177150210515e-06, + "logits/chosen": -1.266981840133667, + "logits/rejected": -1.2278387546539307, + "logps/chosen": -42.555458068847656, + "logps/rejected": -27.988525390625, + "loss": 1.1288, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1045444011688232, + "rewards/margins": -0.3950231075286865, + "rewards/rejected": 2.4995675086975098, + "step": 8863 + }, + { + "epoch": 1.44, + "learning_rate": 1.9270808576506006e-06, + "logits/chosen": -0.9191589951515198, + "logits/rejected": -0.9191589951515198, + "logps/chosen": -27.898685455322266, + "logps/rejected": -27.898685455322266, + "loss": 0.5001, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8527248501777649, + "rewards/margins": 0.0, + "rewards/rejected": 0.8527248501777649, + "step": 8864 + }, + { + "epoch": 1.44, + "learning_rate": 1.9260442125897883e-06, + "logits/chosen": -1.0332860946655273, + "logits/rejected": -1.109380841255188, + "logps/chosen": -19.64396858215332, + "logps/rejected": -61.05091857910156, + "loss": 2.1132, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7451872229576111, + "rewards/margins": -2.011225938796997, + "rewards/rejected": 2.756413221359253, + "step": 8865 + }, + { + "epoch": 1.44, + "learning_rate": 1.9250077799102323e-06, + "logits/chosen": -1.4659876823425293, + "logits/rejected": -1.428802490234375, + "logps/chosen": -122.4942626953125, + "logps/rejected": -90.24925994873047, + "loss": 0.0583, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.675392150878906, + "rewards/margins": 3.7557082176208496, + "rewards/rejected": 2.9196839332580566, + "step": 8866 + }, + { + "epoch": 1.44, + "learning_rate": 1.923971559683545e-06, + "logits/chosen": -1.4026330709457397, + "logits/rejected": -1.0385866165161133, + "logps/chosen": -139.74700927734375, + "logps/rejected": -80.72825622558594, + "loss": 0.094, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.598379611968994, + "rewards/margins": 1.6216933727264404, + "rewards/rejected": 3.9766862392425537, + "step": 8867 + }, + { + "epoch": 1.44, + "learning_rate": 1.9229355519813154e-06, + "logits/chosen": -1.38639497756958, + "logits/rejected": -1.4203059673309326, + "logps/chosen": -77.1767578125, + "logps/rejected": -97.5364990234375, + "loss": 0.9458, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.319096565246582, + "rewards/margins": -1.673609733581543, + "rewards/rejected": 5.992706298828125, + "step": 8868 + }, + { + "epoch": 1.44, + "learning_rate": 1.9218997568751258e-06, + "logits/chosen": -1.5254842042922974, + "logits/rejected": -1.430051326751709, + "logps/chosen": -58.234580993652344, + "logps/rejected": -48.76072692871094, + "loss": 0.1625, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.424022674560547, + "rewards/margins": 1.3946353197097778, + "rewards/rejected": 1.029387354850769, + "step": 8869 + }, + { + "epoch": 1.44, + "learning_rate": 1.9208641744365353e-06, + "logits/chosen": -1.4810099601745605, + "logits/rejected": -1.511257529258728, + "logps/chosen": -106.74092102050781, + "logps/rejected": -101.22074890136719, + "loss": 0.2045, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9157333374023438, + "rewards/margins": 0.7577452659606934, + "rewards/rejected": 3.1579880714416504, + "step": 8870 + }, + { + "epoch": 1.44, + "learning_rate": 1.919828804737098e-06, + "logits/chosen": -0.9084943532943726, + "logits/rejected": -0.8809643387794495, + "logps/chosen": -63.49491500854492, + "logps/rejected": -70.46399688720703, + "loss": 0.3371, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1295353174209595, + "rewards/margins": 0.09331631660461426, + "rewards/rejected": 1.0362190008163452, + "step": 8871 + }, + { + "epoch": 1.44, + "learning_rate": 1.9187936478483426e-06, + "logits/chosen": -1.3062537908554077, + "logits/rejected": -1.3173240423202515, + "logps/chosen": -75.9100341796875, + "logps/rejected": -67.52056884765625, + "loss": 1.6467, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6168289184570312, + "rewards/margins": -1.9769349098205566, + "rewards/rejected": 4.593763828277588, + "step": 8872 + }, + { + "epoch": 1.44, + "learning_rate": 1.917758703841794e-06, + "logits/chosen": -1.341370701789856, + "logits/rejected": -1.23477303981781, + "logps/chosen": -76.24543762207031, + "logps/rejected": -37.543846130371094, + "loss": 0.2171, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.3138041496276855, + "rewards/margins": 0.7387635707855225, + "rewards/rejected": 3.575040578842163, + "step": 8873 + }, + { + "epoch": 1.44, + "learning_rate": 1.9167239727889527e-06, + "logits/chosen": -1.1568139791488647, + "logits/rejected": -1.1381820440292358, + "logps/chosen": -30.728139877319336, + "logps/rejected": -45.59185791015625, + "loss": 0.629, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9337457418441772, + "rewards/margins": -0.5547116994857788, + "rewards/rejected": 2.488457441329956, + "step": 8874 + }, + { + "epoch": 1.44, + "learning_rate": 1.915689454761312e-06, + "logits/chosen": -1.2423585653305054, + "logits/rejected": -1.003902554512024, + "logps/chosen": -170.57046508789062, + "logps/rejected": -151.72824096679688, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.169445991516113, + "rewards/margins": 3.8521547317504883, + "rewards/rejected": 4.317291259765625, + "step": 8875 + }, + { + "epoch": 1.44, + "learning_rate": 1.9146551498303446e-06, + "logits/chosen": -1.336302399635315, + "logits/rejected": -1.3491562604904175, + "logps/chosen": -119.82804870605469, + "logps/rejected": -128.62596130371094, + "loss": 0.6208, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6892685890197754, + "rewards/margins": 1.127314805984497, + "rewards/rejected": 1.5619537830352783, + "step": 8876 + }, + { + "epoch": 1.44, + "learning_rate": 1.913621058067514e-06, + "logits/chosen": -1.2552402019500732, + "logits/rejected": -1.3002492189407349, + "logps/chosen": -75.92566680908203, + "logps/rejected": -77.60903930664062, + "loss": 0.1993, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.201798439025879, + "rewards/margins": 1.5631449222564697, + "rewards/rejected": 2.638653516769409, + "step": 8877 + }, + { + "epoch": 1.44, + "learning_rate": 1.912587179544263e-06, + "logits/chosen": -1.2008613348007202, + "logits/rejected": -1.1598577499389648, + "logps/chosen": -55.013710021972656, + "logps/rejected": -10.435224533081055, + "loss": 0.2567, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3245948553085327, + "rewards/margins": 1.3122060298919678, + "rewards/rejected": 0.012388802133500576, + "step": 8878 + }, + { + "epoch": 1.44, + "learning_rate": 1.911553514332026e-06, + "logits/chosen": -1.2664892673492432, + "logits/rejected": -1.379324197769165, + "logps/chosen": -104.23780822753906, + "logps/rejected": -120.82539367675781, + "loss": 1.0384, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.8385515213012695, + "rewards/margins": -0.8908352851867676, + "rewards/rejected": 5.729386806488037, + "step": 8879 + }, + { + "epoch": 1.44, + "learning_rate": 1.9105200625022176e-06, + "logits/chosen": -1.430295467376709, + "logits/rejected": -1.4435973167419434, + "logps/chosen": -37.20661544799805, + "logps/rejected": -62.96903991699219, + "loss": 2.0862, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.498056411743164, + "rewards/margins": -0.7239773273468018, + "rewards/rejected": 3.222033739089966, + "step": 8880 + }, + { + "epoch": 1.44, + "learning_rate": 1.9094868241262403e-06, + "logits/chosen": -1.6682145595550537, + "logits/rejected": -1.7261499166488647, + "logps/chosen": -57.64997863769531, + "logps/rejected": -123.56169891357422, + "loss": 0.7031, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.819367408752441, + "rewards/margins": -1.1141571998596191, + "rewards/rejected": 6.9335246086120605, + "step": 8881 + }, + { + "epoch": 1.44, + "learning_rate": 1.908453799275479e-06, + "logits/chosen": -0.9530701041221619, + "logits/rejected": -0.9530701041221619, + "logps/chosen": -1.2437549829483032, + "logps/rejected": -1.2437549829483032, + "loss": 0.4037, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2640185058116913, + "rewards/margins": 0.0, + "rewards/rejected": 0.2640185058116913, + "step": 8882 + }, + { + "epoch": 1.44, + "learning_rate": 1.90742098802131e-06, + "logits/chosen": -0.8442626595497131, + "logits/rejected": -0.8442626595497131, + "logps/chosen": -35.8153076171875, + "logps/rejected": -35.8153076171875, + "loss": 0.9881, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5950874090194702, + "rewards/margins": 0.0, + "rewards/rejected": 1.5950874090194702, + "step": 8883 + }, + { + "epoch": 1.44, + "learning_rate": 1.9063883904350873e-06, + "logits/chosen": -1.5504302978515625, + "logits/rejected": -1.5221710205078125, + "logps/chosen": -33.24357223510742, + "logps/rejected": -75.75127410888672, + "loss": 0.9983, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8814411163330078, + "rewards/margins": -0.08496743440628052, + "rewards/rejected": 0.9664085507392883, + "step": 8884 + }, + { + "epoch": 1.44, + "learning_rate": 1.9053560065881554e-06, + "logits/chosen": -1.4551581144332886, + "logits/rejected": -1.4119646549224854, + "logps/chosen": -86.04327392578125, + "logps/rejected": -79.53585815429688, + "loss": 0.2484, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.068353176116943, + "rewards/margins": 0.4889645576477051, + "rewards/rejected": 4.579388618469238, + "step": 8885 + }, + { + "epoch": 1.44, + "learning_rate": 1.9043238365518403e-06, + "logits/chosen": -1.1168609857559204, + "logits/rejected": -1.102859377861023, + "logps/chosen": -82.17230987548828, + "logps/rejected": -90.53253173828125, + "loss": 0.1343, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6204659938812256, + "rewards/margins": 1.5152885913848877, + "rewards/rejected": 2.105177402496338, + "step": 8886 + }, + { + "epoch": 1.44, + "learning_rate": 1.9032918803974588e-06, + "logits/chosen": -1.2535746097564697, + "logits/rejected": -1.1777814626693726, + "logps/chosen": -84.09735107421875, + "logps/rejected": -89.48316955566406, + "loss": 0.1959, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2742278575897217, + "rewards/margins": 0.9253485202789307, + "rewards/rejected": 2.348879337310791, + "step": 8887 + }, + { + "epoch": 1.44, + "learning_rate": 1.9022601381963046e-06, + "logits/chosen": -1.3117705583572388, + "logits/rejected": -1.333275556564331, + "logps/chosen": -112.53518676757812, + "logps/rejected": -84.30709838867188, + "loss": 0.5344, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6826324462890625, + "rewards/margins": -0.5820748805999756, + "rewards/rejected": 2.264707326889038, + "step": 8888 + }, + { + "epoch": 1.44, + "learning_rate": 1.901228610019666e-06, + "logits/chosen": -1.4426268339157104, + "logits/rejected": -1.4286020994186401, + "logps/chosen": -84.96630859375, + "logps/rejected": -91.26392364501953, + "loss": 0.7981, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.166616201400757, + "rewards/margins": -1.369577169418335, + "rewards/rejected": 4.536193370819092, + "step": 8889 + }, + { + "epoch": 1.44, + "learning_rate": 1.9001972959388071e-06, + "logits/chosen": -1.5196640491485596, + "logits/rejected": -1.5158497095108032, + "logps/chosen": -104.4371337890625, + "logps/rejected": -94.79533386230469, + "loss": 0.4442, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9304641485214233, + "rewards/margins": -0.2752441167831421, + "rewards/rejected": 2.2057082653045654, + "step": 8890 + }, + { + "epoch": 1.44, + "learning_rate": 1.8991661960249868e-06, + "logits/chosen": -1.3610883951187134, + "logits/rejected": -1.404478907585144, + "logps/chosen": -81.5475845336914, + "logps/rejected": -104.31806945800781, + "loss": 2.0665, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.449516296386719, + "rewards/margins": -4.110318183898926, + "rewards/rejected": 8.559834480285645, + "step": 8891 + }, + { + "epoch": 1.44, + "learning_rate": 1.89813531034944e-06, + "logits/chosen": -0.9494611620903015, + "logits/rejected": -0.9431034326553345, + "logps/chosen": -78.28827667236328, + "logps/rejected": -87.41410827636719, + "loss": 0.3281, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0262749195098877, + "rewards/margins": 0.9454193115234375, + "rewards/rejected": 2.08085560798645, + "step": 8892 + }, + { + "epoch": 1.44, + "learning_rate": 1.8971046389833952e-06, + "logits/chosen": -1.1078656911849976, + "logits/rejected": -1.0218982696533203, + "logps/chosen": -27.610904693603516, + "logps/rejected": -10.678319931030273, + "loss": 0.458, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.221884250640869, + "rewards/margins": 1.1134196519851685, + "rewards/rejected": 1.1084645986557007, + "step": 8893 + }, + { + "epoch": 1.44, + "learning_rate": 1.8960741819980576e-06, + "logits/chosen": -1.4185302257537842, + "logits/rejected": -1.3813656568527222, + "logps/chosen": -80.35747528076172, + "logps/rejected": -94.49432373046875, + "loss": 0.1011, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.897207021713257, + "rewards/margins": 1.8932411670684814, + "rewards/rejected": 2.0039658546447754, + "step": 8894 + }, + { + "epoch": 1.44, + "learning_rate": 1.895043939464627e-06, + "logits/chosen": -1.3791038990020752, + "logits/rejected": -1.3280380964279175, + "logps/chosen": -66.60313415527344, + "logps/rejected": -65.75540161132812, + "loss": 1.0841, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2666594982147217, + "rewards/margins": -0.06871724128723145, + "rewards/rejected": 2.335376739501953, + "step": 8895 + }, + { + "epoch": 1.44, + "learning_rate": 1.8940139114542788e-06, + "logits/chosen": -1.3601990938186646, + "logits/rejected": -1.222383737564087, + "logps/chosen": -120.19367218017578, + "logps/rejected": -113.78636169433594, + "loss": 0.6783, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.2827982902526855, + "rewards/margins": -0.7623968124389648, + "rewards/rejected": 7.04519510269165, + "step": 8896 + }, + { + "epoch": 1.44, + "learning_rate": 1.8929840980381796e-06, + "logits/chosen": -1.2785829305648804, + "logits/rejected": -1.3022370338439941, + "logps/chosen": -50.47228240966797, + "logps/rejected": -50.13539123535156, + "loss": 0.3059, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.305239200592041, + "rewards/margins": 0.32967841625213623, + "rewards/rejected": 1.9755607843399048, + "step": 8897 + }, + { + "epoch": 1.44, + "learning_rate": 1.8919544992874828e-06, + "logits/chosen": -1.4118725061416626, + "logits/rejected": -1.3944576978683472, + "logps/chosen": -55.42442321777344, + "logps/rejected": -93.95610046386719, + "loss": 0.2866, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3224732875823975, + "rewards/margins": 1.4743852615356445, + "rewards/rejected": 0.8480880856513977, + "step": 8898 + }, + { + "epoch": 1.44, + "learning_rate": 1.8909251152733194e-06, + "logits/chosen": -1.0261801481246948, + "logits/rejected": -1.027695894241333, + "logps/chosen": -6.025647163391113, + "logps/rejected": -3.7249646186828613, + "loss": 0.7332, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.544489324092865, + "rewards/margins": 0.2875567674636841, + "rewards/rejected": 0.2569325566291809, + "step": 8899 + }, + { + "epoch": 1.44, + "learning_rate": 1.8898959460668142e-06, + "logits/chosen": -1.2499396800994873, + "logits/rejected": -1.113242745399475, + "logps/chosen": -55.39891052246094, + "logps/rejected": -45.885597229003906, + "loss": 0.7071, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9739043712615967, + "rewards/margins": 0.0004432201385498047, + "rewards/rejected": 2.973461151123047, + "step": 8900 + }, + { + "epoch": 1.44, + "learning_rate": 1.888866991739069e-06, + "logits/chosen": -1.0454615354537964, + "logits/rejected": -0.9220312237739563, + "logps/chosen": -43.08326721191406, + "logps/rejected": -23.505779266357422, + "loss": 0.402, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9706788063049316, + "rewards/margins": 2.0289559364318848, + "rewards/rejected": 0.9417228698730469, + "step": 8901 + }, + { + "epoch": 1.44, + "learning_rate": 1.8878382523611789e-06, + "logits/chosen": -1.0357873439788818, + "logits/rejected": -1.023110032081604, + "logps/chosen": -26.901350021362305, + "logps/rejected": -2.380352735519409, + "loss": 0.4682, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.32318687438964844, + "rewards/margins": -0.13366147875785828, + "rewards/rejected": 0.4568483531475067, + "step": 8902 + }, + { + "epoch": 1.45, + "learning_rate": 1.8868097280042163e-06, + "logits/chosen": -1.5130598545074463, + "logits/rejected": -1.4806855916976929, + "logps/chosen": -96.62245178222656, + "logps/rejected": -81.66520690917969, + "loss": 0.1609, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3526992797851562, + "rewards/margins": 1.0432265996932983, + "rewards/rejected": 1.309472680091858, + "step": 8903 + }, + { + "epoch": 1.45, + "learning_rate": 1.8857814187392458e-06, + "logits/chosen": -1.0225751399993896, + "logits/rejected": -1.1097890138626099, + "logps/chosen": -57.83586883544922, + "logps/rejected": -72.81255340576172, + "loss": 0.3286, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4225823879241943, + "rewards/margins": 0.1298508644104004, + "rewards/rejected": 3.292731523513794, + "step": 8904 + }, + { + "epoch": 1.45, + "learning_rate": 1.8847533246373106e-06, + "logits/chosen": -1.7943427562713623, + "logits/rejected": -1.6265079975128174, + "logps/chosen": -67.94788360595703, + "logps/rejected": -81.83694458007812, + "loss": 0.1418, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.283382415771484, + "rewards/margins": 1.1658101081848145, + "rewards/rejected": 5.11757230758667, + "step": 8905 + }, + { + "epoch": 1.45, + "learning_rate": 1.8837254457694453e-06, + "logits/chosen": -1.2464035749435425, + "logits/rejected": -1.3378829956054688, + "logps/chosen": -78.11830139160156, + "logps/rejected": -86.53970336914062, + "loss": 1.6003, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0902421474456787, + "rewards/margins": -3.0068953037261963, + "rewards/rejected": 5.097137451171875, + "step": 8906 + }, + { + "epoch": 1.45, + "learning_rate": 1.882697782206664e-06, + "logits/chosen": -1.605317234992981, + "logits/rejected": -1.605317234992981, + "logps/chosen": -53.8165283203125, + "logps/rejected": -53.8165283203125, + "loss": 0.3575, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5644569396972656, + "rewards/margins": 0.0, + "rewards/rejected": 2.5644569396972656, + "step": 8907 + }, + { + "epoch": 1.45, + "learning_rate": 1.881670334019971e-06, + "logits/chosen": -0.8062609434127808, + "logits/rejected": -0.752525269985199, + "logps/chosen": -68.62603759765625, + "logps/rejected": -73.36124420166016, + "loss": 1.0536, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.297841787338257, + "rewards/margins": 0.22450566291809082, + "rewards/rejected": 2.073336124420166, + "step": 8908 + }, + { + "epoch": 1.45, + "learning_rate": 1.8806431012803506e-06, + "logits/chosen": -0.9866288304328918, + "logits/rejected": -1.067535638809204, + "logps/chosen": -61.586692810058594, + "logps/rejected": -101.46251678466797, + "loss": 1.1546, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1946113109588623, + "rewards/margins": -2.1310503482818604, + "rewards/rejected": 4.325661659240723, + "step": 8909 + }, + { + "epoch": 1.45, + "learning_rate": 1.879616084058778e-06, + "logits/chosen": -1.6304130554199219, + "logits/rejected": -1.6883867979049683, + "logps/chosen": -98.5850601196289, + "logps/rejected": -244.920166015625, + "loss": 0.1308, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.17106556892395, + "rewards/margins": 1.2092186212539673, + "rewards/rejected": 1.961846947669983, + "step": 8910 + }, + { + "epoch": 1.45, + "learning_rate": 1.878589282426207e-06, + "logits/chosen": -1.263985514640808, + "logits/rejected": -1.3806322813034058, + "logps/chosen": -71.53556823730469, + "logps/rejected": -94.10755920410156, + "loss": 2.7167, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.30002760887146, + "rewards/margins": -3.0683577060699463, + "rewards/rejected": 5.368385314941406, + "step": 8911 + }, + { + "epoch": 1.45, + "learning_rate": 1.8775626964535832e-06, + "logits/chosen": -0.7446897625923157, + "logits/rejected": -0.7660196423530579, + "logps/chosen": -107.03263854980469, + "logps/rejected": -73.33605194091797, + "loss": 0.0432, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9794633388519287, + "rewards/margins": 2.44545841217041, + "rewards/rejected": 1.534005045890808, + "step": 8912 + }, + { + "epoch": 1.45, + "learning_rate": 1.8765363262118302e-06, + "logits/chosen": -0.9668753743171692, + "logits/rejected": -1.018264651298523, + "logps/chosen": -16.25481414794922, + "logps/rejected": -60.54147720336914, + "loss": 2.7886, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.47301846742630005, + "rewards/margins": -4.381001949310303, + "rewards/rejected": 4.854020595550537, + "step": 8913 + }, + { + "epoch": 1.45, + "learning_rate": 1.875510171771865e-06, + "logits/chosen": -1.1863797903060913, + "logits/rejected": -1.1864017248153687, + "logps/chosen": -3.963859796524048, + "logps/rejected": -6.6235737800598145, + "loss": 0.6535, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29321300983428955, + "rewards/margins": 0.16537368297576904, + "rewards/rejected": 0.1278393268585205, + "step": 8914 + }, + { + "epoch": 1.45, + "learning_rate": 1.8744842332045804e-06, + "logits/chosen": -1.3933337926864624, + "logits/rejected": -1.41948664188385, + "logps/chosen": -73.09368896484375, + "logps/rejected": -98.07347106933594, + "loss": 0.415, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5024850368499756, + "rewards/margins": -0.21189641952514648, + "rewards/rejected": 3.714381456375122, + "step": 8915 + }, + { + "epoch": 1.45, + "learning_rate": 1.8734585105808634e-06, + "logits/chosen": -1.5993821620941162, + "logits/rejected": -1.5126069784164429, + "logps/chosen": -79.45022583007812, + "logps/rejected": -27.811599731445312, + "loss": 0.657, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8187522888183594, + "rewards/margins": 0.22500914335250854, + "rewards/rejected": 0.5937431454658508, + "step": 8916 + }, + { + "epoch": 1.45, + "learning_rate": 1.8724330039715777e-06, + "logits/chosen": -1.1484012603759766, + "logits/rejected": -1.1426552534103394, + "logps/chosen": -127.97003936767578, + "logps/rejected": -85.36493682861328, + "loss": 0.845, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.126941680908203, + "rewards/margins": -1.1302490234375, + "rewards/rejected": 3.257190704345703, + "step": 8917 + }, + { + "epoch": 1.45, + "learning_rate": 1.87140771344758e-06, + "logits/chosen": -1.3116612434387207, + "logits/rejected": -1.2739566564559937, + "logps/chosen": -23.258399963378906, + "logps/rejected": -44.636024475097656, + "loss": 0.7636, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.145111083984375, + "rewards/margins": 0.4828425645828247, + "rewards/rejected": 1.6622685194015503, + "step": 8918 + }, + { + "epoch": 1.45, + "learning_rate": 1.8703826390797047e-06, + "logits/chosen": -1.2783292531967163, + "logits/rejected": -1.4151403903961182, + "logps/chosen": -102.88135528564453, + "logps/rejected": -85.68499755859375, + "loss": 2.9812, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5059471130371094, + "rewards/margins": -5.508133888244629, + "rewards/rejected": 7.014081001281738, + "step": 8919 + }, + { + "epoch": 1.45, + "learning_rate": 1.869357780938778e-06, + "logits/chosen": -1.1794226169586182, + "logits/rejected": -1.2699204683303833, + "logps/chosen": -22.130645751953125, + "logps/rejected": -101.2142562866211, + "loss": 1.3753, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8543717861175537, + "rewards/margins": -2.455739736557007, + "rewards/rejected": 5.3101115226745605, + "step": 8920 + }, + { + "epoch": 1.45, + "learning_rate": 1.8683331390956044e-06, + "logits/chosen": -1.432634949684143, + "logits/rejected": -1.3824654817581177, + "logps/chosen": -84.0107192993164, + "logps/rejected": -70.73746490478516, + "loss": 0.3502, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.996540069580078, + "rewards/margins": 0.5359251499176025, + "rewards/rejected": 2.4606149196624756, + "step": 8921 + }, + { + "epoch": 1.45, + "learning_rate": 1.8673087136209804e-06, + "logits/chosen": -1.4454634189605713, + "logits/rejected": -1.1251682043075562, + "logps/chosen": -191.41094970703125, + "logps/rejected": -108.13818359375, + "loss": 0.2837, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.132837295532227, + "rewards/margins": 0.28043413162231445, + "rewards/rejected": 7.852403163909912, + "step": 8922 + }, + { + "epoch": 1.45, + "learning_rate": 1.8662845045856809e-06, + "logits/chosen": -1.263764500617981, + "logits/rejected": -1.2661569118499756, + "logps/chosen": -43.84477996826172, + "logps/rejected": -58.533897399902344, + "loss": 0.1111, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6383073329925537, + "rewards/margins": 1.516688585281372, + "rewards/rejected": 2.1216187477111816, + "step": 8923 + }, + { + "epoch": 1.45, + "learning_rate": 1.8652605120604727e-06, + "logits/chosen": -1.2747690677642822, + "logits/rejected": -1.2907224893569946, + "logps/chosen": -52.1259765625, + "logps/rejected": -59.08258056640625, + "loss": 0.5912, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.683126926422119, + "rewards/margins": 0.7058106660842896, + "rewards/rejected": 1.9773162603378296, + "step": 8924 + }, + { + "epoch": 1.45, + "learning_rate": 1.8642367361160996e-06, + "logits/chosen": -1.3491696119308472, + "logits/rejected": -1.365077257156372, + "logps/chosen": -66.50389099121094, + "logps/rejected": -111.43934631347656, + "loss": 0.8669, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9702881574630737, + "rewards/margins": 0.42703866958618164, + "rewards/rejected": 1.543249487876892, + "step": 8925 + }, + { + "epoch": 1.45, + "learning_rate": 1.8632131768232992e-06, + "logits/chosen": -1.1124444007873535, + "logits/rejected": -1.0307475328445435, + "logps/chosen": -90.34822082519531, + "logps/rejected": -18.99639129638672, + "loss": 0.0983, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8905060291290283, + "rewards/margins": 1.6744136810302734, + "rewards/rejected": 0.2160923033952713, + "step": 8926 + }, + { + "epoch": 1.45, + "learning_rate": 1.862189834252786e-06, + "logits/chosen": -1.201068639755249, + "logits/rejected": -1.201068639755249, + "logps/chosen": -36.7662467956543, + "logps/rejected": -36.7662467956543, + "loss": 0.3638, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1375997066497803, + "rewards/margins": 0.0, + "rewards/rejected": 2.1375997066497803, + "step": 8927 + }, + { + "epoch": 1.45, + "learning_rate": 1.8611667084752666e-06, + "logits/chosen": -1.0100840330123901, + "logits/rejected": -0.9829170107841492, + "logps/chosen": -66.0907211303711, + "logps/rejected": -74.72238159179688, + "loss": 0.4237, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5831390619277954, + "rewards/margins": 0.3540092706680298, + "rewards/rejected": 1.2291297912597656, + "step": 8928 + }, + { + "epoch": 1.45, + "learning_rate": 1.8601437995614263e-06, + "logits/chosen": -1.2353496551513672, + "logits/rejected": -1.0883336067199707, + "logps/chosen": -138.38327026367188, + "logps/rejected": -86.71910858154297, + "loss": 0.1616, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9162368774414062, + "rewards/margins": 1.1175720691680908, + "rewards/rejected": 2.7986648082733154, + "step": 8929 + }, + { + "epoch": 1.45, + "learning_rate": 1.8591211075819416e-06, + "logits/chosen": -1.42878258228302, + "logits/rejected": -1.5745065212249756, + "logps/chosen": -37.36220932006836, + "logps/rejected": -34.20383834838867, + "loss": 0.636, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6829915046691895, + "rewards/margins": 2.3980653285980225, + "rewards/rejected": 0.28492623567581177, + "step": 8930 + }, + { + "epoch": 1.45, + "learning_rate": 1.8580986326074668e-06, + "logits/chosen": -1.1583099365234375, + "logits/rejected": -1.0903469324111938, + "logps/chosen": -55.973106384277344, + "logps/rejected": -57.26172637939453, + "loss": 1.2259, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.179622650146484, + "rewards/margins": -1.4754786491394043, + "rewards/rejected": 5.655101299285889, + "step": 8931 + }, + { + "epoch": 1.45, + "learning_rate": 1.8570763747086496e-06, + "logits/chosen": -1.2343460321426392, + "logits/rejected": -1.1873786449432373, + "logps/chosen": -56.666343688964844, + "logps/rejected": -56.70074462890625, + "loss": 0.3715, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.904576063156128, + "rewards/margins": 0.06460785865783691, + "rewards/rejected": 2.839968204498291, + "step": 8932 + }, + { + "epoch": 1.45, + "learning_rate": 1.8560543339561149e-06, + "logits/chosen": -1.2110371589660645, + "logits/rejected": -1.2110371589660645, + "logps/chosen": -19.27665901184082, + "logps/rejected": -19.27665901184082, + "loss": 0.4882, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3824586868286133, + "rewards/margins": 0.0, + "rewards/rejected": 2.3824586868286133, + "step": 8933 + }, + { + "epoch": 1.45, + "learning_rate": 1.855032510420477e-06, + "logits/chosen": -1.0618840456008911, + "logits/rejected": -1.0090558528900146, + "logps/chosen": -31.403379440307617, + "logps/rejected": -29.873538970947266, + "loss": 0.2887, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9088640213012695, + "rewards/margins": 0.8900738954544067, + "rewards/rejected": 1.0187901258468628, + "step": 8934 + }, + { + "epoch": 1.45, + "learning_rate": 1.8540109041723364e-06, + "logits/chosen": -1.5314010381698608, + "logits/rejected": -1.5669968128204346, + "logps/chosen": -72.06680297851562, + "logps/rejected": -85.73971557617188, + "loss": 1.2432, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0519187450408936, + "rewards/margins": -2.176398515701294, + "rewards/rejected": 5.2283172607421875, + "step": 8935 + }, + { + "epoch": 1.45, + "learning_rate": 1.8529895152822736e-06, + "logits/chosen": -1.22879958152771, + "logits/rejected": -1.2151646614074707, + "logps/chosen": -66.28231811523438, + "logps/rejected": -50.0917854309082, + "loss": 0.935, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8180084228515625, + "rewards/margins": -0.04736983776092529, + "rewards/rejected": 1.8653782606124878, + "step": 8936 + }, + { + "epoch": 1.45, + "learning_rate": 1.851968343820859e-06, + "logits/chosen": -1.3078505992889404, + "logits/rejected": -1.3174290657043457, + "logps/chosen": -54.782814025878906, + "logps/rejected": -61.19430160522461, + "loss": 1.1799, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1176971197128296, + "rewards/margins": -2.2577266693115234, + "rewards/rejected": 3.3754239082336426, + "step": 8937 + }, + { + "epoch": 1.45, + "learning_rate": 1.8509473898586432e-06, + "logits/chosen": -1.3599826097488403, + "logits/rejected": -1.3479623794555664, + "logps/chosen": -55.25654602050781, + "logps/rejected": -80.79483795166016, + "loss": 0.6533, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1747665405273438, + "rewards/margins": -0.9327278137207031, + "rewards/rejected": 3.107494354248047, + "step": 8938 + }, + { + "epoch": 1.45, + "learning_rate": 1.849926653466168e-06, + "logits/chosen": -1.2387065887451172, + "logits/rejected": -1.2454109191894531, + "logps/chosen": -68.38197326660156, + "logps/rejected": -54.85529327392578, + "loss": 0.2536, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.385052442550659, + "rewards/margins": 0.4392608404159546, + "rewards/rejected": 1.9457916021347046, + "step": 8939 + }, + { + "epoch": 1.45, + "learning_rate": 1.848906134713953e-06, + "logits/chosen": -1.1095843315124512, + "logits/rejected": -1.1496257781982422, + "logps/chosen": -119.72352600097656, + "logps/rejected": -110.27523040771484, + "loss": 0.7924, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2455506324768066, + "rewards/margins": -1.3470022678375244, + "rewards/rejected": 3.592552900314331, + "step": 8940 + }, + { + "epoch": 1.45, + "learning_rate": 1.8478858336725104e-06, + "logits/chosen": -1.083011269569397, + "logits/rejected": -1.1103992462158203, + "logps/chosen": -42.821495056152344, + "logps/rejected": -62.35462951660156, + "loss": 0.3945, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8325515985488892, + "rewards/margins": 0.2356811761856079, + "rewards/rejected": 1.5968704223632812, + "step": 8941 + }, + { + "epoch": 1.45, + "learning_rate": 1.8468657504123288e-06, + "logits/chosen": -1.163257360458374, + "logits/rejected": -1.2538511753082275, + "logps/chosen": -73.04631042480469, + "logps/rejected": -104.17660522460938, + "loss": 1.1074, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9211044311523438, + "rewards/margins": -1.0741608142852783, + "rewards/rejected": 2.995265245437622, + "step": 8942 + }, + { + "epoch": 1.45, + "learning_rate": 1.8458458850038907e-06, + "logits/chosen": -1.0905853509902954, + "logits/rejected": -1.065002679824829, + "logps/chosen": -20.754314422607422, + "logps/rejected": -6.639385223388672, + "loss": 0.5042, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1055641174316406, + "rewards/margins": 1.399486780166626, + "rewards/rejected": 0.7060773968696594, + "step": 8943 + }, + { + "epoch": 1.45, + "learning_rate": 1.8448262375176552e-06, + "logits/chosen": -1.2260046005249023, + "logits/rejected": -1.2044951915740967, + "logps/chosen": -31.744659423828125, + "logps/rejected": -29.759862899780273, + "loss": 0.4898, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6342708468437195, + "rewards/margins": -0.5014566779136658, + "rewards/rejected": 1.1357275247573853, + "step": 8944 + }, + { + "epoch": 1.45, + "learning_rate": 1.843806808024074e-06, + "logits/chosen": -1.4767457246780396, + "logits/rejected": -1.4946452379226685, + "logps/chosen": -135.90261840820312, + "logps/rejected": -76.3681411743164, + "loss": 0.1665, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.64149808883667, + "rewards/margins": 0.9591028690338135, + "rewards/rejected": 3.6823952198028564, + "step": 8945 + }, + { + "epoch": 1.45, + "learning_rate": 1.842787596593576e-06, + "logits/chosen": -1.6982418298721313, + "logits/rejected": -1.6578736305236816, + "logps/chosen": -98.56887817382812, + "logps/rejected": -73.72923278808594, + "loss": 0.5255, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6593079566955566, + "rewards/margins": -0.6104559898376465, + "rewards/rejected": 4.269763946533203, + "step": 8946 + }, + { + "epoch": 1.45, + "learning_rate": 1.841768603296583e-06, + "logits/chosen": -1.5491738319396973, + "logits/rejected": -1.4093081951141357, + "logps/chosen": -131.27352905273438, + "logps/rejected": -56.67364501953125, + "loss": 0.0413, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.3649187088012695, + "rewards/margins": 2.706681251525879, + "rewards/rejected": 3.6582374572753906, + "step": 8947 + }, + { + "epoch": 1.45, + "learning_rate": 1.8407498282034951e-06, + "logits/chosen": -1.304651141166687, + "logits/rejected": -1.2779759168624878, + "logps/chosen": -78.89988708496094, + "logps/rejected": -49.666282653808594, + "loss": 0.9014, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4796433448791504, + "rewards/margins": -0.568781852722168, + "rewards/rejected": 4.048425197601318, + "step": 8948 + }, + { + "epoch": 1.45, + "learning_rate": 1.839731271384702e-06, + "logits/chosen": -1.3625816106796265, + "logits/rejected": -1.3345164060592651, + "logps/chosen": -30.77411651611328, + "logps/rejected": -7.121275424957275, + "loss": 0.3975, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7136417627334595, + "rewards/margins": 0.8859491348266602, + "rewards/rejected": 0.8276926279067993, + "step": 8949 + }, + { + "epoch": 1.45, + "learning_rate": 1.8387129329105742e-06, + "logits/chosen": -1.024700403213501, + "logits/rejected": -1.024700403213501, + "logps/chosen": -9.55180835723877, + "logps/rejected": -9.55180835723877, + "loss": 0.4479, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.034648992121219635, + "rewards/margins": 0.0, + "rewards/rejected": 0.034648992121219635, + "step": 8950 + }, + { + "epoch": 1.45, + "learning_rate": 1.8376948128514716e-06, + "logits/chosen": -1.0027954578399658, + "logits/rejected": -1.0027954578399658, + "logps/chosen": -72.24247741699219, + "logps/rejected": -72.24247741699219, + "loss": 1.4841, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.847137451171875, + "rewards/margins": 0.0, + "rewards/rejected": 1.847137451171875, + "step": 8951 + }, + { + "epoch": 1.45, + "learning_rate": 1.8366769112777333e-06, + "logits/chosen": -1.2853522300720215, + "logits/rejected": -1.3481600284576416, + "logps/chosen": -49.69453048706055, + "logps/rejected": -52.83268356323242, + "loss": 0.939, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3749775886535645, + "rewards/margins": 0.028627872467041016, + "rewards/rejected": 2.3463497161865234, + "step": 8952 + }, + { + "epoch": 1.45, + "learning_rate": 1.8356592282596914e-06, + "logits/chosen": -1.4549801349639893, + "logits/rejected": -1.4575939178466797, + "logps/chosen": -167.37940979003906, + "logps/rejected": -106.46270751953125, + "loss": 0.0307, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.186702251434326, + "rewards/margins": 4.340484619140625, + "rewards/rejected": 1.846217393875122, + "step": 8953 + }, + { + "epoch": 1.45, + "learning_rate": 1.8346417638676533e-06, + "logits/chosen": -1.0692416429519653, + "logits/rejected": -1.0225132703781128, + "logps/chosen": -62.65399169921875, + "logps/rejected": -55.004310607910156, + "loss": 0.5988, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3906296491622925, + "rewards/margins": -0.7661293745040894, + "rewards/rejected": 2.156759023666382, + "step": 8954 + }, + { + "epoch": 1.45, + "learning_rate": 1.8336245181719203e-06, + "logits/chosen": -1.4672552347183228, + "logits/rejected": -1.2645198106765747, + "logps/chosen": -130.06903076171875, + "logps/rejected": -37.89573669433594, + "loss": 0.0335, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.101678371429443, + "rewards/margins": 2.7389495372772217, + "rewards/rejected": 3.3627288341522217, + "step": 8955 + }, + { + "epoch": 1.45, + "learning_rate": 1.8326074912427704e-06, + "logits/chosen": -0.7953030467033386, + "logits/rejected": -0.8592569828033447, + "logps/chosen": -68.84383392333984, + "logps/rejected": -56.12743377685547, + "loss": 0.5547, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7197411060333252, + "rewards/margins": -0.3299727439880371, + "rewards/rejected": 2.0497138500213623, + "step": 8956 + }, + { + "epoch": 1.45, + "learning_rate": 1.8315906831504753e-06, + "logits/chosen": -1.2607089281082153, + "logits/rejected": -1.3603147268295288, + "logps/chosen": -89.56961059570312, + "logps/rejected": -69.79611206054688, + "loss": 1.1509, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6025772094726562, + "rewards/margins": -2.1960105895996094, + "rewards/rejected": 3.7985877990722656, + "step": 8957 + }, + { + "epoch": 1.45, + "learning_rate": 1.8305740939652822e-06, + "logits/chosen": -1.489592432975769, + "logits/rejected": -1.5300768613815308, + "logps/chosen": -127.64146423339844, + "logps/rejected": -119.27450561523438, + "loss": 0.4179, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9504897594451904, + "rewards/margins": -0.202484130859375, + "rewards/rejected": 3.1529738903045654, + "step": 8958 + }, + { + "epoch": 1.45, + "learning_rate": 1.8295577237574313e-06, + "logits/chosen": -1.6829928159713745, + "logits/rejected": -1.3560285568237305, + "logps/chosen": -106.40506744384766, + "logps/rejected": -83.91712951660156, + "loss": 0.3332, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.780637502670288, + "rewards/margins": 1.1428582668304443, + "rewards/rejected": 2.6377792358398438, + "step": 8959 + }, + { + "epoch": 1.45, + "learning_rate": 1.8285415725971406e-06, + "logits/chosen": -1.1196367740631104, + "logits/rejected": -1.023801326751709, + "logps/chosen": -62.425716400146484, + "logps/rejected": -25.357616424560547, + "loss": 0.2305, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.923781156539917, + "rewards/margins": 1.7426711320877075, + "rewards/rejected": 1.1811100244522095, + "step": 8960 + }, + { + "epoch": 1.45, + "learning_rate": 1.827525640554621e-06, + "logits/chosen": -1.2994986772537231, + "logits/rejected": -1.1510376930236816, + "logps/chosen": -83.28575134277344, + "logps/rejected": -39.43390655517578, + "loss": 0.4078, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.618527412414551, + "rewards/margins": 2.596837282180786, + "rewards/rejected": 2.0216901302337646, + "step": 8961 + }, + { + "epoch": 1.45, + "learning_rate": 1.8265099277000614e-06, + "logits/chosen": -0.9412521123886108, + "logits/rejected": -0.9381595253944397, + "logps/chosen": -134.561279296875, + "logps/rejected": -129.6579132080078, + "loss": 0.0611, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9573638439178467, + "rewards/margins": 2.075178384780884, + "rewards/rejected": 0.8821853995323181, + "step": 8962 + }, + { + "epoch": 1.45, + "learning_rate": 1.8254944341036356e-06, + "logits/chosen": -1.4078218936920166, + "logits/rejected": -1.3244712352752686, + "logps/chosen": -77.05555725097656, + "logps/rejected": -45.46799087524414, + "loss": 0.713, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8895606994628906, + "rewards/margins": 2.3217296600341797, + "rewards/rejected": 0.5678310394287109, + "step": 8963 + }, + { + "epoch": 1.45, + "learning_rate": 1.8244791598355094e-06, + "logits/chosen": -1.4532440900802612, + "logits/rejected": -1.4339476823806763, + "logps/chosen": -71.77161407470703, + "logps/rejected": -67.12425994873047, + "loss": 0.4201, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.002904415130615, + "rewards/margins": 1.8418691158294678, + "rewards/rejected": 2.1610352993011475, + "step": 8964 + }, + { + "epoch": 1.46, + "learning_rate": 1.8234641049658242e-06, + "logits/chosen": -1.299819827079773, + "logits/rejected": -1.2858779430389404, + "logps/chosen": -86.17514038085938, + "logps/rejected": -71.43804931640625, + "loss": 0.465, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.029571533203125, + "rewards/margins": -0.40139341354370117, + "rewards/rejected": 4.430964946746826, + "step": 8965 + }, + { + "epoch": 1.46, + "learning_rate": 1.8224492695647145e-06, + "logits/chosen": -1.4559047222137451, + "logits/rejected": -1.4663686752319336, + "logps/chosen": -79.45846557617188, + "logps/rejected": -79.707763671875, + "loss": 0.5093, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7898834943771362, + "rewards/margins": 0.6225708723068237, + "rewards/rejected": 1.1673126220703125, + "step": 8966 + }, + { + "epoch": 1.46, + "learning_rate": 1.821434653702292e-06, + "logits/chosen": -0.9214089512825012, + "logits/rejected": -0.9214089512825012, + "logps/chosen": -51.973915100097656, + "logps/rejected": -51.973915100097656, + "loss": 0.4378, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8161919116973877, + "rewards/margins": 0.0, + "rewards/rejected": 2.8161919116973877, + "step": 8967 + }, + { + "epoch": 1.46, + "learning_rate": 1.8204202574486611e-06, + "logits/chosen": -1.459018588066101, + "logits/rejected": -1.4276747703552246, + "logps/chosen": -79.94992065429688, + "logps/rejected": -82.24078369140625, + "loss": 0.3655, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4898239374160767, + "rewards/margins": 0.5647048950195312, + "rewards/rejected": 0.9251190423965454, + "step": 8968 + }, + { + "epoch": 1.46, + "learning_rate": 1.8194060808739029e-06, + "logits/chosen": -1.5544601678848267, + "logits/rejected": -1.5906296968460083, + "logps/chosen": -101.6377944946289, + "logps/rejected": -71.868896484375, + "loss": 0.3558, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0323586463928223, + "rewards/margins": 0.3649735450744629, + "rewards/rejected": 2.6673851013183594, + "step": 8969 + }, + { + "epoch": 1.46, + "learning_rate": 1.8183921240480912e-06, + "logits/chosen": -1.472302794456482, + "logits/rejected": -1.3670037984848022, + "logps/chosen": -95.01544952392578, + "logps/rejected": -78.5232162475586, + "loss": 0.3191, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.04675817489624, + "rewards/margins": 0.11836385726928711, + "rewards/rejected": 4.928394317626953, + "step": 8970 + }, + { + "epoch": 1.46, + "learning_rate": 1.8173783870412776e-06, + "logits/chosen": -1.2376751899719238, + "logits/rejected": -1.1189121007919312, + "logps/chosen": -50.28434753417969, + "logps/rejected": -21.894001007080078, + "loss": 0.4123, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.707241058349609, + "rewards/margins": 3.208488941192627, + "rewards/rejected": 1.498752236366272, + "step": 8971 + }, + { + "epoch": 1.46, + "learning_rate": 1.8163648699235048e-06, + "logits/chosen": -1.6628671884536743, + "logits/rejected": -1.7317297458648682, + "logps/chosen": -80.73202514648438, + "logps/rejected": -35.32590103149414, + "loss": 0.3335, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.118539571762085, + "rewards/margins": 1.8406872749328613, + "rewards/rejected": 0.27785226702690125, + "step": 8972 + }, + { + "epoch": 1.46, + "learning_rate": 1.8153515727647942e-06, + "logits/chosen": -1.3481388092041016, + "logits/rejected": -1.5062845945358276, + "logps/chosen": -46.819602966308594, + "logps/rejected": -95.78947448730469, + "loss": 3.0602, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5494987964630127, + "rewards/margins": -5.859722137451172, + "rewards/rejected": 9.409220695495605, + "step": 8973 + }, + { + "epoch": 1.46, + "learning_rate": 1.814338495635158e-06, + "logits/chosen": -1.6442168951034546, + "logits/rejected": -1.7703570127487183, + "logps/chosen": -238.60662841796875, + "logps/rejected": -147.23080444335938, + "loss": 0.2198, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.283978462219238, + "rewards/margins": 0.6939606666564941, + "rewards/rejected": 6.590017795562744, + "step": 8974 + }, + { + "epoch": 1.46, + "learning_rate": 1.8133256386045872e-06, + "logits/chosen": -1.1198517084121704, + "logits/rejected": -1.255705714225769, + "logps/chosen": -59.059146881103516, + "logps/rejected": -115.09854125976562, + "loss": 1.702, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.438871383666992, + "rewards/margins": -2.7284231185913086, + "rewards/rejected": 6.167294502258301, + "step": 8975 + }, + { + "epoch": 1.46, + "learning_rate": 1.8123130017430635e-06, + "logits/chosen": -1.0952898263931274, + "logits/rejected": -1.0062720775604248, + "logps/chosen": -52.309295654296875, + "logps/rejected": -43.63733673095703, + "loss": 0.2234, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.175382375717163, + "rewards/margins": 0.6473020315170288, + "rewards/rejected": 1.5280803442001343, + "step": 8976 + }, + { + "epoch": 1.46, + "learning_rate": 1.8113005851205479e-06, + "logits/chosen": -1.697778344154358, + "logits/rejected": -1.6782643795013428, + "logps/chosen": -67.0272445678711, + "logps/rejected": -41.530189514160156, + "loss": 1.4856, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.177377462387085, + "rewards/margins": -1.3394668102264404, + "rewards/rejected": 3.5168442726135254, + "step": 8977 + }, + { + "epoch": 1.46, + "learning_rate": 1.8102883888069917e-06, + "logits/chosen": -1.235707402229309, + "logits/rejected": -1.0959275960922241, + "logps/chosen": -88.41239166259766, + "logps/rejected": -38.756534576416016, + "loss": 0.0607, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.356494903564453, + "rewards/margins": 2.079815626144409, + "rewards/rejected": 2.276679277420044, + "step": 8978 + }, + { + "epoch": 1.46, + "learning_rate": 1.8092764128723245e-06, + "logits/chosen": -1.1162385940551758, + "logits/rejected": -1.0882208347320557, + "logps/chosen": -59.733802795410156, + "logps/rejected": -82.19050598144531, + "loss": 0.9902, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1591637134552, + "rewards/margins": 1.3914765119552612, + "rewards/rejected": 1.767687201499939, + "step": 8979 + }, + { + "epoch": 1.46, + "learning_rate": 1.8082646573864682e-06, + "logits/chosen": -1.5087311267852783, + "logits/rejected": -1.5087311267852783, + "logps/chosen": -37.86267852783203, + "logps/rejected": -37.86267852783203, + "loss": 0.3852, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9791336059570312, + "rewards/margins": 0.0, + "rewards/rejected": 1.9791336059570312, + "step": 8980 + }, + { + "epoch": 1.46, + "learning_rate": 1.8072531224193218e-06, + "logits/chosen": -1.0285028219223022, + "logits/rejected": -0.9422820210456848, + "logps/chosen": -36.64533233642578, + "logps/rejected": -50.27069091796875, + "loss": 0.2058, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.619683027267456, + "rewards/margins": 0.9646544456481934, + "rewards/rejected": 1.6550285816192627, + "step": 8981 + }, + { + "epoch": 1.46, + "learning_rate": 1.8062418080407762e-06, + "logits/chosen": -1.180777668952942, + "logits/rejected": -1.1326014995574951, + "logps/chosen": -63.1790885925293, + "logps/rejected": -67.08650207519531, + "loss": 0.4403, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.251138687133789, + "rewards/margins": 3.075599670410156e-05, + "rewards/rejected": 2.251107931137085, + "step": 8982 + }, + { + "epoch": 1.46, + "learning_rate": 1.805230714320701e-06, + "logits/chosen": -1.0641998052597046, + "logits/rejected": -1.0641998052597046, + "logps/chosen": -48.71057891845703, + "logps/rejected": -48.71057891845703, + "loss": 0.431, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.117468357086182, + "rewards/margins": 0.0, + "rewards/rejected": 4.117468357086182, + "step": 8983 + }, + { + "epoch": 1.46, + "learning_rate": 1.8042198413289553e-06, + "logits/chosen": -1.2562849521636963, + "logits/rejected": -1.1977020502090454, + "logps/chosen": -98.94740295410156, + "logps/rejected": -63.96334457397461, + "loss": 0.0645, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9165024757385254, + "rewards/margins": 2.04081392288208, + "rewards/rejected": 0.8756885528564453, + "step": 8984 + }, + { + "epoch": 1.46, + "learning_rate": 1.803209189135378e-06, + "logits/chosen": -1.19486403465271, + "logits/rejected": -1.19486403465271, + "logps/chosen": -54.73259735107422, + "logps/rejected": -54.73259735107422, + "loss": 0.5786, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1272714138031006, + "rewards/margins": 0.0, + "rewards/rejected": 2.1272714138031006, + "step": 8985 + }, + { + "epoch": 1.46, + "learning_rate": 1.8021987578097994e-06, + "logits/chosen": -1.2760474681854248, + "logits/rejected": -1.1921565532684326, + "logps/chosen": -100.83415222167969, + "logps/rejected": -83.96459197998047, + "loss": 0.0784, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.498486518859863, + "rewards/margins": 1.8177270889282227, + "rewards/rejected": 4.680759429931641, + "step": 8986 + }, + { + "epoch": 1.46, + "learning_rate": 1.801188547422027e-06, + "logits/chosen": -1.43007493019104, + "logits/rejected": -1.5039079189300537, + "logps/chosen": -64.79824829101562, + "logps/rejected": -100.07534790039062, + "loss": 1.2524, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.750959873199463, + "rewards/margins": -2.097865104675293, + "rewards/rejected": 7.848824977874756, + "step": 8987 + }, + { + "epoch": 1.46, + "learning_rate": 1.8001785580418607e-06, + "logits/chosen": -1.275460124015808, + "logits/rejected": -1.2852864265441895, + "logps/chosen": -101.2587661743164, + "logps/rejected": -62.50396728515625, + "loss": 0.3546, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2864882946014404, + "rewards/margins": 0.008238077163696289, + "rewards/rejected": 2.278250217437744, + "step": 8988 + }, + { + "epoch": 1.46, + "learning_rate": 1.799168789739077e-06, + "logits/chosen": -1.093861699104309, + "logits/rejected": -1.0611492395401, + "logps/chosen": -31.183534622192383, + "logps/rejected": -3.521209955215454, + "loss": 0.4726, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9658986926078796, + "rewards/margins": 0.4399012327194214, + "rewards/rejected": 0.5259974598884583, + "step": 8989 + }, + { + "epoch": 1.46, + "learning_rate": 1.7981592425834432e-06, + "logits/chosen": -1.0435527563095093, + "logits/rejected": -0.9973492622375488, + "logps/chosen": -49.20527648925781, + "logps/rejected": -53.77739715576172, + "loss": 2.065, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8219505548477173, + "rewards/margins": -0.5528250932693481, + "rewards/rejected": 2.3747756481170654, + "step": 8990 + }, + { + "epoch": 1.46, + "learning_rate": 1.7971499166447114e-06, + "logits/chosen": -1.142880916595459, + "logits/rejected": -1.046675443649292, + "logps/chosen": -88.94572448730469, + "logps/rejected": -28.868675231933594, + "loss": 0.1651, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.139239549636841, + "rewards/margins": 1.213029146194458, + "rewards/rejected": 0.9262104034423828, + "step": 8991 + }, + { + "epoch": 1.46, + "learning_rate": 1.7961408119926133e-06, + "logits/chosen": -1.4142589569091797, + "logits/rejected": -1.5698614120483398, + "logps/chosen": -118.57245635986328, + "logps/rejected": -125.18324279785156, + "loss": 0.8453, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.295774221420288, + "rewards/margins": -1.3077795505523682, + "rewards/rejected": 4.603553771972656, + "step": 8992 + }, + { + "epoch": 1.46, + "learning_rate": 1.7951319286968711e-06, + "logits/chosen": -1.3730188608169556, + "logits/rejected": -1.1813563108444214, + "logps/chosen": -87.35964965820312, + "logps/rejected": -32.66917037963867, + "loss": 0.2473, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.557330369949341, + "rewards/margins": 3.0017261505126953, + "rewards/rejected": 0.5556041598320007, + "step": 8993 + }, + { + "epoch": 1.46, + "learning_rate": 1.7941232668271863e-06, + "logits/chosen": -0.9381012916564941, + "logits/rejected": -1.063737154006958, + "logps/chosen": -113.33103942871094, + "logps/rejected": -103.45323181152344, + "loss": 2.311, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6461563110351562, + "rewards/margins": -4.61159086227417, + "rewards/rejected": 6.257747173309326, + "step": 8994 + }, + { + "epoch": 1.46, + "learning_rate": 1.7931148264532516e-06, + "logits/chosen": -0.8946939706802368, + "logits/rejected": -0.8204311728477478, + "logps/chosen": -20.182540893554688, + "logps/rejected": -3.2515692710876465, + "loss": 0.2078, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2292022705078125, + "rewards/margins": 0.6767737865447998, + "rewards/rejected": 0.5524284839630127, + "step": 8995 + }, + { + "epoch": 1.46, + "learning_rate": 1.7921066076447364e-06, + "logits/chosen": -1.8227609395980835, + "logits/rejected": -1.7954353094100952, + "logps/chosen": -82.29948425292969, + "logps/rejected": -22.835296630859375, + "loss": 2.2827, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1805641651153564, + "rewards/margins": 1.170727252960205, + "rewards/rejected": 2.0098369121551514, + "step": 8996 + }, + { + "epoch": 1.46, + "learning_rate": 1.791098610471303e-06, + "logits/chosen": -1.3690558671951294, + "logits/rejected": -1.4294196367263794, + "logps/chosen": -43.61345291137695, + "logps/rejected": -90.49845123291016, + "loss": 1.4888, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2235028743743896, + "rewards/margins": -2.0955212116241455, + "rewards/rejected": 5.319024085998535, + "step": 8997 + }, + { + "epoch": 1.46, + "learning_rate": 1.7900908350025914e-06, + "logits/chosen": -1.1676322221755981, + "logits/rejected": -1.1631343364715576, + "logps/chosen": -81.04490661621094, + "logps/rejected": -108.42071533203125, + "loss": 0.1277, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4391770362854004, + "rewards/margins": 2.2434937953948975, + "rewards/rejected": 0.1956832855939865, + "step": 8998 + }, + { + "epoch": 1.46, + "learning_rate": 1.789083281308232e-06, + "logits/chosen": -1.1052496433258057, + "logits/rejected": -1.0258660316467285, + "logps/chosen": -51.47312927246094, + "logps/rejected": -45.16041564941406, + "loss": 1.3735, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5704476833343506, + "rewards/margins": -0.1947011947631836, + "rewards/rejected": 3.765148878097534, + "step": 8999 + }, + { + "epoch": 1.46, + "learning_rate": 1.7880759494578342e-06, + "logits/chosen": -1.3616985082626343, + "logits/rejected": -1.3301854133605957, + "logps/chosen": -59.89923858642578, + "logps/rejected": -50.20070266723633, + "loss": 0.3108, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4249794483184814, + "rewards/margins": 1.7054675817489624, + "rewards/rejected": 1.719511866569519, + "step": 9000 + }, + { + "epoch": 1.46, + "learning_rate": 1.7870688395209985e-06, + "logits/chosen": -1.2436732053756714, + "logits/rejected": -1.176460862159729, + "logps/chosen": -125.12931823730469, + "logps/rejected": -86.26637268066406, + "loss": 0.0271, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.283219814300537, + "rewards/margins": 3.1040143966674805, + "rewards/rejected": 3.1792054176330566, + "step": 9001 + }, + { + "epoch": 1.46, + "learning_rate": 1.7860619515673034e-06, + "logits/chosen": -1.1681745052337646, + "logits/rejected": -1.1150678396224976, + "logps/chosen": -56.10740280151367, + "logps/rejected": -64.52365112304688, + "loss": 0.4872, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.7876110076904297, + "rewards/margins": -0.26929426193237305, + "rewards/rejected": 4.056905269622803, + "step": 9002 + }, + { + "epoch": 1.46, + "learning_rate": 1.7850552856663184e-06, + "logits/chosen": -1.2340974807739258, + "logits/rejected": -1.089701533317566, + "logps/chosen": -69.16926574707031, + "logps/rejected": -35.373374938964844, + "loss": 0.1166, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.251664876937866, + "rewards/margins": 3.2736072540283203, + "rewards/rejected": -1.021942377090454, + "step": 9003 + }, + { + "epoch": 1.46, + "learning_rate": 1.7840488418875917e-06, + "logits/chosen": -1.3556122779846191, + "logits/rejected": -1.3592026233673096, + "logps/chosen": -46.4450798034668, + "logps/rejected": -75.54961395263672, + "loss": 0.2632, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.076831340789795, + "rewards/margins": 0.4123423099517822, + "rewards/rejected": 3.6644890308380127, + "step": 9004 + }, + { + "epoch": 1.46, + "learning_rate": 1.7830426203006617e-06, + "logits/chosen": -1.6318775415420532, + "logits/rejected": -1.5282431840896606, + "logps/chosen": -118.40037536621094, + "logps/rejected": -49.249427795410156, + "loss": 0.232, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8988358974456787, + "rewards/margins": 0.7817208766937256, + "rewards/rejected": 3.117115020751953, + "step": 9005 + }, + { + "epoch": 1.46, + "learning_rate": 1.7820366209750462e-06, + "logits/chosen": -1.3327714204788208, + "logits/rejected": -1.3327714204788208, + "logps/chosen": -61.11418914794922, + "logps/rejected": -61.11418914794922, + "loss": 0.3634, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0410048961639404, + "rewards/margins": 0.0, + "rewards/rejected": 3.0410048961639404, + "step": 9006 + }, + { + "epoch": 1.46, + "learning_rate": 1.7810308439802532e-06, + "logits/chosen": -1.0449949502944946, + "logits/rejected": -0.9095833897590637, + "logps/chosen": -35.35252380371094, + "logps/rejected": -9.823665618896484, + "loss": 0.1553, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.457990288734436, + "rewards/margins": 1.0758873224258423, + "rewards/rejected": 0.38210296630859375, + "step": 9007 + }, + { + "epoch": 1.46, + "learning_rate": 1.7800252893857683e-06, + "logits/chosen": -1.0508627891540527, + "logits/rejected": -1.0174874067306519, + "logps/chosen": -51.738861083984375, + "logps/rejected": -51.70307922363281, + "loss": 0.348, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7824699878692627, + "rewards/margins": 0.019504547119140625, + "rewards/rejected": 1.762965440750122, + "step": 9008 + }, + { + "epoch": 1.46, + "learning_rate": 1.7790199572610705e-06, + "logits/chosen": -0.8820915222167969, + "logits/rejected": -0.8720375299453735, + "logps/chosen": -11.065656661987305, + "logps/rejected": -6.808569431304932, + "loss": 0.365, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0527511835098267, + "rewards/margins": -0.013916730880737305, + "rewards/rejected": 1.066667914390564, + "step": 9009 + }, + { + "epoch": 1.46, + "learning_rate": 1.7780148476756148e-06, + "logits/chosen": -1.3171961307525635, + "logits/rejected": -1.2579578161239624, + "logps/chosen": -117.0748291015625, + "logps/rejected": -41.52379608154297, + "loss": 0.394, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9734405279159546, + "rewards/margins": 1.652923583984375, + "rewards/rejected": 0.320516973733902, + "step": 9010 + }, + { + "epoch": 1.46, + "learning_rate": 1.7770099606988484e-06, + "logits/chosen": -0.9948843121528625, + "logits/rejected": -1.0742785930633545, + "logps/chosen": -66.19268035888672, + "logps/rejected": -96.27862548828125, + "loss": 0.7533, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.379711151123047, + "rewards/margins": 0.7730109691619873, + "rewards/rejected": 2.6067001819610596, + "step": 9011 + }, + { + "epoch": 1.46, + "learning_rate": 1.776005296400195e-06, + "logits/chosen": -1.1327654123306274, + "logits/rejected": -1.1447012424468994, + "logps/chosen": -29.280174255371094, + "logps/rejected": -65.89323425292969, + "loss": 0.8471, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.438781499862671, + "rewards/margins": -1.2991480827331543, + "rewards/rejected": 3.737929582595825, + "step": 9012 + }, + { + "epoch": 1.46, + "learning_rate": 1.775000854849072e-06, + "logits/chosen": -1.2816176414489746, + "logits/rejected": -1.2858400344848633, + "logps/chosen": -45.823890686035156, + "logps/rejected": -72.0396728515625, + "loss": 0.1341, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.017818450927734, + "rewards/margins": 1.2155075073242188, + "rewards/rejected": 2.8023109436035156, + "step": 9013 + }, + { + "epoch": 1.46, + "learning_rate": 1.773996636114873e-06, + "logits/chosen": -1.2445101737976074, + "logits/rejected": -1.155903935432434, + "logps/chosen": -45.66304397583008, + "logps/rejected": -42.3087043762207, + "loss": 0.1359, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.722474575042725, + "rewards/margins": 1.3790955543518066, + "rewards/rejected": 4.343379020690918, + "step": 9014 + }, + { + "epoch": 1.46, + "learning_rate": 1.772992640266984e-06, + "logits/chosen": -1.5556546449661255, + "logits/rejected": -1.2105286121368408, + "logps/chosen": -114.59837341308594, + "logps/rejected": -101.595947265625, + "loss": 0.0692, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.194140911102295, + "rewards/margins": 2.350090265274048, + "rewards/rejected": 1.844050645828247, + "step": 9015 + }, + { + "epoch": 1.46, + "learning_rate": 1.771988867374767e-06, + "logits/chosen": -1.0634591579437256, + "logits/rejected": -1.0547834634780884, + "logps/chosen": -35.542320251464844, + "logps/rejected": -43.141700744628906, + "loss": 0.4943, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0848617553710938, + "rewards/margins": -0.22908329963684082, + "rewards/rejected": 1.3139450550079346, + "step": 9016 + }, + { + "epoch": 1.46, + "learning_rate": 1.770985317507577e-06, + "logits/chosen": -1.2947150468826294, + "logits/rejected": -1.1867607831954956, + "logps/chosen": -159.26376342773438, + "logps/rejected": -44.11865997314453, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.829339504241943, + "rewards/margins": 4.554007530212402, + "rewards/rejected": 3.275331974029541, + "step": 9017 + }, + { + "epoch": 1.46, + "learning_rate": 1.7699819907347472e-06, + "logits/chosen": -1.1999821662902832, + "logits/rejected": -1.208144187927246, + "logps/chosen": -44.6274528503418, + "logps/rejected": -56.22362518310547, + "loss": 0.6197, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.526435613632202, + "rewards/margins": -0.8106212615966797, + "rewards/rejected": 3.337056875228882, + "step": 9018 + }, + { + "epoch": 1.46, + "learning_rate": 1.7689788871256003e-06, + "logits/chosen": -1.1184180974960327, + "logits/rejected": -1.0901416540145874, + "logps/chosen": -224.11444091796875, + "logps/rejected": -47.213348388671875, + "loss": 0.034, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.545926094055176, + "rewards/margins": 4.6059770584106445, + "rewards/rejected": 3.9399490356445312, + "step": 9019 + }, + { + "epoch": 1.46, + "learning_rate": 1.767976006749439e-06, + "logits/chosen": -1.6193329095840454, + "logits/rejected": -1.492037057876587, + "logps/chosen": -58.29975891113281, + "logps/rejected": -26.979785919189453, + "loss": 0.2278, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3751327991485596, + "rewards/margins": 2.534424304962158, + "rewards/rejected": 0.8407085537910461, + "step": 9020 + }, + { + "epoch": 1.46, + "learning_rate": 1.7669733496755553e-06, + "logits/chosen": -1.1652486324310303, + "logits/rejected": -1.227828025817871, + "logps/chosen": -24.383563995361328, + "logps/rejected": -39.45732879638672, + "loss": 0.9535, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.61546790599823, + "rewards/margins": -1.704149603843689, + "rewards/rejected": 3.319617509841919, + "step": 9021 + }, + { + "epoch": 1.46, + "learning_rate": 1.7659709159732203e-06, + "logits/chosen": -1.3427802324295044, + "logits/rejected": -1.4066166877746582, + "logps/chosen": -49.712825775146484, + "logps/rejected": -46.849708557128906, + "loss": 1.2888, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.350355863571167, + "rewards/margins": -1.6528699398040771, + "rewards/rejected": 4.003225803375244, + "step": 9022 + }, + { + "epoch": 1.46, + "learning_rate": 1.764968705711696e-06, + "logits/chosen": -1.1226855516433716, + "logits/rejected": -1.0549861192703247, + "logps/chosen": -72.30467224121094, + "logps/rejected": -40.42437744140625, + "loss": 0.5269, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8984482288360596, + "rewards/margins": 0.36221015453338623, + "rewards/rejected": 1.5362380743026733, + "step": 9023 + }, + { + "epoch": 1.46, + "learning_rate": 1.7639667189602221e-06, + "logits/chosen": -1.3027127981185913, + "logits/rejected": -1.1717801094055176, + "logps/chosen": -78.56730651855469, + "logps/rejected": -50.89131164550781, + "loss": 0.4708, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3023040294647217, + "rewards/margins": 0.26323914527893066, + "rewards/rejected": 3.039064884185791, + "step": 9024 + }, + { + "epoch": 1.46, + "learning_rate": 1.7629649557880286e-06, + "logits/chosen": -1.3615906238555908, + "logits/rejected": -1.1275622844696045, + "logps/chosen": -90.52001953125, + "logps/rejected": -47.209022521972656, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.742032051086426, + "rewards/margins": 5.980619430541992, + "rewards/rejected": -0.2385871857404709, + "step": 9025 + }, + { + "epoch": 1.47, + "learning_rate": 1.761963416264329e-06, + "logits/chosen": -1.7040786743164062, + "logits/rejected": -1.3225632905960083, + "logps/chosen": -180.22006225585938, + "logps/rejected": -23.819303512573242, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.370633125305176, + "rewards/margins": 7.585312843322754, + "rewards/rejected": 0.7853204607963562, + "step": 9026 + }, + { + "epoch": 1.47, + "learning_rate": 1.7609621004583171e-06, + "logits/chosen": -1.180936336517334, + "logits/rejected": -1.180936336517334, + "logps/chosen": -18.528736114501953, + "logps/rejected": -18.528736114501953, + "loss": 0.6845, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1288117170333862, + "rewards/margins": 0.0, + "rewards/rejected": 1.1288117170333862, + "step": 9027 + }, + { + "epoch": 1.47, + "learning_rate": 1.7599610084391782e-06, + "logits/chosen": -1.1671218872070312, + "logits/rejected": -1.1671218872070312, + "logps/chosen": -30.322011947631836, + "logps/rejected": -30.322011947631836, + "loss": 0.3633, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5538721084594727, + "rewards/margins": 0.0, + "rewards/rejected": 1.5538721084594727, + "step": 9028 + }, + { + "epoch": 1.47, + "learning_rate": 1.7589601402760737e-06, + "logits/chosen": -1.3275190591812134, + "logits/rejected": -1.2978299856185913, + "logps/chosen": -37.951934814453125, + "logps/rejected": -33.79234313964844, + "loss": 0.2455, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.7875075340271, + "rewards/margins": 0.46408843994140625, + "rewards/rejected": 4.323419094085693, + "step": 9029 + }, + { + "epoch": 1.47, + "learning_rate": 1.7579594960381586e-06, + "logits/chosen": -1.3066825866699219, + "logits/rejected": -1.2928889989852905, + "logps/chosen": -53.649803161621094, + "logps/rejected": -75.49375915527344, + "loss": 1.3134, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.230843424797058, + "rewards/margins": -2.5514307022094727, + "rewards/rejected": 3.782274007797241, + "step": 9030 + }, + { + "epoch": 1.47, + "learning_rate": 1.7569590757945637e-06, + "logits/chosen": -1.0009162425994873, + "logits/rejected": -0.8633102178573608, + "logps/chosen": -38.09532928466797, + "logps/rejected": -17.233997344970703, + "loss": 0.1129, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4988198280334473, + "rewards/margins": 3.6572940349578857, + "rewards/rejected": -0.1584741622209549, + "step": 9031 + }, + { + "epoch": 1.47, + "learning_rate": 1.7559588796144127e-06, + "logits/chosen": -1.175202488899231, + "logits/rejected": -1.0562832355499268, + "logps/chosen": -39.902008056640625, + "logps/rejected": -11.496797561645508, + "loss": 0.2154, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7489471435546875, + "rewards/margins": 0.7251454591751099, + "rewards/rejected": 1.0238016843795776, + "step": 9032 + }, + { + "epoch": 1.47, + "learning_rate": 1.7549589075668061e-06, + "logits/chosen": -0.9922573566436768, + "logits/rejected": -0.9922573566436768, + "logps/chosen": -31.414949417114258, + "logps/rejected": -31.414949417114258, + "loss": 0.3591, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.098679304122925, + "rewards/margins": 0.0, + "rewards/rejected": 2.098679304122925, + "step": 9033 + }, + { + "epoch": 1.47, + "learning_rate": 1.753959159720836e-06, + "logits/chosen": -0.9696589112281799, + "logits/rejected": -0.9684863686561584, + "logps/chosen": -1.9762612581253052, + "logps/rejected": -7.767341136932373, + "loss": 1.3481, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.24579186737537384, + "rewards/margins": -0.17082710564136505, + "rewards/rejected": 0.4166189730167389, + "step": 9034 + }, + { + "epoch": 1.47, + "learning_rate": 1.752959636145572e-06, + "logits/chosen": -1.3299387693405151, + "logits/rejected": -1.2351410388946533, + "logps/chosen": -135.72532653808594, + "logps/rejected": -191.14590454101562, + "loss": 0.7997, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.270617961883545, + "rewards/margins": -1.3634428977966309, + "rewards/rejected": 5.634060859680176, + "step": 9035 + }, + { + "epoch": 1.47, + "learning_rate": 1.7519603369100747e-06, + "logits/chosen": -1.1808323860168457, + "logits/rejected": -1.1830157041549683, + "logps/chosen": -12.202974319458008, + "logps/rejected": -2.621922016143799, + "loss": 0.4796, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.21263790130615234, + "rewards/margins": -0.06140398979187012, + "rewards/rejected": 0.27404189109802246, + "step": 9036 + }, + { + "epoch": 1.47, + "learning_rate": 1.7509612620833833e-06, + "logits/chosen": -1.2230724096298218, + "logits/rejected": -1.236855149269104, + "logps/chosen": -61.868141174316406, + "logps/rejected": -83.83784484863281, + "loss": 0.2303, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8522698879241943, + "rewards/margins": 0.5867822170257568, + "rewards/rejected": 2.2654876708984375, + "step": 9037 + }, + { + "epoch": 1.47, + "learning_rate": 1.7499624117345276e-06, + "logits/chosen": -1.3429421186447144, + "logits/rejected": -1.0991408824920654, + "logps/chosen": -141.99160766601562, + "logps/rejected": -14.369430541992188, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.502124309539795, + "rewards/margins": 4.242278099060059, + "rewards/rejected": 1.2598459720611572, + "step": 9038 + }, + { + "epoch": 1.47, + "learning_rate": 1.7489637859325154e-06, + "logits/chosen": -1.7608474493026733, + "logits/rejected": -1.6933399438858032, + "logps/chosen": -53.38887023925781, + "logps/rejected": -68.08712005615234, + "loss": 1.2504, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3835160732269287, + "rewards/margins": -2.3911077976226807, + "rewards/rejected": 5.774623870849609, + "step": 9039 + }, + { + "epoch": 1.47, + "learning_rate": 1.7479653847463458e-06, + "logits/chosen": -1.0593127012252808, + "logits/rejected": -1.0614278316497803, + "logps/chosen": -6.346628189086914, + "logps/rejected": -3.242774248123169, + "loss": 0.3638, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3720564842224121, + "rewards/margins": 0.005029022693634033, + "rewards/rejected": 0.3670274615287781, + "step": 9040 + }, + { + "epoch": 1.47, + "learning_rate": 1.7469672082449952e-06, + "logits/chosen": -1.5944173336029053, + "logits/rejected": -1.4840389490127563, + "logps/chosen": -144.9376220703125, + "logps/rejected": -94.44044494628906, + "loss": 0.3457, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.979789733886719, + "rewards/margins": 2.3666348457336426, + "rewards/rejected": 4.613154888153076, + "step": 9041 + }, + { + "epoch": 1.47, + "learning_rate": 1.7459692564974317e-06, + "logits/chosen": -1.0587773323059082, + "logits/rejected": -1.106834053993225, + "logps/chosen": -38.44725036621094, + "logps/rejected": -87.84590148925781, + "loss": 0.2729, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9097126722335815, + "rewards/margins": 0.3262333869934082, + "rewards/rejected": 1.5834792852401733, + "step": 9042 + }, + { + "epoch": 1.47, + "learning_rate": 1.7449715295726017e-06, + "logits/chosen": -1.3948355913162231, + "logits/rejected": -1.399461030960083, + "logps/chosen": -113.62175750732422, + "logps/rejected": -110.18824768066406, + "loss": 0.3603, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.893644094467163, + "rewards/margins": -0.048863887786865234, + "rewards/rejected": 2.9425079822540283, + "step": 9043 + }, + { + "epoch": 1.47, + "learning_rate": 1.7439740275394406e-06, + "logits/chosen": -1.4245811700820923, + "logits/rejected": -1.5525609254837036, + "logps/chosen": -90.62468719482422, + "logps/rejected": -138.69320678710938, + "loss": 3.645, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.271334171295166, + "rewards/margins": -6.84972620010376, + "rewards/rejected": 11.121060371398926, + "step": 9044 + }, + { + "epoch": 1.47, + "learning_rate": 1.7429767504668638e-06, + "logits/chosen": -1.2397314310073853, + "logits/rejected": -1.032357096672058, + "logps/chosen": -84.39530181884766, + "logps/rejected": -45.2086296081543, + "loss": 0.2715, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.749099016189575, + "rewards/margins": 1.6009529829025269, + "rewards/rejected": 1.1481460332870483, + "step": 9045 + }, + { + "epoch": 1.47, + "learning_rate": 1.741979698423777e-06, + "logits/chosen": -1.1624267101287842, + "logits/rejected": -0.9389581084251404, + "logps/chosen": -95.93485260009766, + "logps/rejected": -46.603721618652344, + "loss": 0.8287, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.052828311920166, + "rewards/margins": 2.915637969970703, + "rewards/rejected": 0.13719025254249573, + "step": 9046 + }, + { + "epoch": 1.47, + "learning_rate": 1.7409828714790638e-06, + "logits/chosen": -1.2044299840927124, + "logits/rejected": -1.197838306427002, + "logps/chosen": -91.48504638671875, + "logps/rejected": -95.01409149169922, + "loss": 1.1171, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.167952060699463, + "rewards/margins": -2.1044974327087402, + "rewards/rejected": 4.272449493408203, + "step": 9047 + }, + { + "epoch": 1.47, + "learning_rate": 1.7399862697015984e-06, + "logits/chosen": -1.3195571899414062, + "logits/rejected": -1.1895558834075928, + "logps/chosen": -41.77569580078125, + "logps/rejected": -60.499263763427734, + "loss": 0.7871, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8915131092071533, + "rewards/margins": 0.5469799041748047, + "rewards/rejected": 2.3445332050323486, + "step": 9048 + }, + { + "epoch": 1.47, + "learning_rate": 1.738989893160234e-06, + "logits/chosen": -1.4602656364440918, + "logits/rejected": -1.541649580001831, + "logps/chosen": -151.16421508789062, + "logps/rejected": -117.01519012451172, + "loss": 2.1655, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4166810512542725, + "rewards/margins": -4.317292213439941, + "rewards/rejected": 7.733973026275635, + "step": 9049 + }, + { + "epoch": 1.47, + "learning_rate": 1.7379937419238135e-06, + "logits/chosen": -1.2301058769226074, + "logits/rejected": -1.1853916645050049, + "logps/chosen": -69.39149475097656, + "logps/rejected": -90.92115783691406, + "loss": 0.1013, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.035233974456787, + "rewards/margins": 1.7487821578979492, + "rewards/rejected": 5.286451816558838, + "step": 9050 + }, + { + "epoch": 1.47, + "learning_rate": 1.7369978160611583e-06, + "logits/chosen": -0.9551705121994019, + "logits/rejected": -0.9762382507324219, + "logps/chosen": -21.666336059570312, + "logps/rejected": -40.18556594848633, + "loss": 0.9994, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9172129034996033, + "rewards/margins": 0.44114744663238525, + "rewards/rejected": 0.476065456867218, + "step": 9051 + }, + { + "epoch": 1.47, + "learning_rate": 1.736002115641081e-06, + "logits/chosen": -1.1152019500732422, + "logits/rejected": -1.0927870273590088, + "logps/chosen": -84.16021728515625, + "logps/rejected": -49.7549934387207, + "loss": 0.0359, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.371806621551514, + "rewards/margins": 2.8525807857513428, + "rewards/rejected": 2.519225835800171, + "step": 9052 + }, + { + "epoch": 1.47, + "learning_rate": 1.7350066407323719e-06, + "logits/chosen": -1.3691939115524292, + "logits/rejected": -1.0526453256607056, + "logps/chosen": -117.36605834960938, + "logps/rejected": -109.37092590332031, + "loss": 0.0572, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.358188152313232, + "rewards/margins": 2.902409553527832, + "rewards/rejected": 3.4557785987854004, + "step": 9053 + }, + { + "epoch": 1.47, + "learning_rate": 1.7340113914038115e-06, + "logits/chosen": -1.0807896852493286, + "logits/rejected": -1.052589774131775, + "logps/chosen": -40.92681884765625, + "logps/rejected": -42.72183609008789, + "loss": 0.6868, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0199432373046875, + "rewards/margins": -0.9418423175811768, + "rewards/rejected": 2.9617855548858643, + "step": 9054 + }, + { + "epoch": 1.47, + "learning_rate": 1.7330163677241591e-06, + "logits/chosen": -1.260894775390625, + "logits/rejected": -1.2712070941925049, + "logps/chosen": -59.219573974609375, + "logps/rejected": -70.22557067871094, + "loss": 0.8614, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8485153913497925, + "rewards/margins": -0.6183356046676636, + "rewards/rejected": 2.466850996017456, + "step": 9055 + }, + { + "epoch": 1.47, + "learning_rate": 1.732021569762165e-06, + "logits/chosen": -1.4600056409835815, + "logits/rejected": -1.211680293083191, + "logps/chosen": -117.93099212646484, + "logps/rejected": -27.310832977294922, + "loss": 0.0707, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4508094787597656, + "rewards/margins": 1.8866634368896484, + "rewards/rejected": 0.5641460418701172, + "step": 9056 + }, + { + "epoch": 1.47, + "learning_rate": 1.7310269975865574e-06, + "logits/chosen": -1.5611817836761475, + "logits/rejected": -1.6160558462142944, + "logps/chosen": -144.59584045410156, + "logps/rejected": -197.73985290527344, + "loss": 0.7447, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.168529033660889, + "rewards/margins": -1.218822956085205, + "rewards/rejected": 8.387351989746094, + "step": 9057 + }, + { + "epoch": 1.47, + "learning_rate": 1.7300326512660542e-06, + "logits/chosen": -1.5567505359649658, + "logits/rejected": -1.5345219373703003, + "logps/chosen": -32.34640884399414, + "logps/rejected": -11.313558578491211, + "loss": 0.124, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8447315692901611, + "rewards/margins": 1.4935498237609863, + "rewards/rejected": 0.3511818051338196, + "step": 9058 + }, + { + "epoch": 1.47, + "learning_rate": 1.7290385308693526e-06, + "logits/chosen": -1.0645431280136108, + "logits/rejected": -1.0645431280136108, + "logps/chosen": -2.7197940349578857, + "logps/rejected": -2.7197940349578857, + "loss": 0.9494, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4111459255218506, + "rewards/margins": 0.0, + "rewards/rejected": 1.4111459255218506, + "step": 9059 + }, + { + "epoch": 1.47, + "learning_rate": 1.7280446364651377e-06, + "logits/chosen": -1.3551868200302124, + "logits/rejected": -1.3365933895111084, + "logps/chosen": -76.93499755859375, + "logps/rejected": -47.185157775878906, + "loss": 0.1221, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3716819286346436, + "rewards/margins": 1.3407676219940186, + "rewards/rejected": 2.030914306640625, + "step": 9060 + }, + { + "epoch": 1.47, + "learning_rate": 1.727050968122081e-06, + "logits/chosen": -1.2504788637161255, + "logits/rejected": -1.2158678770065308, + "logps/chosen": -89.47959899902344, + "logps/rejected": -66.30448913574219, + "loss": 0.2762, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.895860433578491, + "rewards/margins": 0.5606338977813721, + "rewards/rejected": 2.335226535797119, + "step": 9061 + }, + { + "epoch": 1.47, + "learning_rate": 1.726057525908832e-06, + "logits/chosen": -1.2323594093322754, + "logits/rejected": -1.1397207975387573, + "logps/chosen": -97.24928283691406, + "logps/rejected": -77.81771087646484, + "loss": 0.1079, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.2275896072387695, + "rewards/margins": 2.0880091190338135, + "rewards/rejected": 2.139580488204956, + "step": 9062 + }, + { + "epoch": 1.47, + "learning_rate": 1.7250643098940312e-06, + "logits/chosen": -0.9890604019165039, + "logits/rejected": -0.974246621131897, + "logps/chosen": -66.1439437866211, + "logps/rejected": -90.14476013183594, + "loss": 0.2901, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.572821855545044, + "rewards/margins": 0.6666824817657471, + "rewards/rejected": 1.9061393737792969, + "step": 9063 + }, + { + "epoch": 1.47, + "learning_rate": 1.7240713201462973e-06, + "logits/chosen": -1.331512212753296, + "logits/rejected": -1.331512212753296, + "logps/chosen": -29.887062072753906, + "logps/rejected": -29.887062072753906, + "loss": 0.6447, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.92305850982666, + "rewards/margins": 0.0, + "rewards/rejected": 4.92305850982666, + "step": 9064 + }, + { + "epoch": 1.47, + "learning_rate": 1.7230785567342395e-06, + "logits/chosen": -1.3114285469055176, + "logits/rejected": -1.1569653749465942, + "logps/chosen": -99.23820495605469, + "logps/rejected": -58.33421325683594, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.218593120574951, + "rewards/margins": 4.771862983703613, + "rewards/rejected": 2.446730136871338, + "step": 9065 + }, + { + "epoch": 1.47, + "learning_rate": 1.7220860197264449e-06, + "logits/chosen": -1.1168997287750244, + "logits/rejected": -1.085070013999939, + "logps/chosen": -30.993637084960938, + "logps/rejected": -50.347511291503906, + "loss": 1.2361, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0938775539398193, + "rewards/margins": -0.382598876953125, + "rewards/rejected": 3.4764764308929443, + "step": 9066 + }, + { + "epoch": 1.47, + "learning_rate": 1.721093709191493e-06, + "logits/chosen": -1.3561155796051025, + "logits/rejected": -1.4307771921157837, + "logps/chosen": -75.37171936035156, + "logps/rejected": -97.50383758544922, + "loss": 0.4185, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.8894119262695312, + "rewards/margins": -0.20765304565429688, + "rewards/rejected": 4.097064971923828, + "step": 9067 + }, + { + "epoch": 1.47, + "learning_rate": 1.7201016251979397e-06, + "logits/chosen": -1.7355940341949463, + "logits/rejected": -1.682137370109558, + "logps/chosen": -53.190277099609375, + "logps/rejected": -34.29232406616211, + "loss": 0.116, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.527391195297241, + "rewards/margins": 3.1874265670776367, + "rewards/rejected": 0.33996468782424927, + "step": 9068 + }, + { + "epoch": 1.47, + "learning_rate": 1.7191097678143298e-06, + "logits/chosen": -1.6454049348831177, + "logits/rejected": -1.6742616891860962, + "logps/chosen": -70.26630401611328, + "logps/rejected": -94.69844055175781, + "loss": 1.664, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.525302410125732, + "rewards/margins": -0.7519359588623047, + "rewards/rejected": 6.277238368988037, + "step": 9069 + }, + { + "epoch": 1.47, + "learning_rate": 1.7181181371091893e-06, + "logits/chosen": -1.3262015581130981, + "logits/rejected": -1.3262015581130981, + "logps/chosen": -64.8961181640625, + "logps/rejected": -64.8961181640625, + "loss": 0.3491, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.991454601287842, + "rewards/margins": 0.0, + "rewards/rejected": 4.991454601287842, + "step": 9070 + }, + { + "epoch": 1.47, + "learning_rate": 1.7171267331510333e-06, + "logits/chosen": -1.2240488529205322, + "logits/rejected": -1.171105146408081, + "logps/chosen": -51.527565002441406, + "logps/rejected": -83.02181243896484, + "loss": 0.4132, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7193963527679443, + "rewards/margins": -0.2500457763671875, + "rewards/rejected": 2.969442129135132, + "step": 9071 + }, + { + "epoch": 1.47, + "learning_rate": 1.716135556008356e-06, + "logits/chosen": -1.0712381601333618, + "logits/rejected": -1.1961233615875244, + "logps/chosen": -86.20503234863281, + "logps/rejected": -125.21656799316406, + "loss": 0.9061, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.192195177078247, + "rewards/margins": -1.6191527843475342, + "rewards/rejected": 4.811347961425781, + "step": 9072 + }, + { + "epoch": 1.47, + "learning_rate": 1.7151446057496407e-06, + "logits/chosen": -1.0480951070785522, + "logits/rejected": -1.1782547235488892, + "logps/chosen": -84.83019256591797, + "logps/rejected": -100.3647232055664, + "loss": 1.4111, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1818931102752686, + "rewards/margins": -2.7274081707000732, + "rewards/rejected": 5.909301280975342, + "step": 9073 + }, + { + "epoch": 1.47, + "learning_rate": 1.7141538824433506e-06, + "logits/chosen": -1.233984112739563, + "logits/rejected": -1.29415762424469, + "logps/chosen": -139.99853515625, + "logps/rejected": -128.49143981933594, + "loss": 1.6963, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.715057373046875, + "rewards/margins": -2.586801528930664, + "rewards/rejected": 8.301858901977539, + "step": 9074 + }, + { + "epoch": 1.47, + "learning_rate": 1.7131633861579373e-06, + "logits/chosen": -1.4904742240905762, + "logits/rejected": -1.4828476905822754, + "logps/chosen": -27.808576583862305, + "logps/rejected": -39.9853515625, + "loss": 0.6943, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5013034343719482, + "rewards/margins": -1.0317273139953613, + "rewards/rejected": 3.5330307483673096, + "step": 9075 + }, + { + "epoch": 1.47, + "learning_rate": 1.7121731169618322e-06, + "logits/chosen": -1.2032889127731323, + "logits/rejected": -1.0710830688476562, + "logps/chosen": -49.213829040527344, + "logps/rejected": -47.78816604614258, + "loss": 0.9294, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1117217540740967, + "rewards/margins": 0.5812281370162964, + "rewards/rejected": 1.5304936170578003, + "step": 9076 + }, + { + "epoch": 1.47, + "learning_rate": 1.711183074923457e-06, + "logits/chosen": -1.2368711233139038, + "logits/rejected": -1.283240795135498, + "logps/chosen": -25.019132614135742, + "logps/rejected": -62.4068603515625, + "loss": 2.9641, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5904452800750732, + "rewards/margins": -1.4581844806671143, + "rewards/rejected": 4.0486297607421875, + "step": 9077 + }, + { + "epoch": 1.47, + "learning_rate": 1.7101932601112104e-06, + "logits/chosen": -1.0950568914413452, + "logits/rejected": -1.0714762210845947, + "logps/chosen": -84.85685729980469, + "logps/rejected": -62.13494110107422, + "loss": 0.2287, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5015313625335693, + "rewards/margins": 0.8232896327972412, + "rewards/rejected": 1.6782417297363281, + "step": 9078 + }, + { + "epoch": 1.47, + "learning_rate": 1.7092036725934824e-06, + "logits/chosen": -1.348458170890808, + "logits/rejected": -1.2736159563064575, + "logps/chosen": -57.581787109375, + "logps/rejected": -37.65016174316406, + "loss": 0.4819, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.289588212966919, + "rewards/margins": -0.47205591201782227, + "rewards/rejected": 2.761644124984741, + "step": 9079 + }, + { + "epoch": 1.47, + "learning_rate": 1.7082143124386414e-06, + "logits/chosen": -1.3273255825042725, + "logits/rejected": -1.3361810445785522, + "logps/chosen": -29.63237762451172, + "logps/rejected": -51.32421112060547, + "loss": 0.7116, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.612112045288086, + "rewards/margins": -0.5648763179779053, + "rewards/rejected": 2.176988363265991, + "step": 9080 + }, + { + "epoch": 1.47, + "learning_rate": 1.7072251797150441e-06, + "logits/chosen": -1.1374906301498413, + "logits/rejected": -1.1783323287963867, + "logps/chosen": -69.25992584228516, + "logps/rejected": -110.82360076904297, + "loss": 0.272, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6574578285217285, + "rewards/margins": 1.636971354484558, + "rewards/rejected": 1.0204864740371704, + "step": 9081 + }, + { + "epoch": 1.47, + "learning_rate": 1.7062362744910321e-06, + "logits/chosen": -1.2614489793777466, + "logits/rejected": -1.1667991876602173, + "logps/chosen": -113.04281616210938, + "logps/rejected": -25.004093170166016, + "loss": 2.2655, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1098556518554688, + "rewards/margins": 0.09345650672912598, + "rewards/rejected": 2.0163991451263428, + "step": 9082 + }, + { + "epoch": 1.47, + "learning_rate": 1.7052475968349264e-06, + "logits/chosen": -1.035373568534851, + "logits/rejected": -1.0172456502914429, + "logps/chosen": -77.06273651123047, + "logps/rejected": -86.85992431640625, + "loss": 1.9013, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0245697498321533, + "rewards/margins": 1.694536566734314, + "rewards/rejected": 1.3300331830978394, + "step": 9083 + }, + { + "epoch": 1.47, + "learning_rate": 1.7042591468150377e-06, + "logits/chosen": -1.4884144067764282, + "logits/rejected": -1.640838384628296, + "logps/chosen": -38.909908294677734, + "logps/rejected": -119.55905151367188, + "loss": 3.1437, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7025349140167236, + "rewards/margins": -4.266813278198242, + "rewards/rejected": 6.969348430633545, + "step": 9084 + }, + { + "epoch": 1.47, + "learning_rate": 1.7032709244996559e-06, + "logits/chosen": -1.425540804862976, + "logits/rejected": -1.3255372047424316, + "logps/chosen": -82.19046783447266, + "logps/rejected": -25.916093826293945, + "loss": 0.2699, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.142613172531128, + "rewards/margins": 1.5017476081848145, + "rewards/rejected": 0.6408655047416687, + "step": 9085 + }, + { + "epoch": 1.47, + "learning_rate": 1.7022829299570608e-06, + "logits/chosen": -1.4713116884231567, + "logits/rejected": -1.4040186405181885, + "logps/chosen": -167.54010009765625, + "logps/rejected": -82.82675170898438, + "loss": 0.0355, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.775086879730225, + "rewards/margins": 2.9594907760620117, + "rewards/rejected": 4.815596103668213, + "step": 9086 + }, + { + "epoch": 1.47, + "learning_rate": 1.7012951632555103e-06, + "logits/chosen": -1.2967233657836914, + "logits/rejected": -1.2372431755065918, + "logps/chosen": -76.1488265991211, + "logps/rejected": -39.71192169189453, + "loss": 0.4064, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7174575328826904, + "rewards/margins": 2.063620090484619, + "rewards/rejected": 1.6538375616073608, + "step": 9087 + }, + { + "epoch": 1.48, + "learning_rate": 1.7003076244632533e-06, + "logits/chosen": -0.9832857251167297, + "logits/rejected": -0.9832857251167297, + "logps/chosen": -32.424224853515625, + "logps/rejected": -32.424224853515625, + "loss": 0.5586, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5426387786865234, + "rewards/margins": 0.0, + "rewards/rejected": 1.5426387786865234, + "step": 9088 + }, + { + "epoch": 1.48, + "learning_rate": 1.6993203136485154e-06, + "logits/chosen": -1.4994653463363647, + "logits/rejected": -1.5834429264068604, + "logps/chosen": -67.08992004394531, + "logps/rejected": -93.69834899902344, + "loss": 0.956, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3802437782287598, + "rewards/margins": -1.6911230087280273, + "rewards/rejected": 4.071366786956787, + "step": 9089 + }, + { + "epoch": 1.48, + "learning_rate": 1.6983332308795147e-06, + "logits/chosen": -1.432438611984253, + "logits/rejected": -1.480724811553955, + "logps/chosen": -132.953125, + "logps/rejected": -84.51858520507812, + "loss": 0.2629, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.331714153289795, + "rewards/margins": 0.4038543701171875, + "rewards/rejected": 5.927859783172607, + "step": 9090 + }, + { + "epoch": 1.48, + "learning_rate": 1.6973463762244452e-06, + "logits/chosen": -1.1825393438339233, + "logits/rejected": -1.3151122331619263, + "logps/chosen": -77.478515625, + "logps/rejected": -109.07769775390625, + "loss": 1.6308, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.8661141395568848, + "rewards/margins": -3.2103567123413086, + "rewards/rejected": 7.076470851898193, + "step": 9091 + }, + { + "epoch": 1.48, + "learning_rate": 1.6963597497514927e-06, + "logits/chosen": -0.9891722798347473, + "logits/rejected": -0.9896097779273987, + "logps/chosen": -52.35694885253906, + "logps/rejected": -53.468868255615234, + "loss": 1.1832, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6234681606292725, + "rewards/margins": 0.24436163902282715, + "rewards/rejected": 2.3791065216064453, + "step": 9092 + }, + { + "epoch": 1.48, + "learning_rate": 1.6953733515288201e-06, + "logits/chosen": -0.7781296968460083, + "logits/rejected": -0.7696087956428528, + "logps/chosen": -48.33035659790039, + "logps/rejected": -38.3043327331543, + "loss": 0.3447, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8306980133056641, + "rewards/margins": 0.22629278898239136, + "rewards/rejected": 0.6044052243232727, + "step": 9093 + }, + { + "epoch": 1.48, + "learning_rate": 1.6943871816245826e-06, + "logits/chosen": -0.9722432494163513, + "logits/rejected": -1.0031615495681763, + "logps/chosen": -68.69757080078125, + "logps/rejected": -60.74470138549805, + "loss": 0.4498, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.664031982421875, + "rewards/margins": -0.20137667655944824, + "rewards/rejected": 2.8654086589813232, + "step": 9094 + }, + { + "epoch": 1.48, + "learning_rate": 1.6934012401069106e-06, + "logits/chosen": -1.2430120706558228, + "logits/rejected": -1.0632764101028442, + "logps/chosen": -70.16425323486328, + "logps/rejected": -7.249944686889648, + "loss": 0.2855, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.304184675216675, + "rewards/margins": 1.393341064453125, + "rewards/rejected": 0.9108436703681946, + "step": 9095 + }, + { + "epoch": 1.48, + "learning_rate": 1.6924155270439273e-06, + "logits/chosen": -1.4431995153427124, + "logits/rejected": -1.4359283447265625, + "logps/chosen": -71.9145736694336, + "logps/rejected": -53.909088134765625, + "loss": 0.5038, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1525399684906006, + "rewards/margins": 0.11940312385559082, + "rewards/rejected": 2.0331368446350098, + "step": 9096 + }, + { + "epoch": 1.48, + "learning_rate": 1.6914300425037334e-06, + "logits/chosen": -1.3200780153274536, + "logits/rejected": -1.3006521463394165, + "logps/chosen": -56.276451110839844, + "logps/rejected": -43.70610046386719, + "loss": 1.1955, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8617867231369019, + "rewards/margins": 0.5769566297531128, + "rewards/rejected": 1.284830093383789, + "step": 9097 + }, + { + "epoch": 1.48, + "learning_rate": 1.6904447865544188e-06, + "logits/chosen": -1.5810530185699463, + "logits/rejected": -1.564300298690796, + "logps/chosen": -41.01984405517578, + "logps/rejected": -55.55543518066406, + "loss": 0.2807, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6729302406311035, + "rewards/margins": 0.5531508922576904, + "rewards/rejected": 3.119779348373413, + "step": 9098 + }, + { + "epoch": 1.48, + "learning_rate": 1.6894597592640527e-06, + "logits/chosen": -1.3016855716705322, + "logits/rejected": -1.3323270082473755, + "logps/chosen": -69.27587127685547, + "logps/rejected": -48.53661346435547, + "loss": 0.3175, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7854058742523193, + "rewards/margins": 1.1096253395080566, + "rewards/rejected": 1.6757805347442627, + "step": 9099 + }, + { + "epoch": 1.48, + "learning_rate": 1.6884749607006938e-06, + "logits/chosen": -1.077014446258545, + "logits/rejected": -1.0934088230133057, + "logps/chosen": -72.5926513671875, + "logps/rejected": -130.9987030029297, + "loss": 0.2034, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.007709503173828, + "rewards/margins": 0.7510322332382202, + "rewards/rejected": 1.256677269935608, + "step": 9100 + }, + { + "epoch": 1.48, + "learning_rate": 1.6874903909323798e-06, + "logits/chosen": -0.8307816982269287, + "logits/rejected": -0.8361820578575134, + "logps/chosen": -8.347458839416504, + "logps/rejected": -2.6770212650299072, + "loss": 0.5202, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.26687002182006836, + "rewards/margins": -0.006752282381057739, + "rewards/rejected": 0.2736223042011261, + "step": 9101 + }, + { + "epoch": 1.48, + "learning_rate": 1.6865060500271385e-06, + "logits/chosen": -1.1817359924316406, + "logits/rejected": -1.164902687072754, + "logps/chosen": -14.290990829467773, + "logps/rejected": -0.5193482041358948, + "loss": 1.3643, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5994853973388672, + "rewards/margins": 0.46634364128112793, + "rewards/rejected": 0.13314174115657806, + "step": 9102 + }, + { + "epoch": 1.48, + "learning_rate": 1.6855219380529742e-06, + "logits/chosen": -1.2749590873718262, + "logits/rejected": -1.3107987642288208, + "logps/chosen": -50.98283386230469, + "logps/rejected": -45.50062942504883, + "loss": 1.1798, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.377487897872925, + "rewards/margins": -2.2556750774383545, + "rewards/rejected": 4.633162975311279, + "step": 9103 + }, + { + "epoch": 1.48, + "learning_rate": 1.6845380550778845e-06, + "logits/chosen": -1.43463134765625, + "logits/rejected": -1.4593908786773682, + "logps/chosen": -96.00325012207031, + "logps/rejected": -147.18495178222656, + "loss": 2.2704, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.622062683105469, + "rewards/margins": -0.6208648681640625, + "rewards/rejected": 7.242927551269531, + "step": 9104 + }, + { + "epoch": 1.48, + "learning_rate": 1.683554401169842e-06, + "logits/chosen": -1.2974565029144287, + "logits/rejected": -1.3728535175323486, + "logps/chosen": -132.63571166992188, + "logps/rejected": -141.6083984375, + "loss": 1.687, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.621328830718994, + "rewards/margins": -0.03488779067993164, + "rewards/rejected": 7.656216621398926, + "step": 9105 + }, + { + "epoch": 1.48, + "learning_rate": 1.6825709763968112e-06, + "logits/chosen": -1.3442133665084839, + "logits/rejected": -1.2728662490844727, + "logps/chosen": -53.00794219970703, + "logps/rejected": -7.156529426574707, + "loss": 0.2896, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.884519338607788, + "rewards/margins": 3.354137897491455, + "rewards/rejected": 0.5303813219070435, + "step": 9106 + }, + { + "epoch": 1.48, + "learning_rate": 1.681587780826735e-06, + "logits/chosen": -0.8635769486427307, + "logits/rejected": -0.8635769486427307, + "logps/chosen": -36.588592529296875, + "logps/rejected": -36.588592529296875, + "loss": 0.4776, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0748612880706787, + "rewards/margins": 0.0, + "rewards/rejected": 2.0748612880706787, + "step": 9107 + }, + { + "epoch": 1.48, + "learning_rate": 1.6806048145275456e-06, + "logits/chosen": -1.3900121450424194, + "logits/rejected": -1.3636934757232666, + "logps/chosen": -61.57144546508789, + "logps/rejected": -80.94987487792969, + "loss": 0.6326, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2003536224365234, + "rewards/margins": -0.8168613910675049, + "rewards/rejected": 3.0172150135040283, + "step": 9108 + }, + { + "epoch": 1.48, + "learning_rate": 1.6796220775671535e-06, + "logits/chosen": -1.452613115310669, + "logits/rejected": -1.48796546459198, + "logps/chosen": -77.60157775878906, + "logps/rejected": -99.83635711669922, + "loss": 4.1843, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.021909475326538, + "rewards/margins": -1.0620925426483154, + "rewards/rejected": 4.0840020179748535, + "step": 9109 + }, + { + "epoch": 1.48, + "learning_rate": 1.6786395700134594e-06, + "logits/chosen": -1.3378688097000122, + "logits/rejected": -1.2800662517547607, + "logps/chosen": -54.567352294921875, + "logps/rejected": -27.667362213134766, + "loss": 0.8722, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5454552173614502, + "rewards/margins": -0.6212146282196045, + "rewards/rejected": 2.1666698455810547, + "step": 9110 + }, + { + "epoch": 1.48, + "learning_rate": 1.6776572919343426e-06, + "logits/chosen": -1.1171499490737915, + "logits/rejected": -1.0901652574539185, + "logps/chosen": -72.63667297363281, + "logps/rejected": -61.32038879394531, + "loss": 0.9327, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.528337836265564, + "rewards/margins": -1.6858781576156616, + "rewards/rejected": 3.2142159938812256, + "step": 9111 + }, + { + "epoch": 1.48, + "learning_rate": 1.6766752433976725e-06, + "logits/chosen": -1.5469955205917358, + "logits/rejected": -1.551048994064331, + "logps/chosen": -102.40599822998047, + "logps/rejected": -46.87986755371094, + "loss": 0.5119, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4353890419006348, + "rewards/margins": 0.027808427810668945, + "rewards/rejected": 2.407580614089966, + "step": 9112 + }, + { + "epoch": 1.48, + "learning_rate": 1.6756934244712952e-06, + "logits/chosen": -1.2562228441238403, + "logits/rejected": -1.3018471002578735, + "logps/chosen": -34.77516555786133, + "logps/rejected": -68.41488647460938, + "loss": 0.4379, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1812245845794678, + "rewards/margins": 0.0060994625091552734, + "rewards/rejected": 2.1751251220703125, + "step": 9113 + }, + { + "epoch": 1.48, + "learning_rate": 1.6747118352230495e-06, + "logits/chosen": -1.2100651264190674, + "logits/rejected": -1.1081408262252808, + "logps/chosen": -109.202392578125, + "logps/rejected": -65.20744323730469, + "loss": 0.3522, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.966879367828369, + "rewards/margins": 0.5440125465393066, + "rewards/rejected": 3.4228668212890625, + "step": 9114 + }, + { + "epoch": 1.48, + "learning_rate": 1.6737304757207502e-06, + "logits/chosen": -1.2419934272766113, + "logits/rejected": -1.293189525604248, + "logps/chosen": -62.44861602783203, + "logps/rejected": -57.78261184692383, + "loss": 1.3761, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.650728702545166, + "rewards/margins": -2.6594438552856445, + "rewards/rejected": 5.3101725578308105, + "step": 9115 + }, + { + "epoch": 1.48, + "learning_rate": 1.6727493460322014e-06, + "logits/chosen": -0.8169322609901428, + "logits/rejected": -0.8169322609901428, + "logps/chosen": -48.626556396484375, + "logps/rejected": -48.626556396484375, + "loss": 0.6277, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.820425510406494, + "rewards/margins": 0.0, + "rewards/rejected": 2.820425510406494, + "step": 9116 + }, + { + "epoch": 1.48, + "learning_rate": 1.671768446225192e-06, + "logits/chosen": -1.053517460823059, + "logits/rejected": -0.9358542561531067, + "logps/chosen": -31.494361877441406, + "logps/rejected": -59.634361267089844, + "loss": 0.2901, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4126293659210205, + "rewards/margins": 2.175865888595581, + "rewards/rejected": 0.2367633879184723, + "step": 9117 + }, + { + "epoch": 1.48, + "learning_rate": 1.670787776367489e-06, + "logits/chosen": -1.2720130681991577, + "logits/rejected": -1.2866981029510498, + "logps/chosen": -106.07856750488281, + "logps/rejected": -114.53030395507812, + "loss": 0.2402, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5859086513519287, + "rewards/margins": 0.4864685535430908, + "rewards/rejected": 2.099440097808838, + "step": 9118 + }, + { + "epoch": 1.48, + "learning_rate": 1.6698073365268507e-06, + "logits/chosen": -1.4176175594329834, + "logits/rejected": -1.4667693376541138, + "logps/chosen": -91.9046401977539, + "logps/rejected": -82.40413665771484, + "loss": 0.3221, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2786240577697754, + "rewards/margins": 1.0144768953323364, + "rewards/rejected": 1.264147162437439, + "step": 9119 + }, + { + "epoch": 1.48, + "learning_rate": 1.6688271267710138e-06, + "logits/chosen": -1.2676289081573486, + "logits/rejected": -1.2815254926681519, + "logps/chosen": -67.26483154296875, + "logps/rejected": -101.43679809570312, + "loss": 0.3431, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.935713291168213, + "rewards/margins": 0.05722522735595703, + "rewards/rejected": 4.878488063812256, + "step": 9120 + }, + { + "epoch": 1.48, + "learning_rate": 1.6678471471677044e-06, + "logits/chosen": -1.0884554386138916, + "logits/rejected": -1.1126254796981812, + "logps/chosen": -58.676666259765625, + "logps/rejected": -102.87577819824219, + "loss": 0.3211, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3101555109024048, + "rewards/margins": 0.18671345710754395, + "rewards/rejected": 1.1234420537948608, + "step": 9121 + }, + { + "epoch": 1.48, + "learning_rate": 1.6668673977846255e-06, + "logits/chosen": -1.2149730920791626, + "logits/rejected": -1.2611147165298462, + "logps/chosen": -48.04322052001953, + "logps/rejected": -86.17532348632812, + "loss": 0.3263, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.536794424057007, + "rewards/margins": 1.133492350578308, + "rewards/rejected": 1.4033020734786987, + "step": 9122 + }, + { + "epoch": 1.48, + "learning_rate": 1.6658878786894733e-06, + "logits/chosen": -1.0416905879974365, + "logits/rejected": -1.0147846937179565, + "logps/chosen": -44.22863006591797, + "logps/rejected": -39.57780075073242, + "loss": 1.0132, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.380954384803772, + "rewards/margins": -0.4765746593475342, + "rewards/rejected": 1.8575290441513062, + "step": 9123 + }, + { + "epoch": 1.48, + "learning_rate": 1.6649085899499196e-06, + "logits/chosen": -1.0319384336471558, + "logits/rejected": -1.064414143562317, + "logps/chosen": -59.37078094482422, + "logps/rejected": -58.65813446044922, + "loss": 0.3576, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3762459754943848, + "rewards/margins": -0.019802093505859375, + "rewards/rejected": 3.396048069000244, + "step": 9124 + }, + { + "epoch": 1.48, + "learning_rate": 1.6639295316336267e-06, + "logits/chosen": -1.2517255544662476, + "logits/rejected": -1.2953124046325684, + "logps/chosen": -59.55958557128906, + "logps/rejected": -86.32124328613281, + "loss": 1.0105, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0732605457305908, + "rewards/margins": -0.7958953380584717, + "rewards/rejected": 1.8691558837890625, + "step": 9125 + }, + { + "epoch": 1.48, + "learning_rate": 1.6629507038082355e-06, + "logits/chosen": -0.9492540955543518, + "logits/rejected": -0.9254205822944641, + "logps/chosen": -66.12208557128906, + "logps/rejected": -68.13288116455078, + "loss": 0.4512, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.777334690093994, + "rewards/margins": -0.11157917976379395, + "rewards/rejected": 2.888913869857788, + "step": 9126 + }, + { + "epoch": 1.48, + "learning_rate": 1.6619721065413764e-06, + "logits/chosen": -1.0962190628051758, + "logits/rejected": -1.0256175994873047, + "logps/chosen": -57.67776870727539, + "logps/rejected": -24.319202423095703, + "loss": 0.0891, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8330013751983643, + "rewards/margins": 1.772815227508545, + "rewards/rejected": 0.060186196118593216, + "step": 9127 + }, + { + "epoch": 1.48, + "learning_rate": 1.6609937399006587e-06, + "logits/chosen": -1.4522573947906494, + "logits/rejected": -1.410008430480957, + "logps/chosen": -88.2257308959961, + "logps/rejected": -51.18779754638672, + "loss": 0.4677, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3821189403533936, + "rewards/margins": 0.6157164573669434, + "rewards/rejected": 2.76640248298645, + "step": 9128 + }, + { + "epoch": 1.48, + "learning_rate": 1.6600156039536813e-06, + "logits/chosen": -1.4633773565292358, + "logits/rejected": -1.4633773565292358, + "logps/chosen": -93.2548828125, + "logps/rejected": -93.2548828125, + "loss": 0.4317, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.91676664352417, + "rewards/margins": 0.0, + "rewards/rejected": 6.91676664352417, + "step": 9129 + }, + { + "epoch": 1.48, + "learning_rate": 1.6590376987680207e-06, + "logits/chosen": -1.184205412864685, + "logits/rejected": -1.0677530765533447, + "logps/chosen": -38.821922302246094, + "logps/rejected": -24.10712432861328, + "loss": 0.1913, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5331764221191406, + "rewards/margins": 0.8932517766952515, + "rewards/rejected": 0.6399246454238892, + "step": 9130 + }, + { + "epoch": 1.48, + "learning_rate": 1.6580600244112443e-06, + "logits/chosen": -1.4834861755371094, + "logits/rejected": -1.5574228763580322, + "logps/chosen": -104.25767517089844, + "logps/rejected": -154.83238220214844, + "loss": 1.2302, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.178142070770264, + "rewards/margins": -2.3695125579833984, + "rewards/rejected": 6.547654628753662, + "step": 9131 + }, + { + "epoch": 1.48, + "learning_rate": 1.6570825809508967e-06, + "logits/chosen": -1.2419534921646118, + "logits/rejected": -1.3334110975265503, + "logps/chosen": -98.5841293334961, + "logps/rejected": -102.43330383300781, + "loss": 1.2676, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7287209033966064, + "rewards/margins": -2.046720266342163, + "rewards/rejected": 4.7754411697387695, + "step": 9132 + }, + { + "epoch": 1.48, + "learning_rate": 1.6561053684545135e-06, + "logits/chosen": -1.3547217845916748, + "logits/rejected": -1.3589383363723755, + "logps/chosen": -101.77082824707031, + "logps/rejected": -77.69288635253906, + "loss": 1.2359, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.7916030883789062, + "rewards/margins": -0.14842987060546875, + "rewards/rejected": 3.940032958984375, + "step": 9133 + }, + { + "epoch": 1.48, + "learning_rate": 1.6551283869896073e-06, + "logits/chosen": -1.0881285667419434, + "logits/rejected": -1.0409111976623535, + "logps/chosen": -86.1886215209961, + "logps/rejected": -79.71124267578125, + "loss": 0.2611, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9131247997283936, + "rewards/margins": 2.392955780029297, + "rewards/rejected": 0.5201690793037415, + "step": 9134 + }, + { + "epoch": 1.48, + "learning_rate": 1.654151636623682e-06, + "logits/chosen": -1.7606607675552368, + "logits/rejected": -1.8286672830581665, + "logps/chosen": -124.52192687988281, + "logps/rejected": -106.70642852783203, + "loss": 0.0659, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.638768196105957, + "rewards/margins": 3.9749598503112793, + "rewards/rejected": 5.663808345794678, + "step": 9135 + }, + { + "epoch": 1.48, + "learning_rate": 1.6531751174242184e-06, + "logits/chosen": -1.0129393339157104, + "logits/rejected": -0.9264728426933289, + "logps/chosen": -48.748939514160156, + "logps/rejected": -39.97980499267578, + "loss": 0.843, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.557981252670288, + "rewards/margins": -1.4718644618988037, + "rewards/rejected": 4.029845714569092, + "step": 9136 + }, + { + "epoch": 1.48, + "learning_rate": 1.6521988294586877e-06, + "logits/chosen": -1.1485267877578735, + "logits/rejected": -1.146997094154358, + "logps/chosen": -38.106834411621094, + "logps/rejected": -99.0965347290039, + "loss": 0.4592, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.40417218208313, + "rewards/margins": 0.6238094568252563, + "rewards/rejected": 1.7803627252578735, + "step": 9137 + }, + { + "epoch": 1.48, + "learning_rate": 1.6512227727945391e-06, + "logits/chosen": -1.100516438484192, + "logits/rejected": -1.1026723384857178, + "logps/chosen": -34.47880172729492, + "logps/rejected": -35.84449005126953, + "loss": 0.8338, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6065529584884644, + "rewards/margins": -0.24893951416015625, + "rewards/rejected": 1.8554924726486206, + "step": 9138 + }, + { + "epoch": 1.48, + "learning_rate": 1.6502469474992122e-06, + "logits/chosen": -1.111660122871399, + "logits/rejected": -1.007283329963684, + "logps/chosen": -56.395545959472656, + "logps/rejected": -53.61952209472656, + "loss": 0.2469, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3820717334747314, + "rewards/margins": 0.8954148292541504, + "rewards/rejected": 2.486656904220581, + "step": 9139 + }, + { + "epoch": 1.48, + "learning_rate": 1.6492713536401233e-06, + "logits/chosen": -1.138960361480713, + "logits/rejected": -1.2156462669372559, + "logps/chosen": -59.75967788696289, + "logps/rejected": -67.51799774169922, + "loss": 1.7608, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.622074604034424, + "rewards/margins": -3.2557077407836914, + "rewards/rejected": 6.877782344818115, + "step": 9140 + }, + { + "epoch": 1.48, + "learning_rate": 1.6482959912846813e-06, + "logits/chosen": -1.0032932758331299, + "logits/rejected": -1.035159707069397, + "logps/chosen": -29.258947372436523, + "logps/rejected": -45.72179412841797, + "loss": 0.561, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7012995481491089, + "rewards/margins": -0.4918898344039917, + "rewards/rejected": 2.1931893825531006, + "step": 9141 + }, + { + "epoch": 1.48, + "learning_rate": 1.6473208605002705e-06, + "logits/chosen": -1.4949702024459839, + "logits/rejected": -1.4906165599822998, + "logps/chosen": -85.06834411621094, + "logps/rejected": -78.73377990722656, + "loss": 1.2476, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.307499647140503, + "rewards/margins": -0.5318527221679688, + "rewards/rejected": 2.8393523693084717, + "step": 9142 + }, + { + "epoch": 1.48, + "learning_rate": 1.6463459613542664e-06, + "logits/chosen": -0.9785253405570984, + "logits/rejected": -0.9140478372573853, + "logps/chosen": -62.37782287597656, + "logps/rejected": -40.1776123046875, + "loss": 0.8225, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6601265668869019, + "rewards/margins": 0.23073434829711914, + "rewards/rejected": 1.4293922185897827, + "step": 9143 + }, + { + "epoch": 1.48, + "learning_rate": 1.6453712939140221e-06, + "logits/chosen": -1.2958852052688599, + "logits/rejected": -1.3556846380233765, + "logps/chosen": -94.13650512695312, + "logps/rejected": -114.39820098876953, + "loss": 1.6237, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.9274277687072754, + "rewards/margins": -3.2073373794555664, + "rewards/rejected": 7.134765148162842, + "step": 9144 + }, + { + "epoch": 1.48, + "learning_rate": 1.644396858246881e-06, + "logits/chosen": -0.979732096195221, + "logits/rejected": -0.86516273021698, + "logps/chosen": -58.38277053833008, + "logps/rejected": -40.26679611206055, + "loss": 0.2059, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5638424158096313, + "rewards/margins": 0.7946746945381165, + "rewards/rejected": 0.7691677212715149, + "step": 9145 + }, + { + "epoch": 1.48, + "learning_rate": 1.6434226544201648e-06, + "logits/chosen": -1.533457636833191, + "logits/rejected": -1.4999343156814575, + "logps/chosen": -145.03988647460938, + "logps/rejected": -113.50141906738281, + "loss": 0.0387, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.923313856124878, + "rewards/margins": 2.6359939575195312, + "rewards/rejected": 1.2873200178146362, + "step": 9146 + }, + { + "epoch": 1.48, + "learning_rate": 1.6424486825011842e-06, + "logits/chosen": -1.011277198791504, + "logits/rejected": -0.928357720375061, + "logps/chosen": -74.13188171386719, + "logps/rejected": -68.29792785644531, + "loss": 1.1721, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6458771228790283, + "rewards/margins": -2.067833185195923, + "rewards/rejected": 4.713710308074951, + "step": 9147 + }, + { + "epoch": 1.48, + "learning_rate": 1.6414749425572291e-06, + "logits/chosen": -1.1376783847808838, + "logits/rejected": -1.141648292541504, + "logps/chosen": -40.46326446533203, + "logps/rejected": -34.39683532714844, + "loss": 0.3062, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8126308917999268, + "rewards/margins": 0.7960227727890015, + "rewards/rejected": 1.0166081190109253, + "step": 9148 + }, + { + "epoch": 1.48, + "learning_rate": 1.640501434655578e-06, + "logits/chosen": -1.2467890977859497, + "logits/rejected": -1.3564832210540771, + "logps/chosen": -52.93049621582031, + "logps/rejected": -83.87026977539062, + "loss": 1.0239, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0804641246795654, + "rewards/margins": -1.506312608718872, + "rewards/rejected": 3.5867767333984375, + "step": 9149 + }, + { + "epoch": 1.49, + "learning_rate": 1.6395281588634887e-06, + "logits/chosen": -1.212509036064148, + "logits/rejected": -1.215506911277771, + "logps/chosen": -5.3697309494018555, + "logps/rejected": -11.221837997436523, + "loss": 0.3279, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5936455726623535, + "rewards/margins": 0.09163272380828857, + "rewards/rejected": 0.5020128488540649, + "step": 9150 + }, + { + "epoch": 1.49, + "learning_rate": 1.6385551152482077e-06, + "logits/chosen": -1.2206735610961914, + "logits/rejected": -1.1667771339416504, + "logps/chosen": -88.03677368164062, + "logps/rejected": -70.20195007324219, + "loss": 0.2231, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.404895067214966, + "rewards/margins": 0.6235824823379517, + "rewards/rejected": 1.7813125848770142, + "step": 9151 + }, + { + "epoch": 1.49, + "learning_rate": 1.6375823038769606e-06, + "logits/chosen": -1.2383755445480347, + "logits/rejected": -1.2375578880310059, + "logps/chosen": -80.04496002197266, + "logps/rejected": -106.63442993164062, + "loss": 0.5524, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.127960205078125, + "rewards/margins": 0.6164512634277344, + "rewards/rejected": 1.5115089416503906, + "step": 9152 + }, + { + "epoch": 1.49, + "learning_rate": 1.6366097248169604e-06, + "logits/chosen": -1.103704810142517, + "logits/rejected": -1.0787664651870728, + "logps/chosen": -46.02735900878906, + "logps/rejected": -45.69727325439453, + "loss": 0.5532, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.099468231201172, + "rewards/margins": -0.1473374366760254, + "rewards/rejected": 2.2468056678771973, + "step": 9153 + }, + { + "epoch": 1.49, + "learning_rate": 1.6356373781354058e-06, + "logits/chosen": -1.5052391290664673, + "logits/rejected": -1.4898241758346558, + "logps/chosen": -51.92050552368164, + "logps/rejected": -61.524009704589844, + "loss": 0.1466, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3703250885009766, + "rewards/margins": 1.98601496219635, + "rewards/rejected": 1.3843101263046265, + "step": 9154 + }, + { + "epoch": 1.49, + "learning_rate": 1.634665263899472e-06, + "logits/chosen": -1.3064197301864624, + "logits/rejected": -1.1835178136825562, + "logps/chosen": -123.82825469970703, + "logps/rejected": -80.10301208496094, + "loss": 0.3317, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.689464569091797, + "rewards/margins": 3.6507468223571777, + "rewards/rejected": 2.038717746734619, + "step": 9155 + }, + { + "epoch": 1.49, + "learning_rate": 1.633693382176328e-06, + "logits/chosen": -1.253426432609558, + "logits/rejected": -1.1601738929748535, + "logps/chosen": -101.48432922363281, + "logps/rejected": -72.40220642089844, + "loss": 0.1857, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7750167846679688, + "rewards/margins": 0.8910751342773438, + "rewards/rejected": 2.883941650390625, + "step": 9156 + }, + { + "epoch": 1.49, + "learning_rate": 1.6327217330331162e-06, + "logits/chosen": -1.259037733078003, + "logits/rejected": -1.2504724264144897, + "logps/chosen": -79.41930389404297, + "logps/rejected": -155.67861938476562, + "loss": 1.0808, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.692668914794922, + "rewards/margins": -1.9882516860961914, + "rewards/rejected": 7.680920600891113, + "step": 9157 + }, + { + "epoch": 1.49, + "learning_rate": 1.6317503165369735e-06, + "logits/chosen": -1.593590497970581, + "logits/rejected": -1.6647814512252808, + "logps/chosen": -33.92181396484375, + "logps/rejected": -103.68997192382812, + "loss": 1.2481, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7391605377197266, + "rewards/margins": -1.9518141746520996, + "rewards/rejected": 4.690974712371826, + "step": 9158 + }, + { + "epoch": 1.49, + "learning_rate": 1.6307791327550122e-06, + "logits/chosen": -1.488411784172058, + "logits/rejected": -1.4851301908493042, + "logps/chosen": -167.42811584472656, + "logps/rejected": -25.528640747070312, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.491868495941162, + "rewards/margins": 4.95952033996582, + "rewards/rejected": 1.5323479175567627, + "step": 9159 + }, + { + "epoch": 1.49, + "learning_rate": 1.6298081817543338e-06, + "logits/chosen": -1.4107210636138916, + "logits/rejected": -1.2943966388702393, + "logps/chosen": -102.27181243896484, + "logps/rejected": -50.98808288574219, + "loss": 0.0151, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.438347816467285, + "rewards/margins": 3.652890920639038, + "rewards/rejected": 1.785456895828247, + "step": 9160 + }, + { + "epoch": 1.49, + "learning_rate": 1.6288374636020194e-06, + "logits/chosen": -1.2434345483779907, + "logits/rejected": -1.0288370847702026, + "logps/chosen": -131.67953491210938, + "logps/rejected": -53.449623107910156, + "loss": 0.414, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.130291938781738, + "rewards/margins": 2.9311532974243164, + "rewards/rejected": 2.199138641357422, + "step": 9161 + }, + { + "epoch": 1.49, + "learning_rate": 1.6278669783651396e-06, + "logits/chosen": -1.6038588285446167, + "logits/rejected": -1.658182978630066, + "logps/chosen": -133.06124877929688, + "logps/rejected": -171.75588989257812, + "loss": 2.8364, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2199676036834717, + "rewards/margins": -5.6602935791015625, + "rewards/rejected": 7.880261421203613, + "step": 9162 + }, + { + "epoch": 1.49, + "learning_rate": 1.6268967261107426e-06, + "logits/chosen": -1.2394431829452515, + "logits/rejected": -1.1277012825012207, + "logps/chosen": -106.07283782958984, + "logps/rejected": -60.648372650146484, + "loss": 0.0201, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.872203826904297, + "rewards/margins": 3.2624783515930176, + "rewards/rejected": 4.609725475311279, + "step": 9163 + }, + { + "epoch": 1.49, + "learning_rate": 1.625926706905867e-06, + "logits/chosen": -1.067786693572998, + "logits/rejected": -1.1166247129440308, + "logps/chosen": -69.84523010253906, + "logps/rejected": -94.12959289550781, + "loss": 0.1655, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.363104224205017, + "rewards/margins": 0.9516822099685669, + "rewards/rejected": 0.4114219844341278, + "step": 9164 + }, + { + "epoch": 1.49, + "learning_rate": 1.6249569208175286e-06, + "logits/chosen": -0.7637352347373962, + "logits/rejected": -0.7238719463348389, + "logps/chosen": -13.158608436584473, + "logps/rejected": -1.5828475952148438, + "loss": 0.8698, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9611382484436035, + "rewards/margins": 0.5590939521789551, + "rewards/rejected": 0.40204429626464844, + "step": 9165 + }, + { + "epoch": 1.49, + "learning_rate": 1.6239873679127337e-06, + "logits/chosen": -1.4198311567306519, + "logits/rejected": -1.3958454132080078, + "logps/chosen": -58.57879638671875, + "logps/rejected": -124.78567504882812, + "loss": 0.4317, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0326569080352783, + "rewards/margins": -0.23961186408996582, + "rewards/rejected": 3.272268772125244, + "step": 9166 + }, + { + "epoch": 1.49, + "learning_rate": 1.6230180482584657e-06, + "logits/chosen": -1.234178900718689, + "logits/rejected": -1.200295329093933, + "logps/chosen": -86.72549438476562, + "logps/rejected": -71.4405517578125, + "loss": 0.5263, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.004854679107666, + "rewards/margins": -0.3952460289001465, + "rewards/rejected": 4.4001007080078125, + "step": 9167 + }, + { + "epoch": 1.49, + "learning_rate": 1.6220489619216988e-06, + "logits/chosen": -1.0996043682098389, + "logits/rejected": -1.0235719680786133, + "logps/chosen": -66.76880645751953, + "logps/rejected": -61.71088790893555, + "loss": 0.3355, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5269837379455566, + "rewards/margins": 0.8493176698684692, + "rewards/rejected": 1.6776660680770874, + "step": 9168 + }, + { + "epoch": 1.49, + "learning_rate": 1.621080108969385e-06, + "logits/chosen": -1.2084518671035767, + "logits/rejected": -1.1633144617080688, + "logps/chosen": -89.85360717773438, + "logps/rejected": -87.52196502685547, + "loss": 1.0374, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4244126081466675, + "rewards/margins": -1.864585041999817, + "rewards/rejected": 3.2889976501464844, + "step": 9169 + }, + { + "epoch": 1.49, + "learning_rate": 1.6201114894684656e-06, + "logits/chosen": -1.544930100440979, + "logits/rejected": -1.4959888458251953, + "logps/chosen": -77.68484497070312, + "logps/rejected": -56.16032791137695, + "loss": 0.2808, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.493833303451538, + "rewards/margins": 1.1622579097747803, + "rewards/rejected": 2.331575393676758, + "step": 9170 + }, + { + "epoch": 1.49, + "learning_rate": 1.6191431034858596e-06, + "logits/chosen": -1.2584792375564575, + "logits/rejected": -0.8267675042152405, + "logps/chosen": -148.60633850097656, + "logps/rejected": -121.87014770507812, + "loss": 1.5104, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1613800525665283, + "rewards/margins": 0.07910609245300293, + "rewards/rejected": 3.0822739601135254, + "step": 9171 + }, + { + "epoch": 1.49, + "learning_rate": 1.6181749510884765e-06, + "logits/chosen": -1.2974376678466797, + "logits/rejected": -1.2331117391586304, + "logps/chosen": -46.158653259277344, + "logps/rejected": -45.362571716308594, + "loss": 0.1346, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4563987255096436, + "rewards/margins": 1.640608549118042, + "rewards/rejected": 1.8157901763916016, + "step": 9172 + }, + { + "epoch": 1.49, + "learning_rate": 1.6172070323432038e-06, + "logits/chosen": -0.9006499648094177, + "logits/rejected": -0.8999170660972595, + "logps/chosen": -72.12236022949219, + "logps/rejected": -133.37586975097656, + "loss": 1.3557, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.852126359939575, + "rewards/margins": 1.1673439741134644, + "rewards/rejected": 1.6847823858261108, + "step": 9173 + }, + { + "epoch": 1.49, + "learning_rate": 1.6162393473169186e-06, + "logits/chosen": -1.2395105361938477, + "logits/rejected": -1.237901210784912, + "logps/chosen": -5.054209232330322, + "logps/rejected": -16.76790428161621, + "loss": 2.18, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3509616553783417, + "rewards/margins": 0.08799970149993896, + "rewards/rejected": 0.2629619538784027, + "step": 9174 + }, + { + "epoch": 1.49, + "learning_rate": 1.6152718960764775e-06, + "logits/chosen": -1.1162701845169067, + "logits/rejected": -1.1218435764312744, + "logps/chosen": -50.131195068359375, + "logps/rejected": -77.82039642333984, + "loss": 2.091, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.93236243724823, + "rewards/margins": 0.07021260261535645, + "rewards/rejected": 1.8621498346328735, + "step": 9175 + }, + { + "epoch": 1.49, + "learning_rate": 1.6143046786887195e-06, + "logits/chosen": -0.949958860874176, + "logits/rejected": -0.9287973046302795, + "logps/chosen": -93.48345184326172, + "logps/rejected": -63.213768005371094, + "loss": 0.7968, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7351082563400269, + "rewards/margins": 0.9829499125480652, + "rewards/rejected": 0.7521583437919617, + "step": 9176 + }, + { + "epoch": 1.49, + "learning_rate": 1.6133376952204743e-06, + "logits/chosen": -1.5487253665924072, + "logits/rejected": -1.5143638849258423, + "logps/chosen": -92.37393188476562, + "logps/rejected": -42.068817138671875, + "loss": 0.1333, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.337063789367676, + "rewards/margins": 1.3923699855804443, + "rewards/rejected": 2.9446938037872314, + "step": 9177 + }, + { + "epoch": 1.49, + "learning_rate": 1.612370945738548e-06, + "logits/chosen": -1.2341289520263672, + "logits/rejected": -1.2341289520263672, + "logps/chosen": -32.40287780761719, + "logps/rejected": -32.40287780761719, + "loss": 0.3496, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.7208476066589355, + "rewards/margins": 0.0, + "rewards/rejected": 4.7208476066589355, + "step": 9178 + }, + { + "epoch": 1.49, + "learning_rate": 1.6114044303097366e-06, + "logits/chosen": -1.6981602907180786, + "logits/rejected": -1.574275016784668, + "logps/chosen": -77.28662109375, + "logps/rejected": -61.708526611328125, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.715915203094482, + "rewards/margins": 4.757695198059082, + "rewards/rejected": 2.9582200050354004, + "step": 9179 + }, + { + "epoch": 1.49, + "learning_rate": 1.6104381490008143e-06, + "logits/chosen": -1.7885246276855469, + "logits/rejected": -1.6995757818222046, + "logps/chosen": -129.68533325195312, + "logps/rejected": -44.26261520385742, + "loss": 0.0251, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.7765092849731445, + "rewards/margins": 4.548345565795898, + "rewards/rejected": 1.2281635999679565, + "step": 9180 + }, + { + "epoch": 1.49, + "learning_rate": 1.6094721018785452e-06, + "logits/chosen": -1.114770531654358, + "logits/rejected": -0.9719042181968689, + "logps/chosen": -22.740459442138672, + "logps/rejected": -84.48658752441406, + "loss": 2.3946, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1522163152694702, + "rewards/margins": -4.777738094329834, + "rewards/rejected": 5.929954528808594, + "step": 9181 + }, + { + "epoch": 1.49, + "learning_rate": 1.608506289009671e-06, + "logits/chosen": -1.425320029258728, + "logits/rejected": -1.4068351984024048, + "logps/chosen": -45.51249694824219, + "logps/rejected": -51.24810028076172, + "loss": 0.4195, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.385446310043335, + "rewards/margins": 0.5004785060882568, + "rewards/rejected": 2.884967803955078, + "step": 9182 + }, + { + "epoch": 1.49, + "learning_rate": 1.607540710460923e-06, + "logits/chosen": -1.0256720781326294, + "logits/rejected": -1.0845109224319458, + "logps/chosen": -48.94746398925781, + "logps/rejected": -87.52808380126953, + "loss": 2.2413, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.131150960922241, + "rewards/margins": -4.200242042541504, + "rewards/rejected": 7.331392765045166, + "step": 9183 + }, + { + "epoch": 1.49, + "learning_rate": 1.6065753662990109e-06, + "logits/chosen": -1.4740376472473145, + "logits/rejected": -1.1731044054031372, + "logps/chosen": -194.26904296875, + "logps/rejected": -24.82771110534668, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.880902290344238, + "rewards/margins": 4.321859359741211, + "rewards/rejected": 0.5590431094169617, + "step": 9184 + }, + { + "epoch": 1.49, + "learning_rate": 1.6056102565906334e-06, + "logits/chosen": -1.4102870225906372, + "logits/rejected": -1.3244128227233887, + "logps/chosen": -178.7481689453125, + "logps/rejected": -9.19701862335205, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.463647365570068, + "rewards/margins": 6.477217674255371, + "rewards/rejected": 0.9864298701286316, + "step": 9185 + }, + { + "epoch": 1.49, + "learning_rate": 1.6046453814024671e-06, + "logits/chosen": -1.257886290550232, + "logits/rejected": -1.2108391523361206, + "logps/chosen": -91.05490112304688, + "logps/rejected": -48.85878372192383, + "loss": 0.09, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.623002767562866, + "rewards/margins": 1.8904485702514648, + "rewards/rejected": 0.7325542569160461, + "step": 9186 + }, + { + "epoch": 1.49, + "learning_rate": 1.60368074080118e-06, + "logits/chosen": -1.6709970235824585, + "logits/rejected": -1.5875654220581055, + "logps/chosen": -105.1009750366211, + "logps/rejected": -24.127931594848633, + "loss": 0.1797, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9909539222717285, + "rewards/margins": 1.1519984006881714, + "rewards/rejected": 1.8389555215835571, + "step": 9187 + }, + { + "epoch": 1.49, + "learning_rate": 1.6027163348534159e-06, + "logits/chosen": -1.3947288990020752, + "logits/rejected": -1.3829737901687622, + "logps/chosen": -78.72737121582031, + "logps/rejected": -109.31148529052734, + "loss": 0.4779, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9697754383087158, + "rewards/margins": 0.9555199146270752, + "rewards/rejected": 1.0142555236816406, + "step": 9188 + }, + { + "epoch": 1.49, + "learning_rate": 1.6017521636258094e-06, + "logits/chosen": -1.189344882965088, + "logits/rejected": -1.0727286338806152, + "logps/chosen": -41.784942626953125, + "logps/rejected": -7.24124002456665, + "loss": 0.1214, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2245490550994873, + "rewards/margins": 2.1021010875701904, + "rewards/rejected": 1.1224479675292969, + "step": 9189 + }, + { + "epoch": 1.49, + "learning_rate": 1.6007882271849718e-06, + "logits/chosen": -1.2394073009490967, + "logits/rejected": -1.22298264503479, + "logps/chosen": -112.58808135986328, + "logps/rejected": -66.91859436035156, + "loss": 0.1742, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.346647024154663, + "rewards/margins": 1.1755188703536987, + "rewards/rejected": 1.1711281538009644, + "step": 9190 + }, + { + "epoch": 1.49, + "learning_rate": 1.599824525597506e-06, + "logits/chosen": -1.502974271774292, + "logits/rejected": -1.4864287376403809, + "logps/chosen": -149.42535400390625, + "logps/rejected": -108.43095397949219, + "loss": 0.099, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.4481658935546875, + "rewards/margins": 1.6337509155273438, + "rewards/rejected": 2.8144149780273438, + "step": 9191 + }, + { + "epoch": 1.49, + "learning_rate": 1.5988610589299908e-06, + "logits/chosen": -1.105255126953125, + "logits/rejected": -0.9476332664489746, + "logps/chosen": -109.45857238769531, + "logps/rejected": -68.86183166503906, + "loss": 0.068, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.016278266906738, + "rewards/margins": 1.951021671295166, + "rewards/rejected": 2.0652565956115723, + "step": 9192 + }, + { + "epoch": 1.49, + "learning_rate": 1.5978978272489964e-06, + "logits/chosen": -1.2988229990005493, + "logits/rejected": -1.2840081453323364, + "logps/chosen": -64.16539001464844, + "logps/rejected": -83.0836181640625, + "loss": 0.3503, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3508362770080566, + "rewards/margins": 0.24657225608825684, + "rewards/rejected": 3.1042640209198, + "step": 9193 + }, + { + "epoch": 1.49, + "learning_rate": 1.5969348306210692e-06, + "logits/chosen": -1.0775336027145386, + "logits/rejected": -1.0968067646026611, + "logps/chosen": -76.82023620605469, + "logps/rejected": -78.1643295288086, + "loss": 1.024, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7361679077148438, + "rewards/margins": -0.047359466552734375, + "rewards/rejected": 2.783527374267578, + "step": 9194 + }, + { + "epoch": 1.49, + "learning_rate": 1.5959720691127473e-06, + "logits/chosen": -1.3267632722854614, + "logits/rejected": -1.286528468132019, + "logps/chosen": -60.99928665161133, + "logps/rejected": -69.52542114257812, + "loss": 0.241, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8398433923721313, + "rewards/margins": 0.499264121055603, + "rewards/rejected": 1.3405792713165283, + "step": 9195 + }, + { + "epoch": 1.49, + "learning_rate": 1.5950095427905438e-06, + "logits/chosen": -1.202189326286316, + "logits/rejected": -1.282860517501831, + "logps/chosen": -72.56625366210938, + "logps/rejected": -109.1473617553711, + "loss": 0.298, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7720322608947754, + "rewards/margins": 0.23101425170898438, + "rewards/rejected": 2.541018009185791, + "step": 9196 + }, + { + "epoch": 1.49, + "learning_rate": 1.594047251720965e-06, + "logits/chosen": -1.583105206489563, + "logits/rejected": -1.4698988199234009, + "logps/chosen": -92.9352035522461, + "logps/rejected": -14.341882705688477, + "loss": 0.0784, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.928006649017334, + "rewards/margins": 4.947206497192383, + "rewards/rejected": 0.9808000922203064, + "step": 9197 + }, + { + "epoch": 1.49, + "learning_rate": 1.5930851959704918e-06, + "logits/chosen": -1.2937278747558594, + "logits/rejected": -1.2096052169799805, + "logps/chosen": -134.03994750976562, + "logps/rejected": -40.60445785522461, + "loss": 0.1853, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.361248970031738, + "rewards/margins": 1.8981809616088867, + "rewards/rejected": 3.4630680084228516, + "step": 9198 + }, + { + "epoch": 1.49, + "learning_rate": 1.5921233756055965e-06, + "logits/chosen": -1.2390623092651367, + "logits/rejected": -1.1672762632369995, + "logps/chosen": -84.63973999023438, + "logps/rejected": -64.75090026855469, + "loss": 0.1684, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.095419406890869, + "rewards/margins": 3.16513991355896, + "rewards/rejected": 2.930279493331909, + "step": 9199 + }, + { + "epoch": 1.49, + "learning_rate": 1.5911617906927285e-06, + "logits/chosen": -0.4576117992401123, + "logits/rejected": -0.4576117992401123, + "logps/chosen": -38.88960266113281, + "logps/rejected": -38.88960266113281, + "loss": 1.3794, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.205390214920044, + "rewards/margins": 0.0, + "rewards/rejected": 1.205390214920044, + "step": 9200 + }, + { + "epoch": 1.49, + "learning_rate": 1.590200441298328e-06, + "logits/chosen": -1.372014045715332, + "logits/rejected": -1.1811083555221558, + "logps/chosen": -204.88186645507812, + "logps/rejected": -76.0597152709961, + "loss": 0.0593, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.473010301589966, + "rewards/margins": 3.4444541931152344, + "rewards/rejected": 0.028556061908602715, + "step": 9201 + }, + { + "epoch": 1.49, + "learning_rate": 1.589239327488812e-06, + "logits/chosen": -1.2943469285964966, + "logits/rejected": -1.2943469285964966, + "logps/chosen": -53.356441497802734, + "logps/rejected": -53.356441497802734, + "loss": 0.3803, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.638031482696533, + "rewards/margins": 0.0, + "rewards/rejected": 3.638031482696533, + "step": 9202 + }, + { + "epoch": 1.49, + "learning_rate": 1.5882784493305863e-06, + "logits/chosen": -1.4622938632965088, + "logits/rejected": -1.3789318799972534, + "logps/chosen": -89.82411193847656, + "logps/rejected": -18.56058120727539, + "loss": 0.0829, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7612167596817017, + "rewards/margins": 1.7426315546035767, + "rewards/rejected": 0.018585205078125, + "step": 9203 + }, + { + "epoch": 1.49, + "learning_rate": 1.5873178068900364e-06, + "logits/chosen": -1.3242079019546509, + "logits/rejected": -1.2503740787506104, + "logps/chosen": -77.26399230957031, + "logps/rejected": -23.40911102294922, + "loss": 0.0779, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7412948608398438, + "rewards/margins": 1.7953219413757324, + "rewards/rejected": 0.9459728598594666, + "step": 9204 + }, + { + "epoch": 1.49, + "learning_rate": 1.586357400233537e-06, + "logits/chosen": -0.7242841720581055, + "logits/rejected": -0.6600291728973389, + "logps/chosen": -52.23210525512695, + "logps/rejected": -1.383395791053772, + "loss": 0.3612, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6177951693534851, + "rewards/margins": 0.20250973105430603, + "rewards/rejected": 0.4152854382991791, + "step": 9205 + }, + { + "epoch": 1.49, + "learning_rate": 1.585397229427439e-06, + "logits/chosen": -1.099316120147705, + "logits/rejected": -1.07096266746521, + "logps/chosen": -52.91313552856445, + "logps/rejected": -42.6739616394043, + "loss": 1.0965, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.754502534866333, + "rewards/margins": -1.0937912464141846, + "rewards/rejected": 3.8482937812805176, + "step": 9206 + }, + { + "epoch": 1.49, + "learning_rate": 1.5844372945380832e-06, + "logits/chosen": -1.3225700855255127, + "logits/rejected": -1.1881461143493652, + "logps/chosen": -80.81287384033203, + "logps/rejected": -47.106971740722656, + "loss": 0.058, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8720741271972656, + "rewards/margins": 2.158644199371338, + "rewards/rejected": -0.2865699827671051, + "step": 9207 + }, + { + "epoch": 1.49, + "learning_rate": 1.5834775956317937e-06, + "logits/chosen": -1.6757996082305908, + "logits/rejected": -1.447603702545166, + "logps/chosen": -126.7537612915039, + "logps/rejected": -33.38096618652344, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.750331878662109, + "rewards/margins": 4.9918413162231445, + "rewards/rejected": 0.7584907412528992, + "step": 9208 + }, + { + "epoch": 1.49, + "learning_rate": 1.582518132774873e-06, + "logits/chosen": -1.1038998365402222, + "logits/rejected": -1.0361976623535156, + "logps/chosen": -41.344261169433594, + "logps/rejected": -58.015804290771484, + "loss": 0.9401, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2864861488342285, + "rewards/margins": -1.6828982830047607, + "rewards/rejected": 3.9693844318389893, + "step": 9209 + }, + { + "epoch": 1.49, + "learning_rate": 1.581558906033615e-06, + "logits/chosen": -1.1124290227890015, + "logits/rejected": -1.0671722888946533, + "logps/chosen": -92.93003845214844, + "logps/rejected": -84.97003173828125, + "loss": 0.2179, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.198249101638794, + "rewards/margins": 1.5030066967010498, + "rewards/rejected": 0.6952423453330994, + "step": 9210 + }, + { + "epoch": 1.5, + "learning_rate": 1.580599915474289e-06, + "logits/chosen": -1.3246004581451416, + "logits/rejected": -1.2410038709640503, + "logps/chosen": -91.58882141113281, + "logps/rejected": -85.69133758544922, + "loss": 0.2283, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.906550884246826, + "rewards/margins": 0.5487957000732422, + "rewards/rejected": 4.357755184173584, + "step": 9211 + }, + { + "epoch": 1.5, + "learning_rate": 1.5796411611631557e-06, + "logits/chosen": -1.4323639869689941, + "logits/rejected": -1.529205560684204, + "logps/chosen": -50.579917907714844, + "logps/rejected": -135.36366271972656, + "loss": 1.0149, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.761546611785889, + "rewards/margins": -1.888439655303955, + "rewards/rejected": 8.649986267089844, + "step": 9212 + }, + { + "epoch": 1.5, + "learning_rate": 1.578682643166453e-06, + "logits/chosen": -1.0678632259368896, + "logits/rejected": -0.9992278218269348, + "logps/chosen": -45.36775207519531, + "logps/rejected": -31.585586547851562, + "loss": 0.2618, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.834066152572632, + "rewards/margins": 0.37761592864990234, + "rewards/rejected": 2.4564502239227295, + "step": 9213 + }, + { + "epoch": 1.5, + "learning_rate": 1.5777243615504085e-06, + "logits/chosen": -1.4033907651901245, + "logits/rejected": -1.4297897815704346, + "logps/chosen": -39.41073226928711, + "logps/rejected": -69.19561767578125, + "loss": 0.4497, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9579311609268188, + "rewards/margins": 0.32877469062805176, + "rewards/rejected": 1.629156470298767, + "step": 9214 + }, + { + "epoch": 1.5, + "learning_rate": 1.576766316381227e-06, + "logits/chosen": -0.9211920499801636, + "logits/rejected": -0.9281501770019531, + "logps/chosen": -29.59124755859375, + "logps/rejected": -36.28028106689453, + "loss": 0.4709, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.106201171875, + "rewards/margins": -0.4363217353820801, + "rewards/rejected": 2.54252290725708, + "step": 9215 + }, + { + "epoch": 1.5, + "learning_rate": 1.5758085077251039e-06, + "logits/chosen": -1.4221506118774414, + "logits/rejected": -1.3021440505981445, + "logps/chosen": -69.00071716308594, + "logps/rejected": -62.768157958984375, + "loss": 0.4734, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.031979560852051, + "rewards/margins": 1.5825607776641846, + "rewards/rejected": 2.449418783187866, + "step": 9216 + }, + { + "epoch": 1.5, + "learning_rate": 1.5748509356482112e-06, + "logits/chosen": -1.1261720657348633, + "logits/rejected": -1.0375492572784424, + "logps/chosen": -57.7686882019043, + "logps/rejected": -45.0368537902832, + "loss": 0.4215, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.299783706665039, + "rewards/margins": -0.16355359554290771, + "rewards/rejected": 1.4633373022079468, + "step": 9217 + }, + { + "epoch": 1.5, + "learning_rate": 1.5738936002167116e-06, + "logits/chosen": -1.0246304273605347, + "logits/rejected": -0.968643307685852, + "logps/chosen": -51.75632858276367, + "logps/rejected": -55.52688980102539, + "loss": 0.2568, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8174641132354736, + "rewards/margins": 0.41271209716796875, + "rewards/rejected": 1.4047520160675049, + "step": 9218 + }, + { + "epoch": 1.5, + "learning_rate": 1.5729365014967441e-06, + "logits/chosen": -1.084397792816162, + "logits/rejected": -1.0305479764938354, + "logps/chosen": -33.99583435058594, + "logps/rejected": -19.781841278076172, + "loss": 0.6313, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8925789594650269, + "rewards/margins": -0.6509746313095093, + "rewards/rejected": 2.543553590774536, + "step": 9219 + }, + { + "epoch": 1.5, + "learning_rate": 1.571979639554439e-06, + "logits/chosen": -1.116767168045044, + "logits/rejected": -1.1028114557266235, + "logps/chosen": -44.49965286254883, + "logps/rejected": -13.326166152954102, + "loss": 0.7363, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9229973554611206, + "rewards/margins": 1.0868072509765625, + "rewards/rejected": 0.8361900448799133, + "step": 9220 + }, + { + "epoch": 1.5, + "learning_rate": 1.571023014455903e-06, + "logits/chosen": -1.680468201637268, + "logits/rejected": -1.5365222692489624, + "logps/chosen": -124.5345230102539, + "logps/rejected": -16.128902435302734, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.013506889343262, + "rewards/margins": 8.006335258483887, + "rewards/rejected": 1.007171630859375, + "step": 9221 + }, + { + "epoch": 1.5, + "learning_rate": 1.5700666262672326e-06, + "logits/chosen": -2.113037347793579, + "logits/rejected": -2.1114537715911865, + "logps/chosen": -50.71489715576172, + "logps/rejected": -20.761747360229492, + "loss": 0.1249, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.800714135169983, + "rewards/margins": 1.287540078163147, + "rewards/rejected": 0.5131740570068359, + "step": 9222 + }, + { + "epoch": 1.5, + "learning_rate": 1.5691104750545027e-06, + "logits/chosen": -1.583709955215454, + "logits/rejected": -1.5423157215118408, + "logps/chosen": -39.15479278564453, + "logps/rejected": -30.145004272460938, + "loss": 0.4932, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.355001211166382, + "rewards/margins": -0.3391287326812744, + "rewards/rejected": 3.6941299438476562, + "step": 9223 + }, + { + "epoch": 1.5, + "learning_rate": 1.5681545608837779e-06, + "logits/chosen": -1.4756985902786255, + "logits/rejected": -1.4723495244979858, + "logps/chosen": -202.8358154296875, + "logps/rejected": -121.44755554199219, + "loss": 0.5949, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.603600978851318, + "rewards/margins": -0.7808732986450195, + "rewards/rejected": 5.384474277496338, + "step": 9224 + }, + { + "epoch": 1.5, + "learning_rate": 1.5671988838210977e-06, + "logits/chosen": -1.6715483665466309, + "logits/rejected": -1.5980541706085205, + "logps/chosen": -100.10913848876953, + "logps/rejected": -24.085613250732422, + "loss": 0.5301, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2308952808380127, + "rewards/margins": 1.5902214050292969, + "rewards/rejected": 1.6406738758087158, + "step": 9225 + }, + { + "epoch": 1.5, + "learning_rate": 1.566243443932496e-06, + "logits/chosen": -1.5559053421020508, + "logits/rejected": -1.5464318990707397, + "logps/chosen": -42.329463958740234, + "logps/rejected": -48.800498962402344, + "loss": 0.3631, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9253219366073608, + "rewards/margins": 0.3136413097381592, + "rewards/rejected": 1.6116806268692017, + "step": 9226 + }, + { + "epoch": 1.5, + "learning_rate": 1.5652882412839793e-06, + "logits/chosen": -1.3854948282241821, + "logits/rejected": -1.4190666675567627, + "logps/chosen": -134.1241455078125, + "logps/rejected": -146.8737030029297, + "loss": 2.537, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4354188442230225, + "rewards/margins": -4.973367691040039, + "rewards/rejected": 7.408786296844482, + "step": 9227 + }, + { + "epoch": 1.5, + "learning_rate": 1.5643332759415475e-06, + "logits/chosen": -1.3282939195632935, + "logits/rejected": -1.3232816457748413, + "logps/chosen": -46.929481506347656, + "logps/rejected": -53.21791076660156, + "loss": 0.4289, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8372552394866943, + "rewards/margins": -0.3007628917694092, + "rewards/rejected": 3.1380181312561035, + "step": 9228 + }, + { + "epoch": 1.5, + "learning_rate": 1.5633785479711761e-06, + "logits/chosen": -0.9886179566383362, + "logits/rejected": -0.9568278193473816, + "logps/chosen": -61.3315544128418, + "logps/rejected": -132.09825134277344, + "loss": 4.1453, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.149995803833008, + "rewards/margins": 0.7760944366455078, + "rewards/rejected": 1.3739013671875, + "step": 9229 + }, + { + "epoch": 1.5, + "learning_rate": 1.5624240574388306e-06, + "logits/chosen": -1.4155409336090088, + "logits/rejected": -1.3177318572998047, + "logps/chosen": -34.144840240478516, + "logps/rejected": -51.652015686035156, + "loss": 0.8272, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4614826440811157, + "rewards/margins": 0.6009441614151001, + "rewards/rejected": 0.8605384826660156, + "step": 9230 + }, + { + "epoch": 1.5, + "learning_rate": 1.5614698044104547e-06, + "logits/chosen": -1.4073349237442017, + "logits/rejected": -1.4366577863693237, + "logps/chosen": -59.29230499267578, + "logps/rejected": -68.25494384765625, + "loss": 0.6337, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.003507137298584, + "rewards/margins": -0.8126649856567383, + "rewards/rejected": 4.816172122955322, + "step": 9231 + }, + { + "epoch": 1.5, + "learning_rate": 1.5605157889519818e-06, + "logits/chosen": -1.3177739381790161, + "logits/rejected": -1.2265080213546753, + "logps/chosen": -35.79910659790039, + "logps/rejected": -20.179224014282227, + "loss": 0.1874, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.784313917160034, + "rewards/margins": 2.0332305431365967, + "rewards/rejected": 0.7510833740234375, + "step": 9232 + }, + { + "epoch": 1.5, + "learning_rate": 1.5595620111293214e-06, + "logits/chosen": -1.8518438339233398, + "logits/rejected": -1.7563815116882324, + "logps/chosen": -79.1761703491211, + "logps/rejected": -62.40296936035156, + "loss": 0.4625, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2958436012268066, + "rewards/margins": 0.5898957252502441, + "rewards/rejected": 2.7059478759765625, + "step": 9233 + }, + { + "epoch": 1.5, + "learning_rate": 1.5586084710083737e-06, + "logits/chosen": -1.3834357261657715, + "logits/rejected": -1.2892076969146729, + "logps/chosen": -135.1788330078125, + "logps/rejected": -130.264892578125, + "loss": 0.0324, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.750121116638184, + "rewards/margins": 5.517047882080078, + "rewards/rejected": 3.2330734729766846, + "step": 9234 + }, + { + "epoch": 1.5, + "learning_rate": 1.5576551686550162e-06, + "logits/chosen": -1.5990099906921387, + "logits/rejected": -1.574064016342163, + "logps/chosen": -249.9379119873047, + "logps/rejected": -76.44171905517578, + "loss": 0.0454, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.64323902130127, + "rewards/margins": 6.41353702545166, + "rewards/rejected": 3.2297019958496094, + "step": 9235 + }, + { + "epoch": 1.5, + "learning_rate": 1.5567021041351166e-06, + "logits/chosen": -0.9396721124649048, + "logits/rejected": -1.0044374465942383, + "logps/chosen": -45.53873062133789, + "logps/rejected": -82.18305969238281, + "loss": 0.4426, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.205630898475647, + "rewards/margins": 1.3412052392959595, + "rewards/rejected": -0.1355743408203125, + "step": 9236 + }, + { + "epoch": 1.5, + "learning_rate": 1.5557492775145189e-06, + "logits/chosen": -1.3876429796218872, + "logits/rejected": -1.2701756954193115, + "logps/chosen": -145.75218200683594, + "logps/rejected": -33.05347442626953, + "loss": 0.2189, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.688847541809082, + "rewards/margins": 4.249547481536865, + "rewards/rejected": 1.4393001794815063, + "step": 9237 + }, + { + "epoch": 1.5, + "learning_rate": 1.5547966888590582e-06, + "logits/chosen": -1.0924440622329712, + "logits/rejected": -1.0853835344314575, + "logps/chosen": -36.7619514465332, + "logps/rejected": -54.48808288574219, + "loss": 0.3738, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.582653760910034, + "rewards/margins": -0.008281707763671875, + "rewards/rejected": 2.590935468673706, + "step": 9238 + }, + { + "epoch": 1.5, + "learning_rate": 1.553844338234546e-06, + "logits/chosen": -1.3435710668563843, + "logits/rejected": -1.4334938526153564, + "logps/chosen": -161.95323181152344, + "logps/rejected": -123.43602752685547, + "loss": 1.6005, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2960708141326904, + "rewards/margins": -3.133046865463257, + "rewards/rejected": 5.429117679595947, + "step": 9239 + }, + { + "epoch": 1.5, + "learning_rate": 1.5528922257067836e-06, + "logits/chosen": -1.281234860420227, + "logits/rejected": -1.203446865081787, + "logps/chosen": -48.62840270996094, + "logps/rejected": -18.398616790771484, + "loss": 0.1187, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.943047285079956, + "rewards/margins": 2.82979154586792, + "rewards/rejected": 0.11325569450855255, + "step": 9240 + }, + { + "epoch": 1.5, + "learning_rate": 1.5519403513415498e-06, + "logits/chosen": -1.2989070415496826, + "logits/rejected": -1.0489661693572998, + "logps/chosen": -231.88699340820312, + "logps/rejected": -132.86575317382812, + "loss": 0.1617, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.427524089813232, + "rewards/margins": 1.3989977836608887, + "rewards/rejected": 5.028526306152344, + "step": 9241 + }, + { + "epoch": 1.5, + "learning_rate": 1.5509887152046138e-06, + "logits/chosen": -1.1503363847732544, + "logits/rejected": -1.196258306503296, + "logps/chosen": -38.9229736328125, + "logps/rejected": -54.58753967285156, + "loss": 0.4599, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6408302783966064, + "rewards/margins": 0.24138641357421875, + "rewards/rejected": 1.3994438648223877, + "step": 9242 + }, + { + "epoch": 1.5, + "learning_rate": 1.5500373173617205e-06, + "logits/chosen": -1.1502411365509033, + "logits/rejected": -1.0241279602050781, + "logps/chosen": -110.65318298339844, + "logps/rejected": -25.790781021118164, + "loss": 0.2271, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.772451877593994, + "rewards/margins": 2.8150393962860107, + "rewards/rejected": -0.04258747026324272, + "step": 9243 + }, + { + "epoch": 1.5, + "learning_rate": 1.5490861578786055e-06, + "logits/chosen": -1.227567195892334, + "logits/rejected": -0.992760181427002, + "logps/chosen": -165.2572784423828, + "logps/rejected": -74.54326629638672, + "loss": 0.1414, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.067044258117676, + "rewards/margins": 3.276183605194092, + "rewards/rejected": 4.790860652923584, + "step": 9244 + }, + { + "epoch": 1.5, + "learning_rate": 1.5481352368209856e-06, + "logits/chosen": -1.0775315761566162, + "logits/rejected": -1.1127642393112183, + "logps/chosen": -76.83451080322266, + "logps/rejected": -67.27178955078125, + "loss": 0.3631, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9783012866973877, + "rewards/margins": 0.610410213470459, + "rewards/rejected": 2.3678910732269287, + "step": 9245 + }, + { + "epoch": 1.5, + "learning_rate": 1.5471845542545572e-06, + "logits/chosen": -1.6478463411331177, + "logits/rejected": -1.6663949489593506, + "logps/chosen": -100.33763122558594, + "logps/rejected": -79.43324279785156, + "loss": 0.171, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3046815395355225, + "rewards/margins": 1.677865743637085, + "rewards/rejected": 1.6268157958984375, + "step": 9246 + }, + { + "epoch": 1.5, + "learning_rate": 1.5462341102450068e-06, + "logits/chosen": -1.1733430624008179, + "logits/rejected": -1.072441816329956, + "logps/chosen": -91.57762145996094, + "logps/rejected": -97.83106231689453, + "loss": 1.1199, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.749485731124878, + "rewards/margins": -2.1238930225372314, + "rewards/rejected": 5.873378753662109, + "step": 9247 + }, + { + "epoch": 1.5, + "learning_rate": 1.5452839048579981e-06, + "logits/chosen": -1.3549425601959229, + "logits/rejected": -1.3997342586517334, + "logps/chosen": -284.11260986328125, + "logps/rejected": -66.99089050292969, + "loss": 0.3175, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.644116401672363, + "rewards/margins": 1.6261322498321533, + "rewards/rejected": 3.01798415184021, + "step": 9248 + }, + { + "epoch": 1.5, + "learning_rate": 1.5443339381591842e-06, + "logits/chosen": -1.3165998458862305, + "logits/rejected": -1.3060630559921265, + "logps/chosen": -103.69384765625, + "logps/rejected": -157.00006103515625, + "loss": 0.4171, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.641824722290039, + "rewards/margins": 0.26397705078125, + "rewards/rejected": 8.377847671508789, + "step": 9249 + }, + { + "epoch": 1.5, + "learning_rate": 1.543384210214196e-06, + "logits/chosen": -1.4076615571975708, + "logits/rejected": -1.494870662689209, + "logps/chosen": -277.3977966308594, + "logps/rejected": -176.17218017578125, + "loss": 1.8415, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.445849895477295, + "rewards/margins": -3.650351047515869, + "rewards/rejected": 10.096200942993164, + "step": 9250 + }, + { + "epoch": 1.5, + "learning_rate": 1.5424347210886538e-06, + "logits/chosen": -1.446231484413147, + "logits/rejected": -1.4953696727752686, + "logps/chosen": -201.68634033203125, + "logps/rejected": -145.2423095703125, + "loss": 0.2669, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.193243503570557, + "rewards/margins": 1.0767793655395508, + "rewards/rejected": 6.116464138031006, + "step": 9251 + }, + { + "epoch": 1.5, + "learning_rate": 1.5414854708481542e-06, + "logits/chosen": -0.9068372249603271, + "logits/rejected": -0.862523078918457, + "logps/chosen": -51.445770263671875, + "logps/rejected": -41.697242736816406, + "loss": 0.3599, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6209259033203125, + "rewards/margins": 0.6103699207305908, + "rewards/rejected": 2.0105559825897217, + "step": 9252 + }, + { + "epoch": 1.5, + "learning_rate": 1.5405364595582861e-06, + "logits/chosen": -1.4929479360580444, + "logits/rejected": -1.4585891962051392, + "logps/chosen": -51.32026672363281, + "logps/rejected": -61.09379959106445, + "loss": 0.7702, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.268463134765625, + "rewards/margins": -1.290548324584961, + "rewards/rejected": 4.559011459350586, + "step": 9253 + }, + { + "epoch": 1.5, + "learning_rate": 1.5395876872846132e-06, + "logits/chosen": -1.1042765378952026, + "logits/rejected": -1.1663318872451782, + "logps/chosen": -67.65839385986328, + "logps/rejected": -97.18698120117188, + "loss": 0.3104, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9195632934570312, + "rewards/margins": 0.16118168830871582, + "rewards/rejected": 2.7583816051483154, + "step": 9254 + }, + { + "epoch": 1.5, + "learning_rate": 1.5386391540926899e-06, + "logits/chosen": -1.8495748043060303, + "logits/rejected": -1.8289096355438232, + "logps/chosen": -165.42735290527344, + "logps/rejected": -37.78864288330078, + "loss": 0.2081, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1788864135742188, + "rewards/margins": 0.7493903636932373, + "rewards/rejected": 2.4294960498809814, + "step": 9255 + }, + { + "epoch": 1.5, + "learning_rate": 1.5376908600480477e-06, + "logits/chosen": -1.4732239246368408, + "logits/rejected": -1.3490678071975708, + "logps/chosen": -81.85028839111328, + "logps/rejected": -29.937068939208984, + "loss": 0.9878, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.52344274520874, + "rewards/margins": 1.2318415641784668, + "rewards/rejected": 4.291601181030273, + "step": 9256 + }, + { + "epoch": 1.5, + "learning_rate": 1.5367428052162081e-06, + "logits/chosen": -1.3508855104446411, + "logits/rejected": -1.3508855104446411, + "logps/chosen": -1.9016896486282349, + "logps/rejected": -1.9016896486282349, + "loss": 0.7138, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6641244292259216, + "rewards/margins": 0.0, + "rewards/rejected": 0.6641244292259216, + "step": 9257 + }, + { + "epoch": 1.5, + "learning_rate": 1.535794989662669e-06, + "logits/chosen": -1.3170983791351318, + "logits/rejected": -1.3548762798309326, + "logps/chosen": -114.66911315917969, + "logps/rejected": -74.49625396728516, + "loss": 0.7228, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.475599765777588, + "rewards/margins": -1.1753029823303223, + "rewards/rejected": 6.65090274810791, + "step": 9258 + }, + { + "epoch": 1.5, + "learning_rate": 1.5348474134529196e-06, + "logits/chosen": -0.8840765953063965, + "logits/rejected": -0.9256807565689087, + "logps/chosen": -17.528947830200195, + "logps/rejected": -46.75724411010742, + "loss": 0.2386, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0451462268829346, + "rewards/margins": 0.6525417566299438, + "rewards/rejected": 0.39260444045066833, + "step": 9259 + }, + { + "epoch": 1.5, + "learning_rate": 1.5339000766524247e-06, + "logits/chosen": -1.0970691442489624, + "logits/rejected": -1.0604665279388428, + "logps/chosen": -18.713294982910156, + "logps/rejected": -16.863374710083008, + "loss": 0.3614, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2535899877548218, + "rewards/margins": -0.02750265598297119, + "rewards/rejected": 1.281092643737793, + "step": 9260 + }, + { + "epoch": 1.5, + "learning_rate": 1.5329529793266396e-06, + "logits/chosen": -1.522231936454773, + "logits/rejected": -1.6090385913848877, + "logps/chosen": -59.665184020996094, + "logps/rejected": -87.35269165039062, + "loss": 1.6259, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.886420488357544, + "rewards/margins": -2.8338615894317627, + "rewards/rejected": 6.720282077789307, + "step": 9261 + }, + { + "epoch": 1.5, + "learning_rate": 1.532006121540996e-06, + "logits/chosen": -0.9416375160217285, + "logits/rejected": -0.9399372339248657, + "logps/chosen": -4.184502601623535, + "logps/rejected": -1.4790371656417847, + "loss": 0.9192, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5967300534248352, + "rewards/margins": 0.06773507595062256, + "rewards/rejected": 0.5289949774742126, + "step": 9262 + }, + { + "epoch": 1.5, + "learning_rate": 1.5310595033609171e-06, + "logits/chosen": -1.0138041973114014, + "logits/rejected": -1.1596534252166748, + "logps/chosen": -60.07408142089844, + "logps/rejected": -120.40367126464844, + "loss": 1.6199, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.456414699554443, + "rewards/margins": -0.27663278579711914, + "rewards/rejected": 4.7330474853515625, + "step": 9263 + }, + { + "epoch": 1.5, + "learning_rate": 1.5301131248518015e-06, + "logits/chosen": -1.1981480121612549, + "logits/rejected": -1.210662841796875, + "logps/chosen": -87.28768920898438, + "logps/rejected": -118.17366790771484, + "loss": 0.5639, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0254342555999756, + "rewards/margins": 0.6082108020782471, + "rewards/rejected": 2.4172234535217285, + "step": 9264 + }, + { + "epoch": 1.5, + "learning_rate": 1.5291669860790381e-06, + "logits/chosen": -1.1871490478515625, + "logits/rejected": -1.200772762298584, + "logps/chosen": -77.81656646728516, + "logps/rejected": -108.73308563232422, + "loss": 0.5194, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.821941375732422, + "rewards/margins": 0.9361664056777954, + "rewards/rejected": 1.8857749700546265, + "step": 9265 + }, + { + "epoch": 1.5, + "learning_rate": 1.5282210871079929e-06, + "logits/chosen": -1.1762551069259644, + "logits/rejected": -1.2241489887237549, + "logps/chosen": -84.40023803710938, + "logps/rejected": -114.09737396240234, + "loss": 2.2382, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.584320068359375, + "rewards/margins": -1.102402687072754, + "rewards/rejected": 4.686722755432129, + "step": 9266 + }, + { + "epoch": 1.5, + "learning_rate": 1.527275428004022e-06, + "logits/chosen": -0.8621987104415894, + "logits/rejected": -0.8658223748207092, + "logps/chosen": -11.576168060302734, + "logps/rejected": -1.2072832584381104, + "loss": 0.4238, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.06494217365980148, + "rewards/margins": -0.2870858907699585, + "rewards/rejected": 0.3520280718803406, + "step": 9267 + }, + { + "epoch": 1.5, + "learning_rate": 1.526330008832458e-06, + "logits/chosen": -1.0925822257995605, + "logits/rejected": -1.0925822257995605, + "logps/chosen": -7.289777755737305, + "logps/rejected": -7.289777755737305, + "loss": 0.3973, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9280683398246765, + "rewards/margins": 0.0, + "rewards/rejected": 0.9280683398246765, + "step": 9268 + }, + { + "epoch": 1.5, + "learning_rate": 1.5253848296586238e-06, + "logits/chosen": -1.0185483694076538, + "logits/rejected": -1.0558022260665894, + "logps/chosen": -43.775054931640625, + "logps/rejected": -108.55084991455078, + "loss": 1.9868, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.366131544113159, + "rewards/margins": -2.6505119800567627, + "rewards/rejected": 6.016643524169922, + "step": 9269 + }, + { + "epoch": 1.5, + "learning_rate": 1.5244398905478197e-06, + "logits/chosen": -1.1611104011535645, + "logits/rejected": -1.1589404344558716, + "logps/chosen": -3.8726582527160645, + "logps/rejected": -3.8138837814331055, + "loss": 0.4885, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4327094554901123, + "rewards/margins": -0.20201724767684937, + "rewards/rejected": 0.6347267031669617, + "step": 9270 + }, + { + "epoch": 1.5, + "learning_rate": 1.523495191565334e-06, + "logits/chosen": -1.4839853048324585, + "logits/rejected": -1.4875288009643555, + "logps/chosen": -28.838417053222656, + "logps/rejected": -39.08289337158203, + "loss": 1.6271, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5533493757247925, + "rewards/margins": -3.079692840576172, + "rewards/rejected": 4.633042335510254, + "step": 9271 + }, + { + "epoch": 1.5, + "learning_rate": 1.5225507327764345e-06, + "logits/chosen": -1.413017988204956, + "logits/rejected": -1.4001855850219727, + "logps/chosen": -41.95560836791992, + "logps/rejected": -55.72510528564453, + "loss": 0.465, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.30987286567688, + "rewards/margins": -0.4236025810241699, + "rewards/rejected": 2.73347544670105, + "step": 9272 + }, + { + "epoch": 1.51, + "learning_rate": 1.5216065142463766e-06, + "logits/chosen": -1.369804859161377, + "logits/rejected": -1.2380375862121582, + "logps/chosen": -117.13740539550781, + "logps/rejected": -48.80897521972656, + "loss": 0.0685, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.085392951965332, + "rewards/margins": 2.000021457672119, + "rewards/rejected": 2.085371494293213, + "step": 9273 + }, + { + "epoch": 1.51, + "learning_rate": 1.5206625360403943e-06, + "logits/chosen": -1.4196603298187256, + "logits/rejected": -1.4436396360397339, + "logps/chosen": -185.98818969726562, + "logps/rejected": -195.522216796875, + "loss": 0.2104, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.697824001312256, + "rewards/margins": 0.6502742767333984, + "rewards/rejected": 7.047549724578857, + "step": 9274 + }, + { + "epoch": 1.51, + "learning_rate": 1.519718798223711e-06, + "logits/chosen": -1.2287644147872925, + "logits/rejected": -1.2618770599365234, + "logps/chosen": -71.82081604003906, + "logps/rejected": -98.53785705566406, + "loss": 0.5829, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.056521892547607, + "rewards/margins": 2.8609423637390137, + "rewards/rejected": 3.1955795288085938, + "step": 9275 + }, + { + "epoch": 1.51, + "learning_rate": 1.5187753008615258e-06, + "logits/chosen": -1.0724629163742065, + "logits/rejected": -1.2673975229263306, + "logps/chosen": -64.85420227050781, + "logps/rejected": -171.03994750976562, + "loss": 1.0994, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2391388416290283, + "rewards/margins": -1.2547729015350342, + "rewards/rejected": 3.4939117431640625, + "step": 9276 + }, + { + "epoch": 1.51, + "learning_rate": 1.5178320440190297e-06, + "logits/chosen": -1.4441115856170654, + "logits/rejected": -1.1763157844543457, + "logps/chosen": -156.36439514160156, + "logps/rejected": -14.822530746459961, + "loss": 0.1539, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.304881572723389, + "rewards/margins": 4.339776515960693, + "rewards/rejected": 0.965104877948761, + "step": 9277 + }, + { + "epoch": 1.51, + "learning_rate": 1.516889027761389e-06, + "logits/chosen": -1.3645025491714478, + "logits/rejected": -1.3131284713745117, + "logps/chosen": -229.70700073242188, + "logps/rejected": -66.28692626953125, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.370062351226807, + "rewards/margins": 3.8553993701934814, + "rewards/rejected": 3.514662981033325, + "step": 9278 + }, + { + "epoch": 1.51, + "learning_rate": 1.5159462521537587e-06, + "logits/chosen": -0.8954735994338989, + "logits/rejected": -0.8537749648094177, + "logps/chosen": -91.73678588867188, + "logps/rejected": -88.96559143066406, + "loss": 0.6545, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6553276777267456, + "rewards/margins": -0.13069379329681396, + "rewards/rejected": 1.7860214710235596, + "step": 9279 + }, + { + "epoch": 1.51, + "learning_rate": 1.515003717261278e-06, + "logits/chosen": -0.9393394589424133, + "logits/rejected": -0.9986302256584167, + "logps/chosen": -52.3077392578125, + "logps/rejected": -36.972774505615234, + "loss": 1.1717, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6418564319610596, + "rewards/margins": -1.9353435039520264, + "rewards/rejected": 3.577199935913086, + "step": 9280 + }, + { + "epoch": 1.51, + "learning_rate": 1.5140614231490646e-06, + "logits/chosen": -1.4467816352844238, + "logits/rejected": -1.4159120321273804, + "logps/chosen": -131.40997314453125, + "logps/rejected": -105.71611022949219, + "loss": 0.2336, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.99734354019165, + "rewards/margins": 2.0210509300231934, + "rewards/rejected": 5.976292610168457, + "step": 9281 + }, + { + "epoch": 1.51, + "learning_rate": 1.5131193698822234e-06, + "logits/chosen": -1.2353081703186035, + "logits/rejected": -1.2280195951461792, + "logps/chosen": -72.272705078125, + "logps/rejected": -59.19025421142578, + "loss": 0.6219, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0875535011291504, + "rewards/margins": -0.8035140037536621, + "rewards/rejected": 2.8910675048828125, + "step": 9282 + }, + { + "epoch": 1.51, + "learning_rate": 1.512177557525838e-06, + "logits/chosen": -0.8921740651130676, + "logits/rejected": -0.834923505783081, + "logps/chosen": -58.47684097290039, + "logps/rejected": -27.52874755859375, + "loss": 0.1554, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.159885883331299, + "rewards/margins": 1.2583729028701782, + "rewards/rejected": 1.9015129804611206, + "step": 9283 + }, + { + "epoch": 1.51, + "learning_rate": 1.5112359861449832e-06, + "logits/chosen": -1.4132583141326904, + "logits/rejected": -1.3305175304412842, + "logps/chosen": -68.79615783691406, + "logps/rejected": -146.8416748046875, + "loss": 1.8219, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4713776111602783, + "rewards/margins": -3.5245392322540283, + "rewards/rejected": 5.995916843414307, + "step": 9284 + }, + { + "epoch": 1.51, + "learning_rate": 1.5102946558047094e-06, + "logits/chosen": -1.6202284097671509, + "logits/rejected": -1.622488021850586, + "logps/chosen": -170.11856079101562, + "logps/rejected": -131.2264862060547, + "loss": 0.1139, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.825291633605957, + "rewards/margins": 2.081134080886841, + "rewards/rejected": 3.744157552719116, + "step": 9285 + }, + { + "epoch": 1.51, + "learning_rate": 1.5093535665700566e-06, + "logits/chosen": -1.4412065744400024, + "logits/rejected": -1.4372785091400146, + "logps/chosen": -59.199554443359375, + "logps/rejected": -61.486473083496094, + "loss": 0.6606, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.457714796066284, + "rewards/margins": 0.13468074798583984, + "rewards/rejected": 3.3230340480804443, + "step": 9286 + }, + { + "epoch": 1.51, + "learning_rate": 1.508412718506042e-06, + "logits/chosen": -1.5341074466705322, + "logits/rejected": -1.505168080329895, + "logps/chosen": -54.98371887207031, + "logps/rejected": -210.39602661132812, + "loss": 2.1813, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.485279083251953, + "rewards/margins": -4.032454967498779, + "rewards/rejected": 6.517734050750732, + "step": 9287 + }, + { + "epoch": 1.51, + "learning_rate": 1.5074721116776724e-06, + "logits/chosen": -1.2916089296340942, + "logits/rejected": -1.1163123846054077, + "logps/chosen": -210.85931396484375, + "logps/rejected": -62.99631881713867, + "loss": 0.0513, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.853091716766357, + "rewards/margins": 2.336977958679199, + "rewards/rejected": 4.516113758087158, + "step": 9288 + }, + { + "epoch": 1.51, + "learning_rate": 1.5065317461499312e-06, + "logits/chosen": -1.1708009243011475, + "logits/rejected": -1.1486634016036987, + "logps/chosen": -76.14501953125, + "logps/rejected": -54.7987060546875, + "loss": 1.2527, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.472586154937744, + "rewards/margins": -0.217132568359375, + "rewards/rejected": 2.689718723297119, + "step": 9289 + }, + { + "epoch": 1.51, + "learning_rate": 1.5055916219877931e-06, + "logits/chosen": -1.6473783254623413, + "logits/rejected": -1.6540950536727905, + "logps/chosen": -141.8994140625, + "logps/rejected": -73.2823486328125, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.405688762664795, + "rewards/margins": 4.529994487762451, + "rewards/rejected": 2.8756942749023438, + "step": 9290 + }, + { + "epoch": 1.51, + "learning_rate": 1.5046517392562077e-06, + "logits/chosen": -1.2192885875701904, + "logits/rejected": -1.1887201070785522, + "logps/chosen": -53.51829528808594, + "logps/rejected": -66.38385772705078, + "loss": 1.4594, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.366895318031311, + "rewards/margins": -1.59266197681427, + "rewards/rejected": 2.959557294845581, + "step": 9291 + }, + { + "epoch": 1.51, + "learning_rate": 1.5037120980201153e-06, + "logits/chosen": -1.1583306789398193, + "logits/rejected": -1.253887414932251, + "logps/chosen": -51.719261169433594, + "logps/rejected": -88.67562103271484, + "loss": 1.9582, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9537293910980225, + "rewards/margins": -2.16792368888855, + "rewards/rejected": 5.121653079986572, + "step": 9292 + }, + { + "epoch": 1.51, + "learning_rate": 1.502772698344433e-06, + "logits/chosen": -1.1927566528320312, + "logits/rejected": -1.1788618564605713, + "logps/chosen": -146.530029296875, + "logps/rejected": -131.56813049316406, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.170102119445801, + "rewards/margins": 4.6576948165893555, + "rewards/rejected": 2.512407064437866, + "step": 9293 + }, + { + "epoch": 1.51, + "learning_rate": 1.5018335402940681e-06, + "logits/chosen": -1.5297596454620361, + "logits/rejected": -1.5431301593780518, + "logps/chosen": -130.13394165039062, + "logps/rejected": -164.11585998535156, + "loss": 1.2601, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.84832763671875, + "rewards/margins": -2.4302902221679688, + "rewards/rejected": 10.278617858886719, + "step": 9294 + }, + { + "epoch": 1.51, + "learning_rate": 1.500894623933904e-06, + "logits/chosen": -0.7602936029434204, + "logits/rejected": -0.7602936029434204, + "logps/chosen": -59.81539535522461, + "logps/rejected": -59.81539535522461, + "loss": 0.3528, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4104083776474, + "rewards/margins": 0.0, + "rewards/rejected": 1.4104083776474, + "step": 9295 + }, + { + "epoch": 1.51, + "learning_rate": 1.499955949328814e-06, + "logits/chosen": -1.192150354385376, + "logits/rejected": -1.2199652194976807, + "logps/chosen": -54.40235137939453, + "logps/rejected": -94.29353332519531, + "loss": 0.165, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.798387885093689, + "rewards/margins": 1.0783698558807373, + "rewards/rejected": 0.7200180292129517, + "step": 9296 + }, + { + "epoch": 1.51, + "learning_rate": 1.4990175165436482e-06, + "logits/chosen": -1.5398691892623901, + "logits/rejected": -1.54453444480896, + "logps/chosen": -79.56349182128906, + "logps/rejected": -58.52622604370117, + "loss": 0.279, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.263685703277588, + "rewards/margins": 1.0211811065673828, + "rewards/rejected": 2.242504596710205, + "step": 9297 + }, + { + "epoch": 1.51, + "learning_rate": 1.4980793256432474e-06, + "logits/chosen": -1.0863571166992188, + "logits/rejected": -1.102928876876831, + "logps/chosen": -55.464759826660156, + "logps/rejected": -70.86555480957031, + "loss": 1.4529, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.076082706451416, + "rewards/margins": -1.1215744018554688, + "rewards/rejected": 4.197657108306885, + "step": 9298 + }, + { + "epoch": 1.51, + "learning_rate": 1.4971413766924287e-06, + "logits/chosen": -1.071026086807251, + "logits/rejected": -0.8921542167663574, + "logps/chosen": -78.54545593261719, + "logps/rejected": -63.788700103759766, + "loss": 0.0634, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.558513164520264, + "rewards/margins": 4.134772300720215, + "rewards/rejected": 1.4237407445907593, + "step": 9299 + }, + { + "epoch": 1.51, + "learning_rate": 1.4962036697559956e-06, + "logits/chosen": -1.4177720546722412, + "logits/rejected": -1.565076231956482, + "logps/chosen": -50.74598693847656, + "logps/rejected": -110.33705139160156, + "loss": 2.3338, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.017192840576172, + "rewards/margins": -4.243279457092285, + "rewards/rejected": 7.260472297668457, + "step": 9300 + }, + { + "epoch": 1.51, + "learning_rate": 1.4952662048987377e-06, + "logits/chosen": -1.0897890329360962, + "logits/rejected": -1.1418628692626953, + "logps/chosen": -73.4822006225586, + "logps/rejected": -109.0357666015625, + "loss": 0.3422, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0179283618927, + "rewards/margins": 0.2931785583496094, + "rewards/rejected": 1.7247498035430908, + "step": 9301 + }, + { + "epoch": 1.51, + "learning_rate": 1.4943289821854212e-06, + "logits/chosen": -1.363020658493042, + "logits/rejected": -1.3183398246765137, + "logps/chosen": -158.7071990966797, + "logps/rejected": -237.57778930664062, + "loss": 0.339, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.345755100250244, + "rewards/margins": 0.05581045150756836, + "rewards/rejected": 7.289944648742676, + "step": 9302 + }, + { + "epoch": 1.51, + "learning_rate": 1.4933920016808028e-06, + "logits/chosen": -0.7522308826446533, + "logits/rejected": -0.6935426592826843, + "logps/chosen": -17.825620651245117, + "logps/rejected": -12.66048812866211, + "loss": 0.7361, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0414209365844727, + "rewards/margins": 0.29417455196380615, + "rewards/rejected": 0.7472463846206665, + "step": 9303 + }, + { + "epoch": 1.51, + "learning_rate": 1.4924552634496154e-06, + "logits/chosen": -1.1137644052505493, + "logits/rejected": -1.1137644052505493, + "logps/chosen": -0.9569490551948547, + "logps/rejected": -0.9569490551948547, + "loss": 0.3644, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.19082163274288177, + "rewards/margins": 0.0, + "rewards/rejected": 0.19082163274288177, + "step": 9304 + }, + { + "epoch": 1.51, + "learning_rate": 1.4915187675565824e-06, + "logits/chosen": -1.3736460208892822, + "logits/rejected": -1.5810514688491821, + "logps/chosen": -62.68642044067383, + "logps/rejected": -145.64657592773438, + "loss": 3.0824, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6456799507141113, + "rewards/margins": -5.85015344619751, + "rewards/rejected": 9.495833396911621, + "step": 9305 + }, + { + "epoch": 1.51, + "learning_rate": 1.4905825140664038e-06, + "logits/chosen": -1.2874598503112793, + "logits/rejected": -1.2815070152282715, + "logps/chosen": -41.01467514038086, + "logps/rejected": -58.49776077270508, + "loss": 0.3519, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.856126070022583, + "rewards/margins": 0.6141433715820312, + "rewards/rejected": 2.2419826984405518, + "step": 9306 + }, + { + "epoch": 1.51, + "learning_rate": 1.4896465030437678e-06, + "logits/chosen": -1.5944228172302246, + "logits/rejected": -1.5152438879013062, + "logps/chosen": -162.68875122070312, + "logps/rejected": -164.9181365966797, + "loss": 0.342, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.353981018066406, + "rewards/margins": 0.04700899124145508, + "rewards/rejected": 6.306972026824951, + "step": 9307 + }, + { + "epoch": 1.51, + "learning_rate": 1.4887107345533425e-06, + "logits/chosen": -1.4341412782669067, + "logits/rejected": -1.3341509103775024, + "logps/chosen": -114.46538543701172, + "logps/rejected": -98.48875427246094, + "loss": 0.3044, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.669574737548828, + "rewards/margins": 1.7351584434509277, + "rewards/rejected": 5.9344162940979, + "step": 9308 + }, + { + "epoch": 1.51, + "learning_rate": 1.487775208659782e-06, + "logits/chosen": -1.1626605987548828, + "logits/rejected": -1.1853466033935547, + "logps/chosen": -52.045005798339844, + "logps/rejected": -105.68716430664062, + "loss": 0.4882, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.934783935546875, + "rewards/margins": 0.7750084400177002, + "rewards/rejected": 2.159775495529175, + "step": 9309 + }, + { + "epoch": 1.51, + "learning_rate": 1.4868399254277205e-06, + "logits/chosen": -1.229353427886963, + "logits/rejected": -1.16494882106781, + "logps/chosen": -68.78005981445312, + "logps/rejected": -49.19633483886719, + "loss": 0.5305, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.031651258468628, + "rewards/margins": -0.30861663818359375, + "rewards/rejected": 2.3402678966522217, + "step": 9310 + }, + { + "epoch": 1.51, + "learning_rate": 1.48590488492178e-06, + "logits/chosen": -1.2186062335968018, + "logits/rejected": -1.1919901371002197, + "logps/chosen": -54.46333694458008, + "logps/rejected": -90.65963745117188, + "loss": 0.3958, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8570553064346313, + "rewards/margins": 0.31933093070983887, + "rewards/rejected": 1.5377243757247925, + "step": 9311 + }, + { + "epoch": 1.51, + "learning_rate": 1.4849700872065591e-06, + "logits/chosen": -1.6201870441436768, + "logits/rejected": -1.58390212059021, + "logps/chosen": -68.11404418945312, + "logps/rejected": -38.23345947265625, + "loss": 0.0798, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3425827026367188, + "rewards/margins": 1.9120872020721436, + "rewards/rejected": 0.4304954707622528, + "step": 9312 + }, + { + "epoch": 1.51, + "learning_rate": 1.4840355323466483e-06, + "logits/chosen": -1.5122923851013184, + "logits/rejected": -1.3928248882293701, + "logps/chosen": -62.5219612121582, + "logps/rejected": -30.17904281616211, + "loss": 0.3158, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4133708477020264, + "rewards/margins": 2.7685458660125732, + "rewards/rejected": 0.6448249816894531, + "step": 9313 + }, + { + "epoch": 1.51, + "learning_rate": 1.4831012204066114e-06, + "logits/chosen": -1.2195303440093994, + "logits/rejected": -1.1775110960006714, + "logps/chosen": -70.61283111572266, + "logps/rejected": -48.253684997558594, + "loss": 0.6564, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.33171010017395, + "rewards/margins": -0.7952187061309814, + "rewards/rejected": 3.1269288063049316, + "step": 9314 + }, + { + "epoch": 1.51, + "learning_rate": 1.482167151451005e-06, + "logits/chosen": -0.9697309136390686, + "logits/rejected": -0.9343701004981995, + "logps/chosen": -71.58594512939453, + "logps/rejected": -29.99469757080078, + "loss": 0.4275, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.866060733795166, + "rewards/margins": 1.738771915435791, + "rewards/rejected": 1.127288818359375, + "step": 9315 + }, + { + "epoch": 1.51, + "learning_rate": 1.4812333255443605e-06, + "logits/chosen": -1.5738012790679932, + "logits/rejected": -1.609126091003418, + "logps/chosen": -104.63905334472656, + "logps/rejected": -147.6002197265625, + "loss": 1.3016, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.113620281219482, + "rewards/margins": -2.509500026702881, + "rewards/rejected": 8.623120307922363, + "step": 9316 + }, + { + "epoch": 1.51, + "learning_rate": 1.4802997427512e-06, + "logits/chosen": -0.9963002800941467, + "logits/rejected": -0.9963002800941467, + "logps/chosen": -0.6759200096130371, + "logps/rejected": -0.6759200096130371, + "loss": 2.4145, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.20682501792907715, + "rewards/margins": 0.0, + "rewards/rejected": 0.20682501792907715, + "step": 9317 + }, + { + "epoch": 1.51, + "learning_rate": 1.4793664031360216e-06, + "logits/chosen": -1.2202558517456055, + "logits/rejected": -1.1971486806869507, + "logps/chosen": -130.30560302734375, + "logps/rejected": -70.36419677734375, + "loss": 0.2015, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0900285243988037, + "rewards/margins": 0.7661385536193848, + "rewards/rejected": 1.323889970779419, + "step": 9318 + }, + { + "epoch": 1.51, + "learning_rate": 1.4784333067633143e-06, + "logits/chosen": -1.4759721755981445, + "logits/rejected": -1.3969388008117676, + "logps/chosen": -172.6542205810547, + "logps/rejected": -115.74734497070312, + "loss": 1.1357, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.203037977218628, + "rewards/margins": 0.4321410655975342, + "rewards/rejected": 2.7708969116210938, + "step": 9319 + }, + { + "epoch": 1.51, + "learning_rate": 1.4775004536975423e-06, + "logits/chosen": -1.225468397140503, + "logits/rejected": -1.187267541885376, + "logps/chosen": -34.79915237426758, + "logps/rejected": -62.67990493774414, + "loss": 0.0992, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.971994400024414, + "rewards/margins": 1.6542807817459106, + "rewards/rejected": 1.3177136182785034, + "step": 9320 + }, + { + "epoch": 1.51, + "learning_rate": 1.4765678440031605e-06, + "logits/chosen": -1.438735842704773, + "logits/rejected": -1.4273114204406738, + "logps/chosen": -60.90851593017578, + "logps/rejected": -17.577701568603516, + "loss": 0.4671, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5064674615859985, + "rewards/margins": -0.4284706115722656, + "rewards/rejected": 0.9349380731582642, + "step": 9321 + }, + { + "epoch": 1.51, + "learning_rate": 1.4756354777446004e-06, + "logits/chosen": -1.4265695810317993, + "logits/rejected": -1.2458511590957642, + "logps/chosen": -132.95620727539062, + "logps/rejected": -106.54249572753906, + "loss": 0.0189, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.811700344085693, + "rewards/margins": 3.3139326572418213, + "rewards/rejected": 1.497767686843872, + "step": 9322 + }, + { + "epoch": 1.51, + "learning_rate": 1.4747033549862821e-06, + "logits/chosen": -1.0162631273269653, + "logits/rejected": -1.0302060842514038, + "logps/chosen": -30.926803588867188, + "logps/rejected": -24.31796646118164, + "loss": 0.3471, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5008068084716797, + "rewards/margins": 0.1340564489364624, + "rewards/rejected": 1.3667503595352173, + "step": 9323 + }, + { + "epoch": 1.51, + "learning_rate": 1.4737714757926036e-06, + "logits/chosen": -0.880272626876831, + "logits/rejected": -0.8756475448608398, + "logps/chosen": -1.9884982109069824, + "logps/rejected": -9.792798042297363, + "loss": 0.9927, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2386348694562912, + "rewards/margins": 0.1399526596069336, + "rewards/rejected": 0.0986822172999382, + "step": 9324 + }, + { + "epoch": 1.51, + "learning_rate": 1.4728398402279527e-06, + "logits/chosen": -1.0103163719177246, + "logits/rejected": -1.0103163719177246, + "logps/chosen": -33.25485610961914, + "logps/rejected": -33.25485610961914, + "loss": 0.8219, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8728176355361938, + "rewards/margins": 0.0, + "rewards/rejected": 1.8728176355361938, + "step": 9325 + }, + { + "epoch": 1.51, + "learning_rate": 1.4719084483566925e-06, + "logits/chosen": -1.3664627075195312, + "logits/rejected": -1.301957368850708, + "logps/chosen": -145.1580047607422, + "logps/rejected": -162.2762908935547, + "loss": 0.1232, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.267680644989014, + "rewards/margins": 1.275970697402954, + "rewards/rejected": 3.9917099475860596, + "step": 9326 + }, + { + "epoch": 1.51, + "learning_rate": 1.470977300243177e-06, + "logits/chosen": -1.2760703563690186, + "logits/rejected": -1.2460538148880005, + "logps/chosen": -58.299766540527344, + "logps/rejected": -45.572025299072266, + "loss": 0.2874, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4994407892227173, + "rewards/margins": 0.2573215961456299, + "rewards/rejected": 1.2421191930770874, + "step": 9327 + }, + { + "epoch": 1.51, + "learning_rate": 1.4700463959517363e-06, + "logits/chosen": -1.2554744482040405, + "logits/rejected": -1.2512458562850952, + "logps/chosen": -99.016845703125, + "logps/rejected": -85.90573120117188, + "loss": 1.1601, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.610699415206909, + "rewards/margins": 1.3479689359664917, + "rewards/rejected": 1.2627304792404175, + "step": 9328 + }, + { + "epoch": 1.51, + "learning_rate": 1.4691157355466912e-06, + "logits/chosen": -0.9485864639282227, + "logits/rejected": -0.9485864639282227, + "logps/chosen": -64.21690368652344, + "logps/rejected": -64.21690368652344, + "loss": 1.3888, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.199853420257568, + "rewards/margins": 0.0, + "rewards/rejected": 4.199853420257568, + "step": 9329 + }, + { + "epoch": 1.51, + "learning_rate": 1.4681853190923374e-06, + "logits/chosen": -1.2327264547348022, + "logits/rejected": -1.2059013843536377, + "logps/chosen": -200.78717041015625, + "logps/rejected": -69.40116882324219, + "loss": 0.2105, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.483673095703125, + "rewards/margins": 5.136476039886475, + "rewards/rejected": 1.3471969366073608, + "step": 9330 + }, + { + "epoch": 1.51, + "learning_rate": 1.467255146652961e-06, + "logits/chosen": -1.252639651298523, + "logits/rejected": -1.257885456085205, + "logps/chosen": -54.314849853515625, + "logps/rejected": -56.32162857055664, + "loss": 1.3937, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.641204833984375, + "rewards/margins": -2.610166549682617, + "rewards/rejected": 6.251371383666992, + "step": 9331 + }, + { + "epoch": 1.51, + "learning_rate": 1.4663252182928257e-06, + "logits/chosen": -1.1121101379394531, + "logits/rejected": -1.1942057609558105, + "logps/chosen": -48.791473388671875, + "logps/rejected": -58.57621383666992, + "loss": 1.1691, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.463141679763794, + "rewards/margins": -1.4306774139404297, + "rewards/rejected": 3.8938190937042236, + "step": 9332 + }, + { + "epoch": 1.51, + "learning_rate": 1.4653955340761833e-06, + "logits/chosen": -1.0106256008148193, + "logits/rejected": -0.9986995458602905, + "logps/chosen": -69.79027557373047, + "logps/rejected": -70.04182434082031, + "loss": 0.2548, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1339950561523438, + "rewards/margins": 0.4092743396759033, + "rewards/rejected": 2.7247207164764404, + "step": 9333 + }, + { + "epoch": 1.52, + "learning_rate": 1.4644660940672628e-06, + "logits/chosen": -1.3312792778015137, + "logits/rejected": -1.2592198848724365, + "logps/chosen": -87.67967224121094, + "logps/rejected": -62.305511474609375, + "loss": 0.3944, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.650444030761719, + "rewards/margins": 2.406144618988037, + "rewards/rejected": 3.2442994117736816, + "step": 9334 + }, + { + "epoch": 1.52, + "learning_rate": 1.4635368983302821e-06, + "logits/chosen": -1.0408562421798706, + "logits/rejected": -1.0377705097198486, + "logps/chosen": -52.42579650878906, + "logps/rejected": -37.6072998046875, + "loss": 2.0378, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.298146963119507, + "rewards/margins": 0.5636200904846191, + "rewards/rejected": 1.7345268726348877, + "step": 9335 + }, + { + "epoch": 1.52, + "learning_rate": 1.4626079469294408e-06, + "logits/chosen": -1.0843069553375244, + "logits/rejected": -1.0676934719085693, + "logps/chosen": -80.24996948242188, + "logps/rejected": -70.30630493164062, + "loss": 0.0672, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5180115699768066, + "rewards/margins": 1.9518258571624756, + "rewards/rejected": 0.5661857724189758, + "step": 9336 + }, + { + "epoch": 1.52, + "learning_rate": 1.4616792399289176e-06, + "logits/chosen": -0.9926654100418091, + "logits/rejected": -0.9925885796546936, + "logps/chosen": -79.2198486328125, + "logps/rejected": -46.28892517089844, + "loss": 0.5323, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2436386346817017, + "rewards/margins": -0.4868934154510498, + "rewards/rejected": 1.7305320501327515, + "step": 9337 + }, + { + "epoch": 1.52, + "learning_rate": 1.4607507773928809e-06, + "logits/chosen": -1.2702189683914185, + "logits/rejected": -1.2465672492980957, + "logps/chosen": -104.429443359375, + "logps/rejected": -78.33153533935547, + "loss": 0.1214, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.1597795486450195, + "rewards/margins": 1.4235613346099854, + "rewards/rejected": 3.736218214035034, + "step": 9338 + }, + { + "epoch": 1.52, + "learning_rate": 1.4598225593854747e-06, + "logits/chosen": -1.6989467144012451, + "logits/rejected": -1.7559795379638672, + "logps/chosen": -74.64450073242188, + "logps/rejected": -120.72212219238281, + "loss": 1.2504, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2345168590545654, + "rewards/margins": -1.764526605606079, + "rewards/rejected": 4.9990434646606445, + "step": 9339 + }, + { + "epoch": 1.52, + "learning_rate": 1.4588945859708343e-06, + "logits/chosen": -1.2585500478744507, + "logits/rejected": -1.1059130430221558, + "logps/chosen": -91.4284439086914, + "logps/rejected": -60.45714569091797, + "loss": 0.475, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.614197731018066, + "rewards/margins": 0.4100608825683594, + "rewards/rejected": 5.204136848449707, + "step": 9340 + }, + { + "epoch": 1.52, + "learning_rate": 1.45796685721307e-06, + "logits/chosen": -1.1334971189498901, + "logits/rejected": -1.0154746770858765, + "logps/chosen": -41.57370376586914, + "logps/rejected": -37.50337219238281, + "loss": 0.5031, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.702448606491089, + "rewards/margins": -0.3077404499053955, + "rewards/rejected": 3.0101890563964844, + "step": 9341 + }, + { + "epoch": 1.52, + "learning_rate": 1.4570393731762821e-06, + "logits/chosen": -1.313862919807434, + "logits/rejected": -1.2619023323059082, + "logps/chosen": -48.93982696533203, + "logps/rejected": -63.61429214477539, + "loss": 0.1148, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3780007362365723, + "rewards/margins": 1.5566045045852661, + "rewards/rejected": 1.8213962316513062, + "step": 9342 + }, + { + "epoch": 1.52, + "learning_rate": 1.4561121339245487e-06, + "logits/chosen": -1.0313478708267212, + "logits/rejected": -1.3014037609100342, + "logps/chosen": -35.526573181152344, + "logps/rejected": -29.65812873840332, + "loss": 0.3093, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.15193247795105, + "rewards/margins": 1.2094422578811646, + "rewards/rejected": 1.9424902200698853, + "step": 9343 + }, + { + "epoch": 1.52, + "learning_rate": 1.4551851395219352e-06, + "logits/chosen": -1.3356269598007202, + "logits/rejected": -1.2996138334274292, + "logps/chosen": -93.80583190917969, + "logps/rejected": -112.3072509765625, + "loss": 2.4462, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.530208110809326, + "rewards/margins": -3.2437024116516113, + "rewards/rejected": 9.773910522460938, + "step": 9344 + }, + { + "epoch": 1.52, + "learning_rate": 1.4542583900324863e-06, + "logits/chosen": -1.6293596029281616, + "logits/rejected": -1.627779245376587, + "logps/chosen": -74.87886047363281, + "logps/rejected": -101.91268920898438, + "loss": 1.163, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.838266134262085, + "rewards/margins": -1.5463454723358154, + "rewards/rejected": 4.3846116065979, + "step": 9345 + }, + { + "epoch": 1.52, + "learning_rate": 1.453331885520234e-06, + "logits/chosen": -1.281936526298523, + "logits/rejected": -1.129167079925537, + "logps/chosen": -113.07893371582031, + "logps/rejected": -70.57765197753906, + "loss": 0.0573, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.999220371246338, + "rewards/margins": 3.0313172340393066, + "rewards/rejected": 4.967903137207031, + "step": 9346 + }, + { + "epoch": 1.52, + "learning_rate": 1.4524056260491876e-06, + "logits/chosen": -0.558827817440033, + "logits/rejected": -0.5754197835922241, + "logps/chosen": -3.099822759628296, + "logps/rejected": -24.78302001953125, + "loss": 0.8063, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3954521417617798, + "rewards/margins": -0.4849376678466797, + "rewards/rejected": 0.8803898096084595, + "step": 9347 + }, + { + "epoch": 1.52, + "learning_rate": 1.4514796116833462e-06, + "logits/chosen": -1.2388639450073242, + "logits/rejected": -1.177099347114563, + "logps/chosen": -81.56793975830078, + "logps/rejected": -54.88098907470703, + "loss": 0.3047, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.309382915496826, + "rewards/margins": 1.3736751079559326, + "rewards/rejected": 2.9357078075408936, + "step": 9348 + }, + { + "epoch": 1.52, + "learning_rate": 1.450553842486686e-06, + "logits/chosen": -1.0161747932434082, + "logits/rejected": -1.0045344829559326, + "logps/chosen": -57.8586540222168, + "logps/rejected": -102.00056457519531, + "loss": 0.3673, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9108471274375916, + "rewards/margins": 0.1501293182373047, + "rewards/rejected": 0.7607178092002869, + "step": 9349 + }, + { + "epoch": 1.52, + "learning_rate": 1.4496283185231719e-06, + "logits/chosen": -1.2947032451629639, + "logits/rejected": -1.3111796379089355, + "logps/chosen": -78.28032684326172, + "logps/rejected": -114.2605209350586, + "loss": 0.2725, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.210226535797119, + "rewards/margins": 0.3381859064102173, + "rewards/rejected": 1.8720406293869019, + "step": 9350 + }, + { + "epoch": 1.52, + "learning_rate": 1.4487030398567447e-06, + "logits/chosen": -0.7437968254089355, + "logits/rejected": -0.7437968254089355, + "logps/chosen": -1.3344988822937012, + "logps/rejected": -1.3344988822937012, + "loss": 0.3477, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1934788078069687, + "rewards/margins": 0.0, + "rewards/rejected": 0.1934788078069687, + "step": 9351 + }, + { + "epoch": 1.52, + "learning_rate": 1.447778006551337e-06, + "logits/chosen": -0.9941132068634033, + "logits/rejected": -1.027696967124939, + "logps/chosen": -30.925996780395508, + "logps/rejected": -75.94569396972656, + "loss": 0.5992, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6149160265922546, + "rewards/margins": -0.5087305903434753, + "rewards/rejected": 1.12364661693573, + "step": 9352 + }, + { + "epoch": 1.52, + "learning_rate": 1.4468532186708562e-06, + "logits/chosen": -1.236746072769165, + "logits/rejected": -1.2632852792739868, + "logps/chosen": -122.8866195678711, + "logps/rejected": -160.76588439941406, + "loss": 2.9212, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.9698710441589355, + "rewards/margins": -5.8327460289001465, + "rewards/rejected": 10.802617073059082, + "step": 9353 + }, + { + "epoch": 1.52, + "learning_rate": 1.445928676279199e-06, + "logits/chosen": -1.391466498374939, + "logits/rejected": -1.3860119581222534, + "logps/chosen": -69.71163177490234, + "logps/rejected": -131.71380615234375, + "loss": 0.3328, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.365903615951538, + "rewards/margins": 0.07325363159179688, + "rewards/rejected": 2.292649984359741, + "step": 9354 + }, + { + "epoch": 1.52, + "learning_rate": 1.4450043794402397e-06, + "logits/chosen": -1.4268832206726074, + "logits/rejected": -1.2936152219772339, + "logps/chosen": -112.34602355957031, + "logps/rejected": -52.70232391357422, + "loss": 0.1, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.096093654632568, + "rewards/margins": 2.4373679161071777, + "rewards/rejected": 3.6587257385253906, + "step": 9355 + }, + { + "epoch": 1.52, + "learning_rate": 1.4440803282178417e-06, + "logits/chosen": -1.2854270935058594, + "logits/rejected": -1.273137092590332, + "logps/chosen": -46.510074615478516, + "logps/rejected": -87.30519104003906, + "loss": 0.4637, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6267621517181396, + "rewards/margins": 2.584501266479492, + "rewards/rejected": 1.042260766029358, + "step": 9356 + }, + { + "epoch": 1.52, + "learning_rate": 1.4431565226758453e-06, + "logits/chosen": -1.0113173723220825, + "logits/rejected": -0.8876267075538635, + "logps/chosen": -73.80191040039062, + "logps/rejected": -42.72060012817383, + "loss": 0.292, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5148162841796875, + "rewards/margins": 1.1013133525848389, + "rewards/rejected": 1.4135029315948486, + "step": 9357 + }, + { + "epoch": 1.52, + "learning_rate": 1.4422329628780795e-06, + "logits/chosen": -1.1164244413375854, + "logits/rejected": -1.113913893699646, + "logps/chosen": -19.477060317993164, + "logps/rejected": -4.801248073577881, + "loss": 0.8802, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5503015518188477, + "rewards/margins": 0.11433053016662598, + "rewards/rejected": 0.4359710216522217, + "step": 9358 + }, + { + "epoch": 1.52, + "learning_rate": 1.4413096488883505e-06, + "logits/chosen": -1.2817277908325195, + "logits/rejected": -1.1607756614685059, + "logps/chosen": -70.51049041748047, + "logps/rejected": -43.92364501953125, + "loss": 0.1335, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.24348521232605, + "rewards/margins": 1.267704725265503, + "rewards/rejected": 1.9757804870605469, + "step": 9359 + }, + { + "epoch": 1.52, + "learning_rate": 1.4403865807704532e-06, + "logits/chosen": -1.5826659202575684, + "logits/rejected": -1.4045292139053345, + "logps/chosen": -154.05526733398438, + "logps/rejected": -35.47429275512695, + "loss": 0.2154, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.733245849609375, + "rewards/margins": 4.984357833862305, + "rewards/rejected": 1.7488880157470703, + "step": 9360 + }, + { + "epoch": 1.52, + "learning_rate": 1.439463758588161e-06, + "logits/chosen": -1.4434196949005127, + "logits/rejected": -1.4384725093841553, + "logps/chosen": -176.32708740234375, + "logps/rejected": -166.665771484375, + "loss": 0.2422, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.443188667297363, + "rewards/margins": 0.49245262145996094, + "rewards/rejected": 8.950736045837402, + "step": 9361 + }, + { + "epoch": 1.52, + "learning_rate": 1.4385411824052343e-06, + "logits/chosen": -1.1427980661392212, + "logits/rejected": -1.1422851085662842, + "logps/chosen": -45.35997009277344, + "logps/rejected": -74.88262939453125, + "loss": 0.283, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.807415008544922, + "rewards/margins": 0.44266510009765625, + "rewards/rejected": 2.3647499084472656, + "step": 9362 + }, + { + "epoch": 1.52, + "learning_rate": 1.4376188522854111e-06, + "logits/chosen": -1.3613797426223755, + "logits/rejected": -1.2849054336547852, + "logps/chosen": -98.94441223144531, + "logps/rejected": -74.29412841796875, + "loss": 0.1683, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.118834018707275, + "rewards/margins": 3.2067947387695312, + "rewards/rejected": 3.912039279937744, + "step": 9363 + }, + { + "epoch": 1.52, + "learning_rate": 1.4366967682924194e-06, + "logits/chosen": -0.8561145663261414, + "logits/rejected": -0.8692525029182434, + "logps/chosen": -74.06703186035156, + "logps/rejected": -113.04263305664062, + "loss": 0.3487, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1039397716522217, + "rewards/margins": 1.725897192955017, + "rewards/rejected": 1.3780425786972046, + "step": 9364 + }, + { + "epoch": 1.52, + "learning_rate": 1.4357749304899632e-06, + "logits/chosen": -1.3158677816390991, + "logits/rejected": -1.2402418851852417, + "logps/chosen": -101.58763122558594, + "logps/rejected": -51.7832145690918, + "loss": 0.7188, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4574546813964844, + "rewards/margins": -1.1363911628723145, + "rewards/rejected": 3.593845844268799, + "step": 9365 + }, + { + "epoch": 1.52, + "learning_rate": 1.4348533389417356e-06, + "logits/chosen": -1.0757219791412354, + "logits/rejected": -1.089781403541565, + "logps/chosen": -91.51730346679688, + "logps/rejected": -107.50215911865234, + "loss": 0.1191, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6522157192230225, + "rewards/margins": 1.4455246925354004, + "rewards/rejected": 1.206691026687622, + "step": 9366 + }, + { + "epoch": 1.52, + "learning_rate": 1.4339319937114076e-06, + "logits/chosen": -0.8320407867431641, + "logits/rejected": -0.815946638584137, + "logps/chosen": -78.00387573242188, + "logps/rejected": -38.986854553222656, + "loss": 0.5678, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5147781372070312, + "rewards/margins": -0.6958444118499756, + "rewards/rejected": 2.210622549057007, + "step": 9367 + }, + { + "epoch": 1.52, + "learning_rate": 1.4330108948626376e-06, + "logits/chosen": -1.2048295736312866, + "logits/rejected": -1.019778847694397, + "logps/chosen": -79.69377136230469, + "logps/rejected": -24.39750862121582, + "loss": 0.0986, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9460785388946533, + "rewards/margins": 2.0070855617523193, + "rewards/rejected": 0.9389929175376892, + "step": 9368 + }, + { + "epoch": 1.52, + "learning_rate": 1.4320900424590623e-06, + "logits/chosen": -1.293900728225708, + "logits/rejected": -1.1559652090072632, + "logps/chosen": -124.26893615722656, + "logps/rejected": -42.69090270996094, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4048843383789062, + "rewards/margins": 4.081881523132324, + "rewards/rejected": -0.6769973635673523, + "step": 9369 + }, + { + "epoch": 1.52, + "learning_rate": 1.4311694365643048e-06, + "logits/chosen": -1.164567470550537, + "logits/rejected": -1.1466082334518433, + "logps/chosen": -77.15724182128906, + "logps/rejected": -49.57147979736328, + "loss": 0.9055, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9995254278182983, + "rewards/margins": -0.7217477560043335, + "rewards/rejected": 2.721273183822632, + "step": 9370 + }, + { + "epoch": 1.52, + "learning_rate": 1.430249077241972e-06, + "logits/chosen": -1.216544508934021, + "logits/rejected": -1.2327381372451782, + "logps/chosen": -64.41724395751953, + "logps/rejected": -51.60591506958008, + "loss": 0.6619, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8151008486747742, + "rewards/margins": -0.9763638377189636, + "rewards/rejected": 1.7914646863937378, + "step": 9371 + }, + { + "epoch": 1.52, + "learning_rate": 1.4293289645556496e-06, + "logits/chosen": -1.279098391532898, + "logits/rejected": -1.3745211362838745, + "logps/chosen": -55.339195251464844, + "logps/rejected": -135.2190704345703, + "loss": 0.438, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.468388319015503, + "rewards/margins": 0.21776866912841797, + "rewards/rejected": 3.250619649887085, + "step": 9372 + }, + { + "epoch": 1.52, + "learning_rate": 1.4284090985689115e-06, + "logits/chosen": -1.0105026960372925, + "logits/rejected": -1.0284550189971924, + "logps/chosen": -37.06821060180664, + "logps/rejected": -80.39115905761719, + "loss": 1.0811, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7039120197296143, + "rewards/margins": -1.88142728805542, + "rewards/rejected": 3.585339307785034, + "step": 9373 + }, + { + "epoch": 1.52, + "learning_rate": 1.4274894793453075e-06, + "logits/chosen": -1.460499882698059, + "logits/rejected": -1.3968698978424072, + "logps/chosen": -141.19808959960938, + "logps/rejected": -70.83343505859375, + "loss": 0.235, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.837431907653809, + "rewards/margins": 5.075563430786133, + "rewards/rejected": 3.7618682384490967, + "step": 9374 + }, + { + "epoch": 1.52, + "learning_rate": 1.4265701069483796e-06, + "logits/chosen": -1.165828824043274, + "logits/rejected": -0.8419855833053589, + "logps/chosen": -78.83648681640625, + "logps/rejected": -49.2225341796875, + "loss": 0.9587, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9678398370742798, + "rewards/margins": -0.10191571712493896, + "rewards/rejected": 2.0697555541992188, + "step": 9375 + }, + { + "epoch": 1.52, + "learning_rate": 1.4256509814416431e-06, + "logits/chosen": -1.4242339134216309, + "logits/rejected": -1.2794337272644043, + "logps/chosen": -67.09426879882812, + "logps/rejected": -33.87178421020508, + "loss": 0.0761, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.411346435546875, + "rewards/margins": 2.0696372985839844, + "rewards/rejected": 2.3417091369628906, + "step": 9376 + }, + { + "epoch": 1.52, + "learning_rate": 1.4247321028886046e-06, + "logits/chosen": -1.2470250129699707, + "logits/rejected": -1.3526026010513306, + "logps/chosen": -97.35110473632812, + "logps/rejected": -155.40150451660156, + "loss": 2.5511, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9304168224334717, + "rewards/margins": -4.970451354980469, + "rewards/rejected": 7.9008684158325195, + "step": 9377 + }, + { + "epoch": 1.52, + "learning_rate": 1.4238134713527467e-06, + "logits/chosen": -1.5439895391464233, + "logits/rejected": -1.6028392314910889, + "logps/chosen": -80.28659057617188, + "logps/rejected": -72.414794921875, + "loss": 0.3616, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.727922201156616, + "rewards/margins": 0.16850900650024414, + "rewards/rejected": 2.559413194656372, + "step": 9378 + }, + { + "epoch": 1.52, + "learning_rate": 1.4228950868975417e-06, + "logits/chosen": -1.0288550853729248, + "logits/rejected": -1.0278791189193726, + "logps/chosen": -5.114566326141357, + "logps/rejected": -1.5552845001220703, + "loss": 0.683, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.253950834274292, + "rewards/margins": 0.041092172265052795, + "rewards/rejected": 0.2128586620092392, + "step": 9379 + }, + { + "epoch": 1.52, + "learning_rate": 1.4219769495864371e-06, + "logits/chosen": -1.0170499086380005, + "logits/rejected": -1.0166358947753906, + "logps/chosen": -54.99209213256836, + "logps/rejected": -61.6009635925293, + "loss": 0.274, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0133023262023926, + "rewards/margins": 0.3859063386917114, + "rewards/rejected": 1.6273959875106812, + "step": 9380 + }, + { + "epoch": 1.52, + "learning_rate": 1.4210590594828722e-06, + "logits/chosen": -1.2992949485778809, + "logits/rejected": -1.4496560096740723, + "logps/chosen": -62.24421310424805, + "logps/rejected": -120.9885025024414, + "loss": 0.6473, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.8520724773406982, + "rewards/margins": -0.8452842235565186, + "rewards/rejected": 4.697356700897217, + "step": 9381 + }, + { + "epoch": 1.52, + "learning_rate": 1.4201414166502596e-06, + "logits/chosen": -1.0287585258483887, + "logits/rejected": -0.9774810075759888, + "logps/chosen": -67.88786315917969, + "logps/rejected": -72.9054183959961, + "loss": 0.1195, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0316505432128906, + "rewards/margins": 1.324864149093628, + "rewards/rejected": 1.7067863941192627, + "step": 9382 + }, + { + "epoch": 1.52, + "learning_rate": 1.4192240211520042e-06, + "logits/chosen": -1.2592624425888062, + "logits/rejected": -1.3036199808120728, + "logps/chosen": -73.98977661132812, + "logps/rejected": -77.3425521850586, + "loss": 0.2983, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0387306213378906, + "rewards/margins": 0.6907691955566406, + "rewards/rejected": 2.34796142578125, + "step": 9383 + }, + { + "epoch": 1.52, + "learning_rate": 1.4183068730514853e-06, + "logits/chosen": -1.3254038095474243, + "logits/rejected": -1.4213167428970337, + "logps/chosen": -70.05497741699219, + "logps/rejected": -85.75887298583984, + "loss": 1.7005, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.689434766769409, + "rewards/margins": -3.208184003829956, + "rewards/rejected": 5.897618770599365, + "step": 9384 + }, + { + "epoch": 1.52, + "learning_rate": 1.4173899724120733e-06, + "logits/chosen": -1.2517132759094238, + "logits/rejected": -1.3773659467697144, + "logps/chosen": -77.60617065429688, + "logps/rejected": -106.10033416748047, + "loss": 1.5505, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0858001708984375, + "rewards/margins": -2.313617706298828, + "rewards/rejected": 5.399417877197266, + "step": 9385 + }, + { + "epoch": 1.52, + "learning_rate": 1.4164733192971135e-06, + "logits/chosen": -1.386525273323059, + "logits/rejected": -1.3915035724639893, + "logps/chosen": -91.40873718261719, + "logps/rejected": -224.89093017578125, + "loss": 1.9273, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2304017543792725, + "rewards/margins": -3.741515874862671, + "rewards/rejected": 5.971917629241943, + "step": 9386 + }, + { + "epoch": 1.52, + "learning_rate": 1.415556913769941e-06, + "logits/chosen": -1.328993797302246, + "logits/rejected": -1.3161131143569946, + "logps/chosen": -67.95645141601562, + "logps/rejected": -34.988895416259766, + "loss": 2.1076, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.440930128097534, + "rewards/margins": 0.9658111333847046, + "rewards/rejected": 1.4751189947128296, + "step": 9387 + }, + { + "epoch": 1.52, + "learning_rate": 1.4146407558938695e-06, + "logits/chosen": -1.156219244003296, + "logits/rejected": -0.9963683485984802, + "logps/chosen": -44.46235656738281, + "logps/rejected": -27.761747360229492, + "loss": 0.0228, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5020179748535156, + "rewards/margins": 3.3663086891174316, + "rewards/rejected": 0.13570919632911682, + "step": 9388 + }, + { + "epoch": 1.52, + "learning_rate": 1.4137248457321972e-06, + "logits/chosen": -1.2519867420196533, + "logits/rejected": -1.318764567375183, + "logps/chosen": -69.98690795898438, + "logps/rejected": -117.8796615600586, + "loss": 0.3598, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.135484457015991, + "rewards/margins": -0.031883955001831055, + "rewards/rejected": 3.1673684120178223, + "step": 9389 + }, + { + "epoch": 1.52, + "learning_rate": 1.4128091833482021e-06, + "logits/chosen": -1.588045358657837, + "logits/rejected": -1.5174305438995361, + "logps/chosen": -130.51556396484375, + "logps/rejected": -36.399906158447266, + "loss": 0.1556, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.3297882080078125, + "rewards/margins": 1.6089000701904297, + "rewards/rejected": 3.720888137817383, + "step": 9390 + }, + { + "epoch": 1.52, + "learning_rate": 1.411893768805151e-06, + "logits/chosen": -1.2935118675231934, + "logits/rejected": -1.2935118675231934, + "logps/chosen": -78.45953369140625, + "logps/rejected": -78.45953369140625, + "loss": 0.4745, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.590938091278076, + "rewards/margins": 0.0, + "rewards/rejected": 4.590938091278076, + "step": 9391 + }, + { + "epoch": 1.52, + "learning_rate": 1.4109786021662908e-06, + "logits/chosen": -1.1651067733764648, + "logits/rejected": -1.0726025104522705, + "logps/chosen": -58.692649841308594, + "logps/rejected": -69.65122985839844, + "loss": 0.1213, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9315240383148193, + "rewards/margins": 1.3142861127853394, + "rewards/rejected": 1.61723792552948, + "step": 9392 + }, + { + "epoch": 1.52, + "learning_rate": 1.4100636834948478e-06, + "logits/chosen": -0.9904301166534424, + "logits/rejected": -0.994872510433197, + "logps/chosen": -15.305031776428223, + "logps/rejected": -2.571118116378784, + "loss": 0.6294, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.379218190908432, + "rewards/margins": 0.22606264054775238, + "rewards/rejected": 0.15315555036067963, + "step": 9393 + }, + { + "epoch": 1.52, + "learning_rate": 1.4091490128540374e-06, + "logits/chosen": -1.2782725095748901, + "logits/rejected": -1.3011364936828613, + "logps/chosen": -32.454376220703125, + "logps/rejected": -65.72410583496094, + "loss": 2.9899, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5928609371185303, + "rewards/margins": 0.6054463386535645, + "rewards/rejected": 1.9874145984649658, + "step": 9394 + }, + { + "epoch": 1.52, + "learning_rate": 1.4082345903070516e-06, + "logits/chosen": -1.0200144052505493, + "logits/rejected": -1.069048285484314, + "logps/chosen": -57.25672912597656, + "logps/rejected": -94.6261215209961, + "loss": 0.6412, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4833641052246094, + "rewards/margins": 1.046216607093811, + "rewards/rejected": 1.4371474981307983, + "step": 9395 + }, + { + "epoch": 1.53, + "learning_rate": 1.4073204159170723e-06, + "logits/chosen": -1.425003170967102, + "logits/rejected": -1.4678435325622559, + "logps/chosen": -42.16324996948242, + "logps/rejected": -88.21546936035156, + "loss": 0.5154, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8068958520889282, + "rewards/margins": -0.5840343236923218, + "rewards/rejected": 1.39093017578125, + "step": 9396 + }, + { + "epoch": 1.53, + "learning_rate": 1.4064064897472558e-06, + "logits/chosen": -1.062018871307373, + "logits/rejected": -1.0234586000442505, + "logps/chosen": -39.579856872558594, + "logps/rejected": -47.06378936767578, + "loss": 0.5406, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7425587177276611, + "rewards/margins": 0.08285939693450928, + "rewards/rejected": 1.6596993207931519, + "step": 9397 + }, + { + "epoch": 1.53, + "learning_rate": 1.4054928118607498e-06, + "logits/chosen": -1.1937175989151, + "logits/rejected": -1.1474095582962036, + "logps/chosen": -96.81165313720703, + "logps/rejected": -48.99032974243164, + "loss": 0.1382, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.149636268615723, + "rewards/margins": 1.1594135761260986, + "rewards/rejected": 3.990222692489624, + "step": 9398 + }, + { + "epoch": 1.53, + "learning_rate": 1.4045793823206772e-06, + "logits/chosen": -1.5007809400558472, + "logits/rejected": -1.339961051940918, + "logps/chosen": -109.16268920898438, + "logps/rejected": -97.1954116821289, + "loss": 0.028, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.966531276702881, + "rewards/margins": 3.975409507751465, + "rewards/rejected": 3.991121768951416, + "step": 9399 + }, + { + "epoch": 1.53, + "learning_rate": 1.4036662011901507e-06, + "logits/chosen": -0.8781846761703491, + "logits/rejected": -0.8910647034645081, + "logps/chosen": -124.28097534179688, + "logps/rejected": -62.68785858154297, + "loss": 0.4102, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.505087375640869, + "rewards/margins": 4.367392539978027, + "rewards/rejected": 1.1376945972442627, + "step": 9400 + }, + { + "epoch": 1.53, + "learning_rate": 1.4027532685322592e-06, + "logits/chosen": -1.4259265661239624, + "logits/rejected": -1.2741307020187378, + "logps/chosen": -104.60542297363281, + "logps/rejected": -68.38908386230469, + "loss": 0.1279, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.553898811340332, + "rewards/margins": 2.645211696624756, + "rewards/rejected": 4.908687114715576, + "step": 9401 + }, + { + "epoch": 1.53, + "learning_rate": 1.4018405844100814e-06, + "logits/chosen": -1.320568323135376, + "logits/rejected": -1.2983741760253906, + "logps/chosen": -95.44416809082031, + "logps/rejected": -118.66938781738281, + "loss": 0.0264, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.479656934738159, + "rewards/margins": 3.1554152965545654, + "rewards/rejected": -0.6757583618164062, + "step": 9402 + }, + { + "epoch": 1.53, + "learning_rate": 1.400928148886671e-06, + "logits/chosen": -1.3500179052352905, + "logits/rejected": -1.1744728088378906, + "logps/chosen": -124.11308288574219, + "logps/rejected": -53.735679626464844, + "loss": 0.1207, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.454847812652588, + "rewards/margins": 3.134648084640503, + "rewards/rejected": 3.320199728012085, + "step": 9403 + }, + { + "epoch": 1.53, + "learning_rate": 1.4000159620250724e-06, + "logits/chosen": -1.4577220678329468, + "logits/rejected": -1.520421028137207, + "logps/chosen": -112.84690856933594, + "logps/rejected": -81.34151458740234, + "loss": 0.2383, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.745979309082031, + "rewards/margins": 0.8225960731506348, + "rewards/rejected": 4.9233832359313965, + "step": 9404 + }, + { + "epoch": 1.53, + "learning_rate": 1.3991040238883063e-06, + "logits/chosen": -1.3319265842437744, + "logits/rejected": -1.2308945655822754, + "logps/chosen": -60.783836364746094, + "logps/rejected": -14.633650779724121, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.681674957275391, + "rewards/margins": 4.276553630828857, + "rewards/rejected": 1.4051213264465332, + "step": 9405 + }, + { + "epoch": 1.53, + "learning_rate": 1.3981923345393816e-06, + "logits/chosen": -1.0543220043182373, + "logits/rejected": -1.0539203882217407, + "logps/chosen": -3.1852171421051025, + "logps/rejected": -3.8408806324005127, + "loss": 0.5794, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.24722841382026672, + "rewards/margins": -0.06066665053367615, + "rewards/rejected": 0.30789506435394287, + "step": 9406 + }, + { + "epoch": 1.53, + "learning_rate": 1.397280894041284e-06, + "logits/chosen": -1.0718404054641724, + "logits/rejected": -1.1070497035980225, + "logps/chosen": -51.58831024169922, + "logps/rejected": -91.53575134277344, + "loss": 1.1899, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.776801347732544, + "rewards/margins": -2.2816383838653564, + "rewards/rejected": 4.0584397315979, + "step": 9407 + }, + { + "epoch": 1.53, + "learning_rate": 1.3963697024569894e-06, + "logits/chosen": -1.0463125705718994, + "logits/rejected": -1.052971601486206, + "logps/chosen": -54.991825103759766, + "logps/rejected": -88.44483947753906, + "loss": 0.7849, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4832592010498047, + "rewards/margins": -0.6927227973937988, + "rewards/rejected": 3.1759819984436035, + "step": 9408 + }, + { + "epoch": 1.53, + "learning_rate": 1.395458759849449e-06, + "logits/chosen": -1.3768011331558228, + "logits/rejected": -1.350155234336853, + "logps/chosen": -39.99947738647461, + "logps/rejected": -71.61566162109375, + "loss": 1.0852, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.175597906112671, + "rewards/margins": 0.04542064666748047, + "rewards/rejected": 2.1301772594451904, + "step": 9409 + }, + { + "epoch": 1.53, + "learning_rate": 1.394548066281603e-06, + "logits/chosen": -1.0631334781646729, + "logits/rejected": -0.9132211804389954, + "logps/chosen": -97.99166107177734, + "logps/rejected": -58.069026947021484, + "loss": 0.5989, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1310524940490723, + "rewards/margins": 1.032585620880127, + "rewards/rejected": 1.0984668731689453, + "step": 9410 + }, + { + "epoch": 1.53, + "learning_rate": 1.3936376218163694e-06, + "logits/chosen": -1.464415431022644, + "logits/rejected": -1.5484442710876465, + "logps/chosen": -71.00088500976562, + "logps/rejected": -117.9612045288086, + "loss": 1.6663, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.210528612136841, + "rewards/margins": -2.6347849369049072, + "rewards/rejected": 5.845313549041748, + "step": 9411 + }, + { + "epoch": 1.53, + "learning_rate": 1.3927274265166534e-06, + "logits/chosen": -1.4379067420959473, + "logits/rejected": -1.253900170326233, + "logps/chosen": -102.18992614746094, + "logps/rejected": -63.0343017578125, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.616615295410156, + "rewards/margins": 4.054149627685547, + "rewards/rejected": 4.562465667724609, + "step": 9412 + }, + { + "epoch": 1.53, + "learning_rate": 1.3918174804453388e-06, + "logits/chosen": -1.5207502841949463, + "logits/rejected": -1.5314048528671265, + "logps/chosen": -73.01200866699219, + "logps/rejected": -48.201942443847656, + "loss": 1.4773, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6006393432617188, + "rewards/margins": 1.9589462280273438, + "rewards/rejected": 0.641693115234375, + "step": 9413 + }, + { + "epoch": 1.53, + "learning_rate": 1.3909077836652968e-06, + "logits/chosen": -1.5006017684936523, + "logits/rejected": -1.4232581853866577, + "logps/chosen": -107.82426452636719, + "logps/rejected": -56.335166931152344, + "loss": 1.0945, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.478938341140747, + "rewards/margins": -0.20082783699035645, + "rewards/rejected": 2.6797661781311035, + "step": 9414 + }, + { + "epoch": 1.53, + "learning_rate": 1.3899983362393755e-06, + "logits/chosen": -1.3236318826675415, + "logits/rejected": -1.3595892190933228, + "logps/chosen": -113.19073486328125, + "logps/rejected": -119.09461212158203, + "loss": 0.4349, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.352122783660889, + "rewards/margins": -0.2594456672668457, + "rewards/rejected": 7.611568450927734, + "step": 9415 + }, + { + "epoch": 1.53, + "learning_rate": 1.3890891382304128e-06, + "logits/chosen": -1.4828102588653564, + "logits/rejected": -1.5143287181854248, + "logps/chosen": -123.30448913574219, + "logps/rejected": -75.73211669921875, + "loss": 0.7615, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.638267517089844, + "rewards/margins": 4.844624996185303, + "rewards/rejected": 2.793642520904541, + "step": 9416 + }, + { + "epoch": 1.53, + "learning_rate": 1.3881801897012225e-06, + "logits/chosen": -1.112221360206604, + "logits/rejected": -1.0417793989181519, + "logps/chosen": -39.22386169433594, + "logps/rejected": -58.80199432373047, + "loss": 0.1853, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5788025856018066, + "rewards/margins": 0.8186805248260498, + "rewards/rejected": 2.760122060775757, + "step": 9417 + }, + { + "epoch": 1.53, + "learning_rate": 1.3872714907146074e-06, + "logits/chosen": -1.55532705783844, + "logits/rejected": -1.6615527868270874, + "logps/chosen": -66.84529113769531, + "logps/rejected": -120.39350891113281, + "loss": 1.3705, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2871062755584717, + "rewards/margins": -2.6733672618865967, + "rewards/rejected": 4.960473537445068, + "step": 9418 + }, + { + "epoch": 1.53, + "learning_rate": 1.3863630413333463e-06, + "logits/chosen": -1.4254752397537231, + "logits/rejected": -1.1934970617294312, + "logps/chosen": -90.23646545410156, + "logps/rejected": -27.49115753173828, + "loss": 0.0201, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6566619873046875, + "rewards/margins": 4.348158359527588, + "rewards/rejected": -0.6914964914321899, + "step": 9419 + }, + { + "epoch": 1.53, + "learning_rate": 1.3854548416202084e-06, + "logits/chosen": -1.0752369165420532, + "logits/rejected": -1.1197692155838013, + "logps/chosen": -80.82574462890625, + "logps/rejected": -128.93170166015625, + "loss": 0.4047, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.777052402496338, + "rewards/margins": 0.9102174043655396, + "rewards/rejected": 1.8668349981307983, + "step": 9420 + }, + { + "epoch": 1.53, + "learning_rate": 1.3845468916379384e-06, + "logits/chosen": -1.0608278512954712, + "logits/rejected": -1.0384875535964966, + "logps/chosen": -62.64472198486328, + "logps/rejected": -56.775428771972656, + "loss": 0.2481, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2629616260528564, + "rewards/margins": 0.6067535877227783, + "rewards/rejected": 2.656208038330078, + "step": 9421 + }, + { + "epoch": 1.53, + "learning_rate": 1.3836391914492698e-06, + "logits/chosen": -1.1562079191207886, + "logits/rejected": -1.1562079191207886, + "logps/chosen": -37.53652572631836, + "logps/rejected": -37.53652572631836, + "loss": 0.6483, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.048886775970459, + "rewards/margins": 0.0, + "rewards/rejected": 4.048886775970459, + "step": 9422 + }, + { + "epoch": 1.53, + "learning_rate": 1.3827317411169127e-06, + "logits/chosen": -1.4552196264266968, + "logits/rejected": -1.2937958240509033, + "logps/chosen": -164.9473876953125, + "logps/rejected": -35.07184600830078, + "loss": 0.0591, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5115418434143066, + "rewards/margins": 2.993159532546997, + "rewards/rejected": 0.5183822512626648, + "step": 9423 + }, + { + "epoch": 1.53, + "learning_rate": 1.3818245407035675e-06, + "logits/chosen": -0.9890239834785461, + "logits/rejected": -0.9853448867797852, + "logps/chosen": -1.4454574584960938, + "logps/rejected": -1.6585330963134766, + "loss": 0.3968, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4212619364261627, + "rewards/margins": 0.13428297638893127, + "rewards/rejected": 0.28697896003723145, + "step": 9424 + }, + { + "epoch": 1.53, + "learning_rate": 1.3809175902719097e-06, + "logits/chosen": -0.9226194024085999, + "logits/rejected": -0.9124599099159241, + "logps/chosen": -13.369258880615234, + "logps/rejected": -12.1356840133667, + "loss": 0.5034, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.921543300151825, + "rewards/margins": -0.10467106103897095, + "rewards/rejected": 1.026214361190796, + "step": 9425 + }, + { + "epoch": 1.53, + "learning_rate": 1.3800108898846022e-06, + "logits/chosen": -0.9480141997337341, + "logits/rejected": -0.7639453411102295, + "logps/chosen": -89.0704345703125, + "logps/rejected": -32.55194091796875, + "loss": 0.1647, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4537994861602783, + "rewards/margins": 2.1190764904022217, + "rewards/rejected": 0.3347229063510895, + "step": 9426 + }, + { + "epoch": 1.53, + "learning_rate": 1.3791044396042908e-06, + "logits/chosen": -1.3812013864517212, + "logits/rejected": -1.3532538414001465, + "logps/chosen": -73.37432861328125, + "logps/rejected": -51.91338348388672, + "loss": 1.4771, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5185439586639404, + "rewards/margins": -0.4714837074279785, + "rewards/rejected": 2.990027666091919, + "step": 9427 + }, + { + "epoch": 1.53, + "learning_rate": 1.3781982394936e-06, + "logits/chosen": -1.3229159116744995, + "logits/rejected": -1.381378173828125, + "logps/chosen": -27.652809143066406, + "logps/rejected": -67.1307373046875, + "loss": 1.0205, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5212773084640503, + "rewards/margins": -1.7348917722702026, + "rewards/rejected": 3.256169080734253, + "step": 9428 + }, + { + "epoch": 1.53, + "learning_rate": 1.3772922896151424e-06, + "logits/chosen": -1.057782769203186, + "logits/rejected": -1.036291480064392, + "logps/chosen": -49.884849548339844, + "logps/rejected": -39.940391540527344, + "loss": 0.3675, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4731277227401733, + "rewards/margins": 0.4576537609100342, + "rewards/rejected": 1.0154739618301392, + "step": 9429 + }, + { + "epoch": 1.53, + "learning_rate": 1.3763865900315076e-06, + "logits/chosen": -1.2933017015457153, + "logits/rejected": -1.1952768564224243, + "logps/chosen": -143.6731414794922, + "logps/rejected": -35.99280548095703, + "loss": 0.3496, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.300876140594482, + "rewards/margins": 5.091436862945557, + "rewards/rejected": 0.2094394713640213, + "step": 9430 + }, + { + "epoch": 1.53, + "learning_rate": 1.3754811408052742e-06, + "logits/chosen": -1.436471700668335, + "logits/rejected": -1.40249502658844, + "logps/chosen": -64.75282287597656, + "logps/rejected": -35.13328170776367, + "loss": 0.4218, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.443666934967041, + "rewards/margins": -0.17308831214904785, + "rewards/rejected": 3.616755247116089, + "step": 9431 + }, + { + "epoch": 1.53, + "learning_rate": 1.3745759419989974e-06, + "logits/chosen": -1.3694887161254883, + "logits/rejected": -1.5544795989990234, + "logps/chosen": -93.97837829589844, + "logps/rejected": -120.41032409667969, + "loss": 0.5702, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.022682189941406, + "rewards/margins": -0.7239518165588379, + "rewards/rejected": 4.746634006500244, + "step": 9432 + }, + { + "epoch": 1.53, + "learning_rate": 1.3736709936752195e-06, + "logits/chosen": -1.6303300857543945, + "logits/rejected": -1.5347758531570435, + "logps/chosen": -62.174537658691406, + "logps/rejected": -27.724178314208984, + "loss": 0.0966, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.143813371658325, + "rewards/margins": 1.6427083015441895, + "rewards/rejected": 0.5011051297187805, + "step": 9433 + }, + { + "epoch": 1.53, + "learning_rate": 1.3727662958964627e-06, + "logits/chosen": -1.6030614376068115, + "logits/rejected": -1.5151255130767822, + "logps/chosen": -130.5396270751953, + "logps/rejected": -70.09884643554688, + "loss": 0.0381, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.330039978027344, + "rewards/margins": 2.899740695953369, + "rewards/rejected": 4.430299282073975, + "step": 9434 + }, + { + "epoch": 1.53, + "learning_rate": 1.3718618487252345e-06, + "logits/chosen": -1.4450099468231201, + "logits/rejected": -1.514840841293335, + "logps/chosen": -173.91229248046875, + "logps/rejected": -129.8572998046875, + "loss": 0.0179, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.209205627441406, + "rewards/margins": 3.3843917846679688, + "rewards/rejected": 1.8248138427734375, + "step": 9435 + }, + { + "epoch": 1.53, + "learning_rate": 1.3709576522240214e-06, + "logits/chosen": -1.503252387046814, + "logits/rejected": -1.4476577043533325, + "logps/chosen": -70.21524047851562, + "logps/rejected": -51.28419494628906, + "loss": 0.26, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.600379943847656, + "rewards/margins": 0.38585662841796875, + "rewards/rejected": 4.2145233154296875, + "step": 9436 + }, + { + "epoch": 1.53, + "learning_rate": 1.3700537064552982e-06, + "logits/chosen": -1.135989785194397, + "logits/rejected": -1.0592288970947266, + "logps/chosen": -61.36210632324219, + "logps/rejected": -39.36614990234375, + "loss": 0.9979, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.917378306388855, + "rewards/margins": -0.6838043928146362, + "rewards/rejected": 2.601182699203491, + "step": 9437 + }, + { + "epoch": 1.53, + "learning_rate": 1.369150011481515e-06, + "logits/chosen": -0.8603190779685974, + "logits/rejected": -0.8661143779754639, + "logps/chosen": -3.2284178733825684, + "logps/rejected": -5.920719623565674, + "loss": 0.5855, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42045098543167114, + "rewards/margins": 0.3158016800880432, + "rewards/rejected": 0.10464930534362793, + "step": 9438 + }, + { + "epoch": 1.53, + "learning_rate": 1.3682465673651124e-06, + "logits/chosen": -1.5524908304214478, + "logits/rejected": -1.5736653804779053, + "logps/chosen": -89.69373321533203, + "logps/rejected": -81.06120300292969, + "loss": 0.9352, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.934511661529541, + "rewards/margins": 0.3758981227874756, + "rewards/rejected": 2.5586135387420654, + "step": 9439 + }, + { + "epoch": 1.53, + "learning_rate": 1.367343374168506e-06, + "logits/chosen": -1.4054173231124878, + "logits/rejected": -1.3006120920181274, + "logps/chosen": -84.19712829589844, + "logps/rejected": -46.58891296386719, + "loss": 0.0904, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.214381694793701, + "rewards/margins": 2.2166199684143066, + "rewards/rejected": 1.997761607170105, + "step": 9440 + }, + { + "epoch": 1.53, + "learning_rate": 1.366440431954102e-06, + "logits/chosen": -1.2210723161697388, + "logits/rejected": -1.2329823970794678, + "logps/chosen": -21.170669555664062, + "logps/rejected": -91.62619018554688, + "loss": 2.7358, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6939854025840759, + "rewards/margins": -4.827065467834473, + "rewards/rejected": 5.521050930023193, + "step": 9441 + }, + { + "epoch": 1.53, + "learning_rate": 1.3655377407842813e-06, + "logits/chosen": -0.7989168167114258, + "logits/rejected": -0.7941614389419556, + "logps/chosen": -7.572758197784424, + "logps/rejected": -1.0680269002914429, + "loss": 0.3533, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3802984654903412, + "rewards/margins": 0.06663668155670166, + "rewards/rejected": 0.3136617839336395, + "step": 9442 + }, + { + "epoch": 1.53, + "learning_rate": 1.3646353007214148e-06, + "logits/chosen": -1.1425457000732422, + "logits/rejected": -0.9636358618736267, + "logps/chosen": -105.3025131225586, + "logps/rejected": -28.721498489379883, + "loss": 0.0558, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.898231029510498, + "rewards/margins": 2.156229019165039, + "rewards/rejected": 2.742002010345459, + "step": 9443 + }, + { + "epoch": 1.53, + "learning_rate": 1.363733111827849e-06, + "logits/chosen": -1.499712586402893, + "logits/rejected": -1.4474866390228271, + "logps/chosen": -129.37008666992188, + "logps/rejected": -30.99363136291504, + "loss": 0.8156, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2273499965667725, + "rewards/margins": 1.1935374736785889, + "rewards/rejected": 2.0338125228881836, + "step": 9444 + }, + { + "epoch": 1.53, + "learning_rate": 1.3628311741659205e-06, + "logits/chosen": -1.306434154510498, + "logits/rejected": -1.2010419368743896, + "logps/chosen": -82.60772705078125, + "logps/rejected": -19.185104370117188, + "loss": 0.1088, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6648224592208862, + "rewards/margins": 1.5476738214492798, + "rewards/rejected": 0.11714859306812286, + "step": 9445 + }, + { + "epoch": 1.53, + "learning_rate": 1.361929487797941e-06, + "logits/chosen": -1.4191031455993652, + "logits/rejected": -1.4507865905761719, + "logps/chosen": -20.187952041625977, + "logps/rejected": -53.49561309814453, + "loss": 1.4235, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8171156644821167, + "rewards/margins": 0.9242849946022034, + "rewards/rejected": 0.8928306698799133, + "step": 9446 + }, + { + "epoch": 1.53, + "learning_rate": 1.3610280527862118e-06, + "logits/chosen": -1.4213155508041382, + "logits/rejected": -1.473909616470337, + "logps/chosen": -85.96577453613281, + "logps/rejected": -143.76376342773438, + "loss": 0.7322, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.444947719573975, + "rewards/margins": -0.23072052001953125, + "rewards/rejected": 5.675668239593506, + "step": 9447 + }, + { + "epoch": 1.53, + "learning_rate": 1.3601268691930097e-06, + "logits/chosen": -0.8971258401870728, + "logits/rejected": -0.9194422960281372, + "logps/chosen": -52.76554870605469, + "logps/rejected": -57.372501373291016, + "loss": 0.6407, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9177253246307373, + "rewards/margins": -0.9223606586456299, + "rewards/rejected": 3.840085983276367, + "step": 9448 + }, + { + "epoch": 1.53, + "learning_rate": 1.3592259370806022e-06, + "logits/chosen": -1.4401236772537231, + "logits/rejected": -1.4186153411865234, + "logps/chosen": -89.51663970947266, + "logps/rejected": -66.95785522460938, + "loss": 1.763, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.066631317138672, + "rewards/margins": 1.0620476007461548, + "rewards/rejected": 1.004583716392517, + "step": 9449 + }, + { + "epoch": 1.53, + "learning_rate": 1.3583252565112315e-06, + "logits/chosen": -1.3967009782791138, + "logits/rejected": -1.3548314571380615, + "logps/chosen": -61.83190155029297, + "logps/rejected": -52.230743408203125, + "loss": 0.1947, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6767830848693848, + "rewards/margins": 0.7478424310684204, + "rewards/rejected": 1.9289406538009644, + "step": 9450 + }, + { + "epoch": 1.53, + "learning_rate": 1.3574248275471296e-06, + "logits/chosen": -1.1607246398925781, + "logits/rejected": -1.0401359796524048, + "logps/chosen": -70.84765625, + "logps/rejected": -138.8178253173828, + "loss": 1.176, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1196708679199219, + "rewards/margins": -1.8177604675292969, + "rewards/rejected": 2.9374313354492188, + "step": 9451 + }, + { + "epoch": 1.53, + "learning_rate": 1.3565246502505035e-06, + "logits/chosen": -1.144261360168457, + "logits/rejected": -1.144331455230713, + "logps/chosen": -102.50080871582031, + "logps/rejected": -97.66181182861328, + "loss": 0.3814, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.783570766448975, + "rewards/margins": 3.2976508140563965, + "rewards/rejected": 1.4859199523925781, + "step": 9452 + }, + { + "epoch": 1.53, + "learning_rate": 1.3556247246835512e-06, + "logits/chosen": -1.4441652297973633, + "logits/rejected": -1.462135910987854, + "logps/chosen": -108.00592041015625, + "logps/rejected": -139.85926818847656, + "loss": 0.1178, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.942892551422119, + "rewards/margins": 1.4211838245391846, + "rewards/rejected": 2.5217087268829346, + "step": 9453 + }, + { + "epoch": 1.53, + "learning_rate": 1.3547250509084453e-06, + "logits/chosen": -1.5607140064239502, + "logits/rejected": -1.5749863386154175, + "logps/chosen": -101.09059143066406, + "logps/rejected": -154.57760620117188, + "loss": 0.1482, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2879350185394287, + "rewards/margins": 1.1219162940979004, + "rewards/rejected": 2.1660187244415283, + "step": 9454 + }, + { + "epoch": 1.53, + "learning_rate": 1.3538256289873487e-06, + "logits/chosen": -1.1434683799743652, + "logits/rejected": -1.1330128908157349, + "logps/chosen": -55.52369689941406, + "logps/rejected": -85.72682189941406, + "loss": 0.0922, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.264011383056641, + "rewards/margins": 1.7053534984588623, + "rewards/rejected": 2.5586578845977783, + "step": 9455 + }, + { + "epoch": 1.53, + "learning_rate": 1.3529264589823982e-06, + "logits/chosen": -1.200110912322998, + "logits/rejected": -1.2325506210327148, + "logps/chosen": -55.37057876586914, + "logps/rejected": -69.9702377319336, + "loss": 0.6427, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.386077642440796, + "rewards/margins": -0.3081018924713135, + "rewards/rejected": 2.6941795349121094, + "step": 9456 + }, + { + "epoch": 1.53, + "learning_rate": 1.3520275409557226e-06, + "logits/chosen": -1.3664757013320923, + "logits/rejected": -1.4118448495864868, + "logps/chosen": -119.51760864257812, + "logps/rejected": -168.74893188476562, + "loss": 0.2779, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.235264539718628, + "rewards/margins": 0.5774184465408325, + "rewards/rejected": 1.6578460931777954, + "step": 9457 + }, + { + "epoch": 1.54, + "learning_rate": 1.3511288749694245e-06, + "logits/chosen": -1.0834825038909912, + "logits/rejected": -1.0834908485412598, + "logps/chosen": -5.978227138519287, + "logps/rejected": -2.6951918601989746, + "loss": 0.3659, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3059888482093811, + "rewards/margins": 0.01982647180557251, + "rewards/rejected": 0.2861623764038086, + "step": 9458 + }, + { + "epoch": 1.54, + "learning_rate": 1.350230461085597e-06, + "logits/chosen": -1.5395691394805908, + "logits/rejected": -1.5414596796035767, + "logps/chosen": -62.317466735839844, + "logps/rejected": -73.44624328613281, + "loss": 0.7972, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2489724159240723, + "rewards/margins": -1.0911331176757812, + "rewards/rejected": 4.3401055335998535, + "step": 9459 + }, + { + "epoch": 1.54, + "learning_rate": 1.3493322993663088e-06, + "logits/chosen": -1.2073590755462646, + "logits/rejected": -1.4196951389312744, + "logps/chosen": -85.47261810302734, + "logps/rejected": -37.600059509277344, + "loss": 0.0366, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0003821849823, + "rewards/margins": 2.643932342529297, + "rewards/rejected": 0.3564499020576477, + "step": 9460 + }, + { + "epoch": 1.54, + "learning_rate": 1.3484343898736169e-06, + "logits/chosen": -1.2743967771530151, + "logits/rejected": -1.3465690612792969, + "logps/chosen": -75.49549865722656, + "logps/rejected": -139.3934326171875, + "loss": 2.3695, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7582039833068848, + "rewards/margins": -3.591568946838379, + "rewards/rejected": 6.349772930145264, + "step": 9461 + }, + { + "epoch": 1.54, + "learning_rate": 1.3475367326695559e-06, + "logits/chosen": -1.379083514213562, + "logits/rejected": -1.294293761253357, + "logps/chosen": -64.5369873046875, + "logps/rejected": -57.28368377685547, + "loss": 0.2081, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.846694231033325, + "rewards/margins": 0.6783249378204346, + "rewards/rejected": 3.1683692932128906, + "step": 9462 + }, + { + "epoch": 1.54, + "learning_rate": 1.3466393278161465e-06, + "logits/chosen": -1.1156235933303833, + "logits/rejected": -1.0327908992767334, + "logps/chosen": -54.51689910888672, + "logps/rejected": -50.188880920410156, + "loss": 0.2972, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.231022596359253, + "rewards/margins": 0.6051627397537231, + "rewards/rejected": 1.6258598566055298, + "step": 9463 + }, + { + "epoch": 1.54, + "learning_rate": 1.345742175375393e-06, + "logits/chosen": -0.8692147731781006, + "logits/rejected": -0.8641396164894104, + "logps/chosen": -74.13250732421875, + "logps/rejected": -50.32087707519531, + "loss": 0.2689, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2187278270721436, + "rewards/margins": 1.073758602142334, + "rewards/rejected": 1.1449692249298096, + "step": 9464 + }, + { + "epoch": 1.54, + "learning_rate": 1.3448452754092766e-06, + "logits/chosen": -1.5961459875106812, + "logits/rejected": -1.6318626403808594, + "logps/chosen": -38.36332321166992, + "logps/rejected": -124.28703308105469, + "loss": 0.8969, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.842653274536133, + "rewards/margins": -0.7592711448669434, + "rewards/rejected": 4.601924419403076, + "step": 9465 + }, + { + "epoch": 1.54, + "learning_rate": 1.3439486279797682e-06, + "logits/chosen": -1.3153061866760254, + "logits/rejected": -1.3153061866760254, + "logps/chosen": -110.05768585205078, + "logps/rejected": -110.05768585205078, + "loss": 0.5147, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.268578290939331, + "rewards/margins": 0.0, + "rewards/rejected": 3.268578290939331, + "step": 9466 + }, + { + "epoch": 1.54, + "learning_rate": 1.343052233148814e-06, + "logits/chosen": -1.1161980628967285, + "logits/rejected": -1.0484445095062256, + "logps/chosen": -38.76061248779297, + "logps/rejected": -33.505409240722656, + "loss": 1.872, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1944644451141357, + "rewards/margins": 0.7389693260192871, + "rewards/rejected": 1.4554951190948486, + "step": 9467 + }, + { + "epoch": 1.54, + "learning_rate": 1.3421560909783493e-06, + "logits/chosen": -1.5227535963058472, + "logits/rejected": -1.6050461530685425, + "logps/chosen": -44.83687210083008, + "logps/rejected": -85.78462219238281, + "loss": 1.9842, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.046708583831787, + "rewards/margins": -3.7873687744140625, + "rewards/rejected": 7.83407735824585, + "step": 9468 + }, + { + "epoch": 1.54, + "learning_rate": 1.3412602015302866e-06, + "logits/chosen": -1.5504233837127686, + "logits/rejected": -1.6421657800674438, + "logps/chosen": -85.19632720947266, + "logps/rejected": -140.9355010986328, + "loss": 1.8468, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.39506459236145, + "rewards/margins": -3.10832142829895, + "rewards/rejected": 6.5033860206604, + "step": 9469 + }, + { + "epoch": 1.54, + "learning_rate": 1.3403645648665265e-06, + "logits/chosen": -1.094709873199463, + "logits/rejected": -1.0003440380096436, + "logps/chosen": -50.51628875732422, + "logps/rejected": -34.18476867675781, + "loss": 0.3037, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5584876537323, + "rewards/margins": 0.6548242568969727, + "rewards/rejected": 2.903663396835327, + "step": 9470 + }, + { + "epoch": 1.54, + "learning_rate": 1.3394691810489451e-06, + "logits/chosen": -1.264262318611145, + "logits/rejected": -1.1848329305648804, + "logps/chosen": -125.61810302734375, + "logps/rejected": -73.89043426513672, + "loss": 1.5711, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1859405040740967, + "rewards/margins": 0.35179662704467773, + "rewards/rejected": 2.834143877029419, + "step": 9471 + }, + { + "epoch": 1.54, + "learning_rate": 1.3385740501394084e-06, + "logits/chosen": -1.0296272039413452, + "logits/rejected": -1.0296272039413452, + "logps/chosen": -25.98090171813965, + "logps/rejected": -25.98090171813965, + "loss": 0.5308, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0726423263549805, + "rewards/margins": 0.0, + "rewards/rejected": 1.0726423263549805, + "step": 9472 + }, + { + "epoch": 1.54, + "learning_rate": 1.337679172199759e-06, + "logits/chosen": -1.3019636869430542, + "logits/rejected": -1.356840968132019, + "logps/chosen": -18.754243850708008, + "logps/rejected": -47.578346252441406, + "loss": 1.2774, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.700068950653076, + "rewards/margins": -1.9014334678649902, + "rewards/rejected": 4.601502418518066, + "step": 9473 + }, + { + "epoch": 1.54, + "learning_rate": 1.3367845472918272e-06, + "logits/chosen": -1.3899052143096924, + "logits/rejected": -1.144668459892273, + "logps/chosen": -74.10324096679688, + "logps/rejected": -118.89263916015625, + "loss": 1.6924, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.295001268386841, + "rewards/margins": -0.7635254859924316, + "rewards/rejected": 3.0585267543792725, + "step": 9474 + }, + { + "epoch": 1.54, + "learning_rate": 1.33589017547742e-06, + "logits/chosen": -1.180151343345642, + "logits/rejected": -0.9757340550422668, + "logps/chosen": -107.17510986328125, + "logps/rejected": -61.12283706665039, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.7762298583984375, + "rewards/margins": 4.060218334197998, + "rewards/rejected": 1.71601140499115, + "step": 9475 + }, + { + "epoch": 1.54, + "learning_rate": 1.334996056818333e-06, + "logits/chosen": -1.2651104927062988, + "logits/rejected": -1.4804123640060425, + "logps/chosen": -95.07715606689453, + "logps/rejected": -34.51882553100586, + "loss": 1.1308, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.681415557861328, + "rewards/margins": 2.4048805236816406, + "rewards/rejected": 0.2765350341796875, + "step": 9476 + }, + { + "epoch": 1.54, + "learning_rate": 1.334102191376338e-06, + "logits/chosen": -1.0541660785675049, + "logits/rejected": -1.0541660785675049, + "logps/chosen": -71.25845336914062, + "logps/rejected": -71.25845336914062, + "loss": 1.198, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1493470668792725, + "rewards/margins": 0.0, + "rewards/rejected": 2.1493470668792725, + "step": 9477 + }, + { + "epoch": 1.54, + "learning_rate": 1.3332085792131966e-06, + "logits/chosen": -1.110103964805603, + "logits/rejected": -1.121783971786499, + "logps/chosen": -39.410980224609375, + "logps/rejected": -47.545738220214844, + "loss": 1.2534, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.660531759262085, + "rewards/margins": -1.2423315048217773, + "rewards/rejected": 3.9028632640838623, + "step": 9478 + }, + { + "epoch": 1.54, + "learning_rate": 1.3323152203906447e-06, + "logits/chosen": -1.4718217849731445, + "logits/rejected": -1.4718217849731445, + "logps/chosen": -81.58378601074219, + "logps/rejected": -81.58378601074219, + "loss": 0.4232, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.520747423171997, + "rewards/margins": 0.0, + "rewards/rejected": 1.520747423171997, + "step": 9479 + }, + { + "epoch": 1.54, + "learning_rate": 1.3314221149704093e-06, + "logits/chosen": -1.2192081212997437, + "logits/rejected": -1.2204285860061646, + "logps/chosen": -106.11477661132812, + "logps/rejected": -91.85757446289062, + "loss": 0.3931, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6420257091522217, + "rewards/margins": 1.7993552684783936, + "rewards/rejected": 1.8426704406738281, + "step": 9480 + }, + { + "epoch": 1.54, + "learning_rate": 1.3305292630141914e-06, + "logits/chosen": -1.3127150535583496, + "logits/rejected": -1.204477310180664, + "logps/chosen": -59.09923553466797, + "logps/rejected": -20.059062957763672, + "loss": 0.256, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1063668727874756, + "rewards/margins": 2.782503604888916, + "rewards/rejected": 0.3238632380962372, + "step": 9481 + }, + { + "epoch": 1.54, + "learning_rate": 1.3296366645836823e-06, + "logits/chosen": -1.1830824613571167, + "logits/rejected": -1.1830824613571167, + "logps/chosen": -52.5589714050293, + "logps/rejected": -52.5589714050293, + "loss": 3.0204, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.242328405380249, + "rewards/margins": 0.0, + "rewards/rejected": 3.242328405380249, + "step": 9482 + }, + { + "epoch": 1.54, + "learning_rate": 1.3287443197405486e-06, + "logits/chosen": -1.5030498504638672, + "logits/rejected": -1.5244022607803345, + "logps/chosen": -73.17845153808594, + "logps/rejected": -46.72942352294922, + "loss": 0.6329, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3605728149414062, + "rewards/margins": -0.5545432567596436, + "rewards/rejected": 3.91511607170105, + "step": 9483 + }, + { + "epoch": 1.54, + "learning_rate": 1.3278522285464463e-06, + "logits/chosen": -1.7093579769134521, + "logits/rejected": -1.6223993301391602, + "logps/chosen": -104.43138122558594, + "logps/rejected": -132.69613647460938, + "loss": 0.3235, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.830845832824707, + "rewards/margins": 2.3653459548950195, + "rewards/rejected": 6.4654998779296875, + "step": 9484 + }, + { + "epoch": 1.54, + "learning_rate": 1.3269603910630073e-06, + "logits/chosen": -1.4884830713272095, + "logits/rejected": -1.253417730331421, + "logps/chosen": -76.16504669189453, + "logps/rejected": -28.950016021728516, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.134871006011963, + "rewards/margins": 4.419902324676514, + "rewards/rejected": 0.7149685025215149, + "step": 9485 + }, + { + "epoch": 1.54, + "learning_rate": 1.3260688073518523e-06, + "logits/chosen": -1.2100684642791748, + "logits/rejected": -1.2503407001495361, + "logps/chosen": -66.2646713256836, + "logps/rejected": -126.13119506835938, + "loss": 0.1686, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9752190113067627, + "rewards/margins": 1.1042518615722656, + "rewards/rejected": 0.8709670901298523, + "step": 9486 + }, + { + "epoch": 1.54, + "learning_rate": 1.3251774774745785e-06, + "logits/chosen": -1.0233333110809326, + "logits/rejected": -0.9944295883178711, + "logps/chosen": -87.95022583007812, + "logps/rejected": -62.46003723144531, + "loss": 2.4709, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7117928266525269, + "rewards/margins": -0.6807190179824829, + "rewards/rejected": 2.3925118446350098, + "step": 9487 + }, + { + "epoch": 1.54, + "learning_rate": 1.3242864014927704e-06, + "logits/chosen": -1.2959065437316895, + "logits/rejected": -1.302937626838684, + "logps/chosen": -79.39266967773438, + "logps/rejected": -109.4682846069336, + "loss": 0.1837, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.145001173019409, + "rewards/margins": 0.9755882024765015, + "rewards/rejected": 1.1694129705429077, + "step": 9488 + }, + { + "epoch": 1.54, + "learning_rate": 1.3233955794679908e-06, + "logits/chosen": -1.1862736940383911, + "logits/rejected": -1.249140977859497, + "logps/chosen": -116.88566589355469, + "logps/rejected": -91.66427612304688, + "loss": 0.6569, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.634837627410889, + "rewards/margins": 0.940983772277832, + "rewards/rejected": 6.693853855133057, + "step": 9489 + }, + { + "epoch": 1.54, + "learning_rate": 1.32250501146179e-06, + "logits/chosen": -1.1375874280929565, + "logits/rejected": -1.0183767080307007, + "logps/chosen": -46.26628494262695, + "logps/rejected": -32.66329574584961, + "loss": 0.2152, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.209199905395508, + "rewards/margins": 0.8171660900115967, + "rewards/rejected": 1.3920338153839111, + "step": 9490 + }, + { + "epoch": 1.54, + "learning_rate": 1.3216146975356942e-06, + "logits/chosen": -0.8868551254272461, + "logits/rejected": -0.9068584442138672, + "logps/chosen": -5.80454683303833, + "logps/rejected": -14.936509132385254, + "loss": 0.7051, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2611783444881439, + "rewards/margins": -0.7392787933349609, + "rewards/rejected": 1.0004571676254272, + "step": 9491 + }, + { + "epoch": 1.54, + "learning_rate": 1.320724637751219e-06, + "logits/chosen": -1.4602704048156738, + "logits/rejected": -1.5872597694396973, + "logps/chosen": -235.18154907226562, + "logps/rejected": -147.145751953125, + "loss": 0.0446, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.898831367492676, + "rewards/margins": 2.467965602874756, + "rewards/rejected": 7.43086576461792, + "step": 9492 + }, + { + "epoch": 1.54, + "learning_rate": 1.3198348321698568e-06, + "logits/chosen": -1.5464732646942139, + "logits/rejected": -1.3883130550384521, + "logps/chosen": -155.87374877929688, + "logps/rejected": -30.728065490722656, + "loss": 0.0287, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9669556617736816, + "rewards/margins": 2.835186004638672, + "rewards/rejected": 0.1317695677280426, + "step": 9493 + }, + { + "epoch": 1.54, + "learning_rate": 1.3189452808530866e-06, + "logits/chosen": -1.2205822467803955, + "logits/rejected": -1.1212413311004639, + "logps/chosen": -69.27641296386719, + "logps/rejected": -18.731487274169922, + "loss": 0.1605, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.695517063140869, + "rewards/margins": 2.522953987121582, + "rewards/rejected": 0.17256298661231995, + "step": 9494 + }, + { + "epoch": 1.54, + "learning_rate": 1.3180559838623674e-06, + "logits/chosen": -1.5735810995101929, + "logits/rejected": -1.6383086442947388, + "logps/chosen": -85.50834655761719, + "logps/rejected": -134.3011474609375, + "loss": 0.6177, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.366621494293213, + "rewards/margins": -0.5703856945037842, + "rewards/rejected": 3.937007188796997, + "step": 9495 + }, + { + "epoch": 1.54, + "learning_rate": 1.3171669412591393e-06, + "logits/chosen": -1.1522674560546875, + "logits/rejected": -1.1522674560546875, + "logps/chosen": -1.6386992931365967, + "logps/rejected": -1.6386992931365967, + "loss": 0.7263, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.32520219683647156, + "rewards/margins": 0.0, + "rewards/rejected": 0.32520219683647156, + "step": 9496 + }, + { + "epoch": 1.54, + "learning_rate": 1.316278153104829e-06, + "logits/chosen": -1.259992003440857, + "logits/rejected": -1.303453803062439, + "logps/chosen": -75.7350082397461, + "logps/rejected": -114.38772583007812, + "loss": 0.4823, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.664827823638916, + "rewards/margins": -0.4761390686035156, + "rewards/rejected": 4.140966892242432, + "step": 9497 + }, + { + "epoch": 1.54, + "learning_rate": 1.3153896194608423e-06, + "logits/chosen": -1.3401401042938232, + "logits/rejected": -1.3270374536514282, + "logps/chosen": -70.59072875976562, + "logps/rejected": -128.40916442871094, + "loss": 0.402, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.90643310546875, + "rewards/margins": 1.1232330799102783, + "rewards/rejected": 0.7832000851631165, + "step": 9498 + }, + { + "epoch": 1.54, + "learning_rate": 1.3145013403885699e-06, + "logits/chosen": -1.4400813579559326, + "logits/rejected": -1.4378609657287598, + "logps/chosen": -111.89822387695312, + "logps/rejected": -107.47332763671875, + "loss": 0.8502, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0015761852264404, + "rewards/margins": 0.8603415489196777, + "rewards/rejected": 1.1412346363067627, + "step": 9499 + }, + { + "epoch": 1.54, + "learning_rate": 1.3136133159493803e-06, + "logits/chosen": -1.3315147161483765, + "logits/rejected": -1.3615269660949707, + "logps/chosen": -57.127601623535156, + "logps/rejected": -105.27742767333984, + "loss": 0.1615, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0618278980255127, + "rewards/margins": 1.0171219110488892, + "rewards/rejected": 1.0447059869766235, + "step": 9500 + }, + { + "epoch": 1.54, + "learning_rate": 1.3127255462046318e-06, + "logits/chosen": -0.7988779544830322, + "logits/rejected": -0.7988779544830322, + "logps/chosen": -3.6452784538269043, + "logps/rejected": -3.6452784538269043, + "loss": 0.7294, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.980000913143158, + "rewards/margins": 0.0, + "rewards/rejected": 0.980000913143158, + "step": 9501 + }, + { + "epoch": 1.54, + "learning_rate": 1.311838031215657e-06, + "logits/chosen": -1.4128371477127075, + "logits/rejected": -1.4742119312286377, + "logps/chosen": -168.9597625732422, + "logps/rejected": -86.0319595336914, + "loss": 0.1639, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.958966255187988, + "rewards/margins": 0.9982123374938965, + "rewards/rejected": 5.960753917694092, + "step": 9502 + }, + { + "epoch": 1.54, + "learning_rate": 1.310950771043778e-06, + "logits/chosen": -1.5012319087982178, + "logits/rejected": -1.4418706893920898, + "logps/chosen": -59.17802810668945, + "logps/rejected": -26.87311553955078, + "loss": 0.1405, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.852174758911133, + "rewards/margins": 1.3547245264053345, + "rewards/rejected": 1.4974502325057983, + "step": 9503 + }, + { + "epoch": 1.54, + "learning_rate": 1.310063765750293e-06, + "logits/chosen": -1.420291543006897, + "logits/rejected": -1.5524992942810059, + "logps/chosen": -378.16925048828125, + "logps/rejected": -116.75492858886719, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.804193496704102, + "rewards/margins": 6.4687581062316895, + "rewards/rejected": 5.335435390472412, + "step": 9504 + }, + { + "epoch": 1.54, + "learning_rate": 1.309177015396489e-06, + "logits/chosen": -1.0236581563949585, + "logits/rejected": -1.0236581563949585, + "logps/chosen": -56.053611755371094, + "logps/rejected": -56.053611755371094, + "loss": 0.5231, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5101616382598877, + "rewards/margins": 0.0, + "rewards/rejected": 2.5101616382598877, + "step": 9505 + }, + { + "epoch": 1.54, + "learning_rate": 1.3082905200436291e-06, + "logits/chosen": -1.3740936517715454, + "logits/rejected": -1.3504884243011475, + "logps/chosen": -103.38937377929688, + "logps/rejected": -132.21209716796875, + "loss": 1.5109, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7217391729354858, + "rewards/margins": 0.14629054069519043, + "rewards/rejected": 1.5754486322402954, + "step": 9506 + }, + { + "epoch": 1.54, + "learning_rate": 1.307404279752964e-06, + "logits/chosen": -2.1467528343200684, + "logits/rejected": -2.2002201080322266, + "logps/chosen": -144.33567810058594, + "logps/rejected": -133.5545196533203, + "loss": 1.2563, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.839473247528076, + "rewards/margins": -1.6300749778747559, + "rewards/rejected": 8.469548225402832, + "step": 9507 + }, + { + "epoch": 1.54, + "learning_rate": 1.3065182945857218e-06, + "logits/chosen": -1.4834016561508179, + "logits/rejected": -1.3942077159881592, + "logps/chosen": -141.29473876953125, + "logps/rejected": -65.4884262084961, + "loss": 0.3711, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.580796718597412, + "rewards/margins": 3.4847288131713867, + "rewards/rejected": 2.0960679054260254, + "step": 9508 + }, + { + "epoch": 1.54, + "learning_rate": 1.3056325646031199e-06, + "logits/chosen": -1.2476195096969604, + "logits/rejected": -1.1104532480239868, + "logps/chosen": -81.03363037109375, + "logps/rejected": -55.28662109375, + "loss": 0.2222, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5281898975372314, + "rewards/margins": 0.8086395263671875, + "rewards/rejected": 2.719550371170044, + "step": 9509 + }, + { + "epoch": 1.54, + "learning_rate": 1.304747089866349e-06, + "logits/chosen": -1.1403443813323975, + "logits/rejected": -1.161847472190857, + "logps/chosen": -50.08771514892578, + "logps/rejected": -51.68447494506836, + "loss": 1.0331, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9459915161132812, + "rewards/margins": -1.7032535076141357, + "rewards/rejected": 3.649245023727417, + "step": 9510 + }, + { + "epoch": 1.54, + "learning_rate": 1.3038618704365914e-06, + "logits/chosen": -0.9754396677017212, + "logits/rejected": -0.9736550450325012, + "logps/chosen": -55.16402816772461, + "logps/rejected": -92.93973541259766, + "loss": 0.4721, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3823292255401611, + "rewards/margins": 0.6795864701271057, + "rewards/rejected": 0.7027427554130554, + "step": 9511 + }, + { + "epoch": 1.54, + "learning_rate": 1.302976906375003e-06, + "logits/chosen": -0.871181845664978, + "logits/rejected": -0.8600996136665344, + "logps/chosen": -53.55226135253906, + "logps/rejected": -100.6087875366211, + "loss": 0.4482, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.110127329826355, + "rewards/margins": 1.4902390241622925, + "rewards/rejected": -0.3801116943359375, + "step": 9512 + }, + { + "epoch": 1.54, + "learning_rate": 1.302092197742731e-06, + "logits/chosen": -1.3880172967910767, + "logits/rejected": -1.3838202953338623, + "logps/chosen": -37.653629302978516, + "logps/rejected": -150.28213500976562, + "loss": 2.9384, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.03582501411438, + "rewards/margins": -5.792168617248535, + "rewards/rejected": 7.827993869781494, + "step": 9513 + }, + { + "epoch": 1.54, + "learning_rate": 1.3012077446008969e-06, + "logits/chosen": -1.3436111211776733, + "logits/rejected": -1.2925145626068115, + "logps/chosen": -80.911865234375, + "logps/rejected": -35.96269989013672, + "loss": 0.1457, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6251633167266846, + "rewards/margins": 1.6279007196426392, + "rewards/rejected": -0.002737426897510886, + "step": 9514 + }, + { + "epoch": 1.54, + "learning_rate": 1.3003235470106102e-06, + "logits/chosen": -1.0875681638717651, + "logits/rejected": -1.0875681638717651, + "logps/chosen": -43.06468200683594, + "logps/rejected": -43.06468200683594, + "loss": 0.4099, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1186630725860596, + "rewards/margins": 0.0, + "rewards/rejected": 2.1186630725860596, + "step": 9515 + }, + { + "epoch": 1.54, + "learning_rate": 1.2994396050329589e-06, + "logits/chosen": -1.0608566999435425, + "logits/rejected": -1.0403261184692383, + "logps/chosen": -15.7572603225708, + "logps/rejected": -2.126005172729492, + "loss": 0.732, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.633695125579834, + "rewards/margins": 0.26128092408180237, + "rewards/rejected": 0.3724142014980316, + "step": 9516 + }, + { + "epoch": 1.54, + "learning_rate": 1.2985559187290153e-06, + "logits/chosen": -0.9841369390487671, + "logits/rejected": -0.9824698567390442, + "logps/chosen": -8.781292915344238, + "logps/rejected": -6.010558605194092, + "loss": 0.7995, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4892433285713196, + "rewards/margins": 0.0367165207862854, + "rewards/rejected": 0.4525268077850342, + "step": 9517 + }, + { + "epoch": 1.54, + "learning_rate": 1.2976724881598362e-06, + "logits/chosen": -1.3486405611038208, + "logits/rejected": -1.2273837327957153, + "logps/chosen": -106.04072570800781, + "logps/rejected": -57.92546844482422, + "loss": 0.0331, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.691628932952881, + "rewards/margins": 2.745544195175171, + "rewards/rejected": 2.94608473777771, + "step": 9518 + }, + { + "epoch": 1.55, + "learning_rate": 1.296789313386455e-06, + "logits/chosen": -0.7079339027404785, + "logits/rejected": -0.7079339027404785, + "logps/chosen": -4.3611273765563965, + "logps/rejected": -4.3611273765563965, + "loss": 0.356, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5749214887619019, + "rewards/margins": 0.0, + "rewards/rejected": 0.5749214887619019, + "step": 9519 + }, + { + "epoch": 1.55, + "learning_rate": 1.2959063944698935e-06, + "logits/chosen": -1.4768421649932861, + "logits/rejected": -1.390596628189087, + "logps/chosen": -47.76368713378906, + "logps/rejected": -36.679840087890625, + "loss": 1.1802, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2640061378479004, + "rewards/margins": 0.3302861452102661, + "rewards/rejected": 1.9337199926376343, + "step": 9520 + }, + { + "epoch": 1.55, + "learning_rate": 1.2950237314711501e-06, + "logits/chosen": -1.2293227910995483, + "logits/rejected": -1.259791374206543, + "logps/chosen": -69.5134506225586, + "logps/rejected": -67.1159439086914, + "loss": 0.7681, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7480888366699219, + "rewards/margins": -0.5313966274261475, + "rewards/rejected": 2.2794854640960693, + "step": 9521 + }, + { + "epoch": 1.55, + "learning_rate": 1.2941413244512113e-06, + "logits/chosen": -1.2484694719314575, + "logits/rejected": -1.2806274890899658, + "logps/chosen": -50.360374450683594, + "logps/rejected": -37.428565979003906, + "loss": 0.6201, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.182671308517456, + "rewards/margins": 0.754448652267456, + "rewards/rejected": 1.42822265625, + "step": 9522 + }, + { + "epoch": 1.55, + "learning_rate": 1.293259173471041e-06, + "logits/chosen": -1.3827451467514038, + "logits/rejected": -1.3951680660247803, + "logps/chosen": -64.22064971923828, + "logps/rejected": -117.63533020019531, + "loss": 0.6495, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.108651161193848, + "rewards/margins": -0.08423233032226562, + "rewards/rejected": 7.192883491516113, + "step": 9523 + }, + { + "epoch": 1.55, + "learning_rate": 1.2923772785915888e-06, + "logits/chosen": -1.4289907217025757, + "logits/rejected": -1.3871991634368896, + "logps/chosen": -59.75814437866211, + "logps/rejected": -21.290760040283203, + "loss": 0.1951, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6575840711593628, + "rewards/margins": 0.887306272983551, + "rewards/rejected": 0.7702777981758118, + "step": 9524 + }, + { + "epoch": 1.55, + "learning_rate": 1.2914956398737844e-06, + "logits/chosen": -1.159933090209961, + "logits/rejected": -1.1217896938323975, + "logps/chosen": -83.01814270019531, + "logps/rejected": -47.39784240722656, + "loss": 0.5467, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2462470531463623, + "rewards/margins": -0.18259215354919434, + "rewards/rejected": 3.4288392066955566, + "step": 9525 + }, + { + "epoch": 1.55, + "learning_rate": 1.290614257378542e-06, + "logits/chosen": -0.9707256555557251, + "logits/rejected": -1.1394933462142944, + "logps/chosen": -67.06228637695312, + "logps/rejected": -118.36503601074219, + "loss": 1.7166, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.95531165599823, + "rewards/margins": -3.040078639984131, + "rewards/rejected": 4.99539041519165, + "step": 9526 + }, + { + "epoch": 1.55, + "learning_rate": 1.2897331311667544e-06, + "logits/chosen": -1.548792839050293, + "logits/rejected": -1.4578053951263428, + "logps/chosen": -72.63277435302734, + "logps/rejected": -60.7889404296875, + "loss": 0.6594, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5620994567871094, + "rewards/margins": 1.2632453441619873, + "rewards/rejected": 2.298854112625122, + "step": 9527 + }, + { + "epoch": 1.55, + "learning_rate": 1.288852261299302e-06, + "logits/chosen": -1.0327523946762085, + "logits/rejected": -1.0327523946762085, + "logps/chosen": -34.918060302734375, + "logps/rejected": -34.918060302734375, + "loss": 0.3576, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0373427867889404, + "rewards/margins": 0.0, + "rewards/rejected": 2.0373427867889404, + "step": 9528 + }, + { + "epoch": 1.55, + "learning_rate": 1.287971647837042e-06, + "logits/chosen": -1.218752384185791, + "logits/rejected": -1.2686258554458618, + "logps/chosen": -39.61167907714844, + "logps/rejected": -51.21403121948242, + "loss": 2.4738, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8182777166366577, + "rewards/margins": -4.055222511291504, + "rewards/rejected": 5.873500347137451, + "step": 9529 + }, + { + "epoch": 1.55, + "learning_rate": 1.2870912908408184e-06, + "logits/chosen": -1.4360967874526978, + "logits/rejected": -1.2794655561447144, + "logps/chosen": -98.11421203613281, + "logps/rejected": -15.482028007507324, + "loss": 0.0236, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.001561164855957, + "rewards/margins": 6.000384330749512, + "rewards/rejected": 1.0011765956878662, + "step": 9530 + }, + { + "epoch": 1.55, + "learning_rate": 1.2862111903714542e-06, + "logits/chosen": -1.5725663900375366, + "logits/rejected": -1.6242088079452515, + "logps/chosen": -64.85708618164062, + "logps/rejected": -84.54341125488281, + "loss": 1.183, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.940274953842163, + "rewards/margins": -2.002617597579956, + "rewards/rejected": 4.942892551422119, + "step": 9531 + }, + { + "epoch": 1.55, + "learning_rate": 1.2853313464897572e-06, + "logits/chosen": -1.3689974546432495, + "logits/rejected": -1.3023693561553955, + "logps/chosen": -54.58502197265625, + "logps/rejected": -25.15157127380371, + "loss": 0.2383, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9044815301895142, + "rewards/margins": 0.5933643579483032, + "rewards/rejected": 0.31111717224121094, + "step": 9532 + }, + { + "epoch": 1.55, + "learning_rate": 1.2844517592565148e-06, + "logits/chosen": -1.5317838191986084, + "logits/rejected": -1.4228819608688354, + "logps/chosen": -152.09567260742188, + "logps/rejected": -54.25408935546875, + "loss": 0.1387, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.6988725662231445, + "rewards/margins": 2.592338800430298, + "rewards/rejected": 3.1065337657928467, + "step": 9533 + }, + { + "epoch": 1.55, + "learning_rate": 1.2835724287325001e-06, + "logits/chosen": -0.8795042634010315, + "logits/rejected": -0.9343719482421875, + "logps/chosen": -20.263545989990234, + "logps/rejected": -101.30122375488281, + "loss": 0.3881, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.35026779770851135, + "rewards/margins": -0.052898406982421875, + "rewards/rejected": 0.4031662046909332, + "step": 9534 + }, + { + "epoch": 1.55, + "learning_rate": 1.2826933549784637e-06, + "logits/chosen": -1.2041176557540894, + "logits/rejected": -1.2041176557540894, + "logps/chosen": -95.53536987304688, + "logps/rejected": -95.53536987304688, + "loss": 0.3536, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.789776802062988, + "rewards/margins": 0.0, + "rewards/rejected": 5.789776802062988, + "step": 9535 + }, + { + "epoch": 1.55, + "learning_rate": 1.281814538055145e-06, + "logits/chosen": -0.8644670844078064, + "logits/rejected": -0.8646304607391357, + "logps/chosen": -2.519366502761841, + "logps/rejected": -1.4173089265823364, + "loss": 0.5424, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.29438135027885437, + "rewards/margins": -0.12212809920310974, + "rewards/rejected": 0.4165094494819641, + "step": 9536 + }, + { + "epoch": 1.55, + "learning_rate": 1.2809359780232582e-06, + "logits/chosen": -1.135416030883789, + "logits/rejected": -1.0774405002593994, + "logps/chosen": -63.74458312988281, + "logps/rejected": -38.452903747558594, + "loss": 1.5329, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9933388233184814, + "rewards/margins": -0.49695587158203125, + "rewards/rejected": 2.4902946949005127, + "step": 9537 + }, + { + "epoch": 1.55, + "learning_rate": 1.2800576749435068e-06, + "logits/chosen": -1.4216303825378418, + "logits/rejected": -1.420114517211914, + "logps/chosen": -98.337646484375, + "logps/rejected": -43.72496032714844, + "loss": 1.0316, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8963592648506165, + "rewards/margins": -0.939990222454071, + "rewards/rejected": 1.8363494873046875, + "step": 9538 + }, + { + "epoch": 1.55, + "learning_rate": 1.27917962887657e-06, + "logits/chosen": -0.9234000444412231, + "logits/rejected": -0.8318729996681213, + "logps/chosen": -75.73167419433594, + "logps/rejected": -1.2260359525680542, + "loss": 0.2063, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0181297063827515, + "rewards/margins": 0.7528716325759888, + "rewards/rejected": 0.2652580440044403, + "step": 9539 + }, + { + "epoch": 1.55, + "learning_rate": 1.2783018398831154e-06, + "logits/chosen": -1.4876346588134766, + "logits/rejected": -1.4876346588134766, + "logps/chosen": -75.62125396728516, + "logps/rejected": -75.62125396728516, + "loss": 0.9242, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4840080738067627, + "rewards/margins": 0.0, + "rewards/rejected": 2.4840080738067627, + "step": 9540 + }, + { + "epoch": 1.55, + "learning_rate": 1.2774243080237876e-06, + "logits/chosen": -1.2099014520645142, + "logits/rejected": -1.2159576416015625, + "logps/chosen": -74.49406433105469, + "logps/rejected": -167.130859375, + "loss": 1.6337, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6294312477111816, + "rewards/margins": -3.212794303894043, + "rewards/rejected": 5.842225551605225, + "step": 9541 + }, + { + "epoch": 1.55, + "learning_rate": 1.2765470333592178e-06, + "logits/chosen": -1.487501621246338, + "logits/rejected": -1.583747148513794, + "logps/chosen": -73.79978942871094, + "logps/rejected": -37.129783630371094, + "loss": 0.0583, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.052725315093994, + "rewards/margins": 2.9640748500823975, + "rewards/rejected": 0.08865051716566086, + "step": 9542 + }, + { + "epoch": 1.55, + "learning_rate": 1.275670015950015e-06, + "logits/chosen": -0.9748768210411072, + "logits/rejected": -1.0090261697769165, + "logps/chosen": -2.255934238433838, + "logps/rejected": -40.305084228515625, + "loss": 0.3992, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9213947653770447, + "rewards/margins": 0.7270603179931641, + "rewards/rejected": 0.19433441758155823, + "step": 9543 + }, + { + "epoch": 1.55, + "learning_rate": 1.274793255856776e-06, + "logits/chosen": -1.2333316802978516, + "logits/rejected": -1.2255834341049194, + "logps/chosen": -157.1559600830078, + "logps/rejected": -118.0925521850586, + "loss": 0.0269, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.094071865081787, + "rewards/margins": 3.9407660961151123, + "rewards/rejected": 2.153305768966675, + "step": 9544 + }, + { + "epoch": 1.55, + "learning_rate": 1.273916753140073e-06, + "logits/chosen": -0.7724658846855164, + "logits/rejected": -0.796316921710968, + "logps/chosen": -95.2088851928711, + "logps/rejected": -70.29171752929688, + "loss": 0.9496, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6521804332733154, + "rewards/margins": 0.5138578414916992, + "rewards/rejected": 2.138322591781616, + "step": 9545 + }, + { + "epoch": 1.55, + "learning_rate": 1.2730405078604673e-06, + "logits/chosen": -1.2146559953689575, + "logits/rejected": -1.1755096912384033, + "logps/chosen": -71.04427337646484, + "logps/rejected": -57.93814468383789, + "loss": 0.6583, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.599740743637085, + "rewards/margins": 0.14472484588623047, + "rewards/rejected": 2.4550158977508545, + "step": 9546 + }, + { + "epoch": 1.55, + "learning_rate": 1.2721645200784966e-06, + "logits/chosen": -0.6895140409469604, + "logits/rejected": -0.6948874592781067, + "logps/chosen": -9.525310516357422, + "logps/rejected": -1.6490157842636108, + "loss": 0.83, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.379690557718277, + "rewards/margins": 0.13562145829200745, + "rewards/rejected": 0.24406909942626953, + "step": 9547 + }, + { + "epoch": 1.55, + "learning_rate": 1.2712887898546856e-06, + "logits/chosen": -1.1969983577728271, + "logits/rejected": -1.242570161819458, + "logps/chosen": -178.7248077392578, + "logps/rejected": -50.849395751953125, + "loss": 0.0671, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.907496929168701, + "rewards/margins": 2.2501046657562256, + "rewards/rejected": 2.6573922634124756, + "step": 9548 + }, + { + "epoch": 1.55, + "learning_rate": 1.2704133172495364e-06, + "logits/chosen": -1.2386349439620972, + "logits/rejected": -1.2129539251327515, + "logps/chosen": -25.49124526977539, + "logps/rejected": -32.59864044189453, + "loss": 0.3487, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3001248836517334, + "rewards/margins": 0.328723669052124, + "rewards/rejected": 1.9714012145996094, + "step": 9549 + }, + { + "epoch": 1.55, + "learning_rate": 1.2695381023235387e-06, + "logits/chosen": -1.2289308309555054, + "logits/rejected": -1.2253977060317993, + "logps/chosen": -46.87653732299805, + "logps/rejected": -99.93743896484375, + "loss": 0.2852, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.292238235473633, + "rewards/margins": 0.40733301639556885, + "rewards/rejected": 1.884905219078064, + "step": 9550 + }, + { + "epoch": 1.55, + "learning_rate": 1.2686631451371588e-06, + "logits/chosen": -1.1572248935699463, + "logits/rejected": -1.0973082780838013, + "logps/chosen": -47.104583740234375, + "logps/rejected": -64.02015686035156, + "loss": 0.4639, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2032363414764404, + "rewards/margins": -0.1205301284790039, + "rewards/rejected": 3.3237664699554443, + "step": 9551 + }, + { + "epoch": 1.55, + "learning_rate": 1.2677884457508505e-06, + "logits/chosen": -1.240828037261963, + "logits/rejected": -1.332444190979004, + "logps/chosen": -104.45945739746094, + "logps/rejected": -70.38352966308594, + "loss": 0.0743, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9564971923828125, + "rewards/margins": 2.850140333175659, + "rewards/rejected": 1.1063568592071533, + "step": 9552 + }, + { + "epoch": 1.55, + "learning_rate": 1.2669140042250449e-06, + "logits/chosen": -1.3362373113632202, + "logits/rejected": -1.3091439008712769, + "logps/chosen": -131.12353515625, + "logps/rejected": -71.47362518310547, + "loss": 0.1131, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4675567150115967, + "rewards/margins": 1.411614179611206, + "rewards/rejected": 1.0559425354003906, + "step": 9553 + }, + { + "epoch": 1.55, + "learning_rate": 1.266039820620159e-06, + "logits/chosen": -1.2255934476852417, + "logits/rejected": -1.2420475482940674, + "logps/chosen": -161.55860900878906, + "logps/rejected": -51.99872589111328, + "loss": 0.1323, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8063857555389404, + "rewards/margins": 1.5677070617675781, + "rewards/rejected": 2.2386786937713623, + "step": 9554 + }, + { + "epoch": 1.55, + "learning_rate": 1.2651658949965918e-06, + "logits/chosen": -1.1617484092712402, + "logits/rejected": -1.1211789846420288, + "logps/chosen": -32.657249450683594, + "logps/rejected": -47.49327850341797, + "loss": 0.7745, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.837591528892517, + "rewards/margins": -0.0020378828048706055, + "rewards/rejected": 1.8396294116973877, + "step": 9555 + }, + { + "epoch": 1.55, + "learning_rate": 1.2642922274147202e-06, + "logits/chosen": -1.4889084100723267, + "logits/rejected": -1.2574076652526855, + "logps/chosen": -91.19467163085938, + "logps/rejected": -75.6326904296875, + "loss": 0.0454, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.547051906585693, + "rewards/margins": 3.9713170528411865, + "rewards/rejected": 3.575734853744507, + "step": 9556 + }, + { + "epoch": 1.55, + "learning_rate": 1.2634188179349099e-06, + "logits/chosen": -1.0571318864822388, + "logits/rejected": -1.1371171474456787, + "logps/chosen": -29.91960906982422, + "logps/rejected": -49.011474609375, + "loss": 2.0917, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9150784015655518, + "rewards/margins": -3.4592998027801514, + "rewards/rejected": 5.374378204345703, + "step": 9557 + }, + { + "epoch": 1.55, + "learning_rate": 1.2625456666175017e-06, + "logits/chosen": -1.3533703088760376, + "logits/rejected": -1.3533703088760376, + "logps/chosen": -46.22544860839844, + "logps/rejected": -46.22544860839844, + "loss": 0.3928, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.1699066162109375, + "rewards/margins": 0.0, + "rewards/rejected": 4.1699066162109375, + "step": 9558 + }, + { + "epoch": 1.55, + "learning_rate": 1.261672773522825e-06, + "logits/chosen": -1.3705309629440308, + "logits/rejected": -1.2543206214904785, + "logps/chosen": -150.30508422851562, + "logps/rejected": -54.026126861572266, + "loss": 0.0707, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.015389919281006, + "rewards/margins": 2.8854684829711914, + "rewards/rejected": 2.1299214363098145, + "step": 9559 + }, + { + "epoch": 1.55, + "learning_rate": 1.2608001387111862e-06, + "logits/chosen": -0.7139326930046082, + "logits/rejected": -0.7150338292121887, + "logps/chosen": -2.6285829544067383, + "logps/rejected": -1.9341024160385132, + "loss": 0.4544, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2643182873725891, + "rewards/margins": -0.0025528371334075928, + "rewards/rejected": 0.2668711245059967, + "step": 9560 + }, + { + "epoch": 1.55, + "learning_rate": 1.259927762242879e-06, + "logits/chosen": -1.0957762002944946, + "logits/rejected": -1.0890684127807617, + "logps/chosen": -60.12036895751953, + "logps/rejected": -58.504432678222656, + "loss": 0.9499, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7504372596740723, + "rewards/margins": -0.5999267101287842, + "rewards/rejected": 3.3503639698028564, + "step": 9561 + }, + { + "epoch": 1.55, + "learning_rate": 1.2590556441781725e-06, + "logits/chosen": -1.3029721975326538, + "logits/rejected": -1.2461702823638916, + "logps/chosen": -129.02699279785156, + "logps/rejected": -57.84223556518555, + "loss": 0.239, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.267425537109375, + "rewards/margins": 0.9666683673858643, + "rewards/rejected": 2.3007571697235107, + "step": 9562 + }, + { + "epoch": 1.55, + "learning_rate": 1.2581837845773253e-06, + "logits/chosen": -1.494354486465454, + "logits/rejected": -1.4974232912063599, + "logps/chosen": -100.10189819335938, + "logps/rejected": -90.42312622070312, + "loss": 1.5652, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9673064947128296, + "rewards/margins": -2.1576476097106934, + "rewards/rejected": 4.1249542236328125, + "step": 9563 + }, + { + "epoch": 1.55, + "learning_rate": 1.257312183500572e-06, + "logits/chosen": -1.3309670686721802, + "logits/rejected": -1.267757534980774, + "logps/chosen": -70.77090454101562, + "logps/rejected": -75.6199951171875, + "loss": 0.2648, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.602215528488159, + "rewards/margins": 0.9126639366149902, + "rewards/rejected": 1.689551591873169, + "step": 9564 + }, + { + "epoch": 1.55, + "learning_rate": 1.2564408410081347e-06, + "logits/chosen": -1.3644726276397705, + "logits/rejected": -1.2834545373916626, + "logps/chosen": -55.969879150390625, + "logps/rejected": -56.329647064208984, + "loss": 0.1698, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9427506923675537, + "rewards/margins": 1.6726040840148926, + "rewards/rejected": 1.2701466083526611, + "step": 9565 + }, + { + "epoch": 1.55, + "learning_rate": 1.2555697571602116e-06, + "logits/chosen": -1.5731327533721924, + "logits/rejected": -1.621977686882019, + "logps/chosen": -131.1373291015625, + "logps/rejected": -29.810617446899414, + "loss": 0.0885, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.333763122558594, + "rewards/margins": 1.8511707782745361, + "rewards/rejected": 2.4825923442840576, + "step": 9566 + }, + { + "epoch": 1.55, + "learning_rate": 1.25469893201699e-06, + "logits/chosen": -1.2883057594299316, + "logits/rejected": -1.4531307220458984, + "logps/chosen": -173.55828857421875, + "logps/rejected": -216.86276245117188, + "loss": 0.8192, + "rewards/accuracies": 0.0, + "rewards/chosen": 8.73223876953125, + "rewards/margins": -1.2342681884765625, + "rewards/rejected": 9.966506958007812, + "step": 9567 + }, + { + "epoch": 1.55, + "learning_rate": 1.2538283656386318e-06, + "logits/chosen": -1.279332160949707, + "logits/rejected": -1.2435133457183838, + "logps/chosen": -69.57085418701172, + "logps/rejected": -32.50283432006836, + "loss": 0.077, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.410518646240234, + "rewards/margins": 2.2426085472106934, + "rewards/rejected": 2.167910099029541, + "step": 9568 + }, + { + "epoch": 1.55, + "learning_rate": 1.252958058085289e-06, + "logits/chosen": -1.556152582168579, + "logits/rejected": -1.4140359163284302, + "logps/chosen": -83.79203796386719, + "logps/rejected": -45.68019104003906, + "loss": 0.0911, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.923153877258301, + "rewards/margins": 4.944300651550293, + "rewards/rejected": 2.9788529872894287, + "step": 9569 + }, + { + "epoch": 1.55, + "learning_rate": 1.2520880094170878e-06, + "logits/chosen": -1.4156031608581543, + "logits/rejected": -1.4127006530761719, + "logps/chosen": -121.25213623046875, + "logps/rejected": -109.25943756103516, + "loss": 0.0316, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.958535671234131, + "rewards/margins": 2.936288833618164, + "rewards/rejected": 5.022246837615967, + "step": 9570 + }, + { + "epoch": 1.55, + "learning_rate": 1.2512182196941436e-06, + "logits/chosen": -1.1890472173690796, + "logits/rejected": -0.9701533317565918, + "logps/chosen": -72.0715560913086, + "logps/rejected": -38.54022979736328, + "loss": 0.0872, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.49459981918335, + "rewards/margins": 2.2777788639068604, + "rewards/rejected": 2.2168209552764893, + "step": 9571 + }, + { + "epoch": 1.55, + "learning_rate": 1.2503486889765476e-06, + "logits/chosen": -1.5444937944412231, + "logits/rejected": -1.5076017379760742, + "logps/chosen": -23.589950561523438, + "logps/rejected": -34.85933303833008, + "loss": 0.1373, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0274970531463623, + "rewards/margins": 1.1940597295761108, + "rewards/rejected": 1.8334373235702515, + "step": 9572 + }, + { + "epoch": 1.55, + "learning_rate": 1.2494794173243791e-06, + "logits/chosen": -1.2325153350830078, + "logits/rejected": -1.2054380178451538, + "logps/chosen": -53.859859466552734, + "logps/rejected": -78.27215576171875, + "loss": 0.4244, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7787907123565674, + "rewards/margins": -0.07338213920593262, + "rewards/rejected": 2.8521728515625, + "step": 9573 + }, + { + "epoch": 1.55, + "learning_rate": 1.2486104047976937e-06, + "logits/chosen": -1.2338223457336426, + "logits/rejected": -1.180209994316101, + "logps/chosen": -73.95294189453125, + "logps/rejected": -51.52935028076172, + "loss": 0.5616, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4153000116348267, + "rewards/margins": -0.5057357549667358, + "rewards/rejected": 1.9210357666015625, + "step": 9574 + }, + { + "epoch": 1.55, + "learning_rate": 1.247741651456535e-06, + "logits/chosen": -1.1600937843322754, + "logits/rejected": -1.2301372289657593, + "logps/chosen": -62.959381103515625, + "logps/rejected": -86.37889099121094, + "loss": 0.859, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.135883331298828, + "rewards/margins": -1.5043649673461914, + "rewards/rejected": 4.6402482986450195, + "step": 9575 + }, + { + "epoch": 1.55, + "learning_rate": 1.2468731573609222e-06, + "logits/chosen": -1.243038296699524, + "logits/rejected": -1.2318724393844604, + "logps/chosen": -76.4766616821289, + "logps/rejected": -133.39939880371094, + "loss": 0.1453, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1936187744140625, + "rewards/margins": 1.098963975906372, + "rewards/rejected": 2.0946547985076904, + "step": 9576 + }, + { + "epoch": 1.55, + "learning_rate": 1.2460049225708637e-06, + "logits/chosen": -1.4616307020187378, + "logits/rejected": -1.375382661819458, + "logps/chosen": -183.3968048095703, + "logps/rejected": -78.99600982666016, + "loss": 0.2635, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5063767433166504, + "rewards/margins": 0.9798592329025269, + "rewards/rejected": 1.5265175104141235, + "step": 9577 + }, + { + "epoch": 1.55, + "learning_rate": 1.2451369471463426e-06, + "logits/chosen": -1.2357710599899292, + "logits/rejected": -1.197176456451416, + "logps/chosen": -32.60920333862305, + "logps/rejected": -33.252628326416016, + "loss": 0.7695, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7838242053985596, + "rewards/margins": -0.3379337787628174, + "rewards/rejected": 2.121757984161377, + "step": 9578 + }, + { + "epoch": 1.55, + "learning_rate": 1.244269231147331e-06, + "logits/chosen": -1.0897482633590698, + "logits/rejected": -1.0810445547103882, + "logps/chosen": -49.064857482910156, + "logps/rejected": -76.01278686523438, + "loss": 1.5936, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.963808536529541, + "rewards/margins": -0.5317573547363281, + "rewards/rejected": 3.495565891265869, + "step": 9579 + }, + { + "epoch": 1.55, + "learning_rate": 1.2434017746337773e-06, + "logits/chosen": -1.2593154907226562, + "logits/rejected": -1.2192953824996948, + "logps/chosen": -45.71648406982422, + "logps/rejected": -80.7442626953125, + "loss": 1.7639, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4311962127685547, + "rewards/margins": 0.4842258095741272, + "rewards/rejected": 0.9469704031944275, + "step": 9580 + }, + { + "epoch": 1.56, + "learning_rate": 1.2425345776656166e-06, + "logits/chosen": -0.976535975933075, + "logits/rejected": -1.0001370906829834, + "logps/chosen": -61.76016616821289, + "logps/rejected": -103.17723083496094, + "loss": 0.5822, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5986804962158203, + "rewards/margins": -0.5971782207489014, + "rewards/rejected": 2.1958587169647217, + "step": 9581 + }, + { + "epoch": 1.56, + "learning_rate": 1.2416676403027622e-06, + "logits/chosen": -1.1968611478805542, + "logits/rejected": -0.9555889964103699, + "logps/chosen": -55.04931640625, + "logps/rejected": -54.10845947265625, + "loss": 0.3089, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.607943058013916, + "rewards/margins": 0.16202783584594727, + "rewards/rejected": 2.4459152221679688, + "step": 9582 + }, + { + "epoch": 1.56, + "learning_rate": 1.2408009626051137e-06, + "logits/chosen": -1.135032296180725, + "logits/rejected": -1.0928475856781006, + "logps/chosen": -75.48793029785156, + "logps/rejected": -140.6766815185547, + "loss": 0.2845, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9190919399261475, + "rewards/margins": 0.26703953742980957, + "rewards/rejected": 2.652052402496338, + "step": 9583 + }, + { + "epoch": 1.56, + "learning_rate": 1.2399345446325472e-06, + "logits/chosen": -1.0620383024215698, + "logits/rejected": -1.0918688774108887, + "logps/chosen": -42.27737808227539, + "logps/rejected": -56.047813415527344, + "loss": 0.2609, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0091958045959473, + "rewards/margins": 0.49482274055480957, + "rewards/rejected": 1.5143730640411377, + "step": 9584 + }, + { + "epoch": 1.56, + "learning_rate": 1.2390683864449271e-06, + "logits/chosen": -0.8927614688873291, + "logits/rejected": -0.8910417556762695, + "logps/chosen": -16.196758270263672, + "logps/rejected": -16.049663543701172, + "loss": 1.4724, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5756378173828125, + "rewards/margins": 0.03475457429885864, + "rewards/rejected": 0.5408832430839539, + "step": 9585 + }, + { + "epoch": 1.56, + "learning_rate": 1.2382024881020937e-06, + "logits/chosen": -1.8387364149093628, + "logits/rejected": -1.6298301219940186, + "logps/chosen": -112.10443115234375, + "logps/rejected": -14.712077140808105, + "loss": 1.9522, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.144494533538818, + "rewards/margins": 6.1211066246032715, + "rewards/rejected": 1.0233877897262573, + "step": 9586 + }, + { + "epoch": 1.56, + "learning_rate": 1.2373368496638756e-06, + "logits/chosen": -1.4423260688781738, + "logits/rejected": -1.426884651184082, + "logps/chosen": -90.6046371459961, + "logps/rejected": -82.86055755615234, + "loss": 0.3475, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0910727977752686, + "rewards/margins": 1.3517241477966309, + "rewards/rejected": 1.7393486499786377, + "step": 9587 + }, + { + "epoch": 1.56, + "learning_rate": 1.2364714711900766e-06, + "logits/chosen": -1.3873391151428223, + "logits/rejected": -1.2723612785339355, + "logps/chosen": -130.0368194580078, + "logps/rejected": -115.93013763427734, + "loss": 1.7202, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3365676403045654, + "rewards/margins": -3.1206138134002686, + "rewards/rejected": 5.457181453704834, + "step": 9588 + }, + { + "epoch": 1.56, + "learning_rate": 1.2356063527404883e-06, + "logits/chosen": -1.3164379596710205, + "logits/rejected": -1.1701890230178833, + "logps/chosen": -175.58734130859375, + "logps/rejected": -76.8480224609375, + "loss": 0.0214, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.5129976272583, + "rewards/margins": 3.4623565673828125, + "rewards/rejected": 6.050641059875488, + "step": 9589 + }, + { + "epoch": 1.56, + "learning_rate": 1.2347414943748836e-06, + "logits/chosen": -1.0356395244598389, + "logits/rejected": -1.013668417930603, + "logps/chosen": -51.37147521972656, + "logps/rejected": -81.6985092163086, + "loss": 1.1267, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.058274984359741, + "rewards/margins": 0.1884545087814331, + "rewards/rejected": 1.869820475578308, + "step": 9590 + }, + { + "epoch": 1.56, + "learning_rate": 1.233876896153013e-06, + "logits/chosen": -1.0719618797302246, + "logits/rejected": -1.0906343460083008, + "logps/chosen": -24.77939224243164, + "logps/rejected": -47.723819732666016, + "loss": 0.4094, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8361202478408813, + "rewards/margins": -0.22253036499023438, + "rewards/rejected": 1.0586506128311157, + "step": 9591 + }, + { + "epoch": 1.56, + "learning_rate": 1.2330125581346148e-06, + "logits/chosen": -1.229550838470459, + "logits/rejected": -1.2731401920318604, + "logps/chosen": -56.60594177246094, + "logps/rejected": -72.42718505859375, + "loss": 0.8695, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.733976125717163, + "rewards/margins": -0.7708709239959717, + "rewards/rejected": 3.5048470497131348, + "step": 9592 + }, + { + "epoch": 1.56, + "learning_rate": 1.2321484803794038e-06, + "logits/chosen": -1.3706408739089966, + "logits/rejected": -1.3165977001190186, + "logps/chosen": -59.916404724121094, + "logps/rejected": -53.21907043457031, + "loss": 0.6411, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.498253583908081, + "rewards/margins": -0.6447570323944092, + "rewards/rejected": 4.14301061630249, + "step": 9593 + }, + { + "epoch": 1.56, + "learning_rate": 1.2312846629470826e-06, + "logits/chosen": -1.063577651977539, + "logits/rejected": -1.0478883981704712, + "logps/chosen": -77.90530395507812, + "logps/rejected": -90.06161499023438, + "loss": 0.559, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2185592651367188, + "rewards/margins": -0.10286879539489746, + "rewards/rejected": 3.321428060531616, + "step": 9594 + }, + { + "epoch": 1.56, + "learning_rate": 1.2304211058973297e-06, + "logits/chosen": -1.158164143562317, + "logits/rejected": -1.158164143562317, + "logps/chosen": -90.60835266113281, + "logps/rejected": -90.60835266113281, + "loss": 0.3645, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2200562953948975, + "rewards/margins": 0.0, + "rewards/rejected": 3.2200562953948975, + "step": 9595 + }, + { + "epoch": 1.56, + "learning_rate": 1.229557809289812e-06, + "logits/chosen": -1.3021348714828491, + "logits/rejected": -1.2391548156738281, + "logps/chosen": -108.98397827148438, + "logps/rejected": -109.46070098876953, + "loss": 0.0392, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.6854753494262695, + "rewards/margins": 3.7984094619750977, + "rewards/rejected": 3.887065887451172, + "step": 9596 + }, + { + "epoch": 1.56, + "learning_rate": 1.2286947731841714e-06, + "logits/chosen": -1.6039961576461792, + "logits/rejected": -1.5548739433288574, + "logps/chosen": -66.2894287109375, + "logps/rejected": -86.1148452758789, + "loss": 0.2414, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.791165351867676, + "rewards/margins": 0.5895547866821289, + "rewards/rejected": 4.201610565185547, + "step": 9597 + }, + { + "epoch": 1.56, + "learning_rate": 1.2278319976400393e-06, + "logits/chosen": -1.2923351526260376, + "logits/rejected": -1.2225018739700317, + "logps/chosen": -47.140403747558594, + "logps/rejected": -51.500823974609375, + "loss": 0.2703, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.141791582107544, + "rewards/margins": 0.6111000776290894, + "rewards/rejected": 1.5306915044784546, + "step": 9598 + }, + { + "epoch": 1.56, + "learning_rate": 1.2269694827170226e-06, + "logits/chosen": -1.1838698387145996, + "logits/rejected": -1.174878478050232, + "logps/chosen": -96.09614562988281, + "logps/rejected": -308.20819091796875, + "loss": 1.6454, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.853323459625244, + "rewards/margins": -2.6095213890075684, + "rewards/rejected": 6.4628448486328125, + "step": 9599 + }, + { + "epoch": 1.56, + "learning_rate": 1.226107228474715e-06, + "logits/chosen": -0.9762109518051147, + "logits/rejected": -0.9331482648849487, + "logps/chosen": -61.486793518066406, + "logps/rejected": -40.53894805908203, + "loss": 1.4686, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.971631646156311, + "rewards/margins": -0.3232383728027344, + "rewards/rejected": 1.2948700189590454, + "step": 9600 + }, + { + "epoch": 1.56, + "learning_rate": 1.225245234972689e-06, + "logits/chosen": -1.4429254531860352, + "logits/rejected": -1.4510680437088013, + "logps/chosen": -99.86451721191406, + "logps/rejected": -68.59025573730469, + "loss": 0.5683, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.852229595184326, + "rewards/margins": -0.6663126945495605, + "rewards/rejected": 8.518542289733887, + "step": 9601 + }, + { + "epoch": 1.56, + "learning_rate": 1.2243835022705003e-06, + "logits/chosen": -0.8999891877174377, + "logits/rejected": -0.912325382232666, + "logps/chosen": -82.0523681640625, + "logps/rejected": -42.290008544921875, + "loss": 0.5491, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1770340204238892, + "rewards/margins": -0.6692187786102295, + "rewards/rejected": 1.8462527990341187, + "step": 9602 + }, + { + "epoch": 1.56, + "learning_rate": 1.2235220304276846e-06, + "logits/chosen": -1.13262140750885, + "logits/rejected": -1.2122892141342163, + "logps/chosen": -39.042877197265625, + "logps/rejected": -86.3099136352539, + "loss": 0.774, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0835952758789062, + "rewards/margins": -1.298264503479004, + "rewards/rejected": 4.38185977935791, + "step": 9603 + }, + { + "epoch": 1.56, + "learning_rate": 1.2226608195037648e-06, + "logits/chosen": -1.1310604810714722, + "logits/rejected": -1.139963150024414, + "logps/chosen": -74.55406951904297, + "logps/rejected": -89.92423248291016, + "loss": 0.5119, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2623138427734375, + "rewards/margins": -0.3328850269317627, + "rewards/rejected": 1.5951988697052002, + "step": 9604 + }, + { + "epoch": 1.56, + "learning_rate": 1.2217998695582395e-06, + "logits/chosen": -1.3828166723251343, + "logits/rejected": -1.3273696899414062, + "logps/chosen": -66.4523696899414, + "logps/rejected": -89.25900268554688, + "loss": 1.291, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6204116344451904, + "rewards/margins": 0.38578319549560547, + "rewards/rejected": 2.234628438949585, + "step": 9605 + }, + { + "epoch": 1.56, + "learning_rate": 1.2209391806505949e-06, + "logits/chosen": -1.6325596570968628, + "logits/rejected": -1.6021229028701782, + "logps/chosen": -66.92970275878906, + "logps/rejected": -99.48920440673828, + "loss": 0.9021, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.090953826904297, + "rewards/margins": -1.5514178276062012, + "rewards/rejected": 6.642371654510498, + "step": 9606 + }, + { + "epoch": 1.56, + "learning_rate": 1.2200787528402941e-06, + "logits/chosen": -1.1462655067443848, + "logits/rejected": -1.2621831893920898, + "logps/chosen": -30.759458541870117, + "logps/rejected": -58.17829132080078, + "loss": 0.6676, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.665544271469116, + "rewards/margins": -0.9848272800445557, + "rewards/rejected": 3.650371551513672, + "step": 9607 + }, + { + "epoch": 1.56, + "learning_rate": 1.2192185861867866e-06, + "logits/chosen": -1.365236520767212, + "logits/rejected": -1.38712739944458, + "logps/chosen": -66.03689575195312, + "logps/rejected": -62.76121139526367, + "loss": 0.7966, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6497840881347656, + "rewards/margins": -0.4561886787414551, + "rewards/rejected": 2.1059727668762207, + "step": 9608 + }, + { + "epoch": 1.56, + "learning_rate": 1.218358680749499e-06, + "logits/chosen": -1.451208233833313, + "logits/rejected": -1.332916021347046, + "logps/chosen": -49.65858459472656, + "logps/rejected": -30.51224136352539, + "loss": 0.5606, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.612248420715332, + "rewards/margins": -0.4911632537841797, + "rewards/rejected": 5.103411674499512, + "step": 9609 + }, + { + "epoch": 1.56, + "learning_rate": 1.2174990365878448e-06, + "logits/chosen": -0.9764899611473083, + "logits/rejected": -0.9558854699134827, + "logps/chosen": -54.3582649230957, + "logps/rejected": -67.51834106445312, + "loss": 0.3781, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6570706367492676, + "rewards/margins": -0.11629509925842285, + "rewards/rejected": 2.7733657360076904, + "step": 9610 + }, + { + "epoch": 1.56, + "learning_rate": 1.216639653761218e-06, + "logits/chosen": -1.455877423286438, + "logits/rejected": -1.4404414892196655, + "logps/chosen": -64.36582946777344, + "logps/rejected": -73.54428100585938, + "loss": 1.188, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.480211615562439, + "rewards/margins": -1.6054948568344116, + "rewards/rejected": 3.0857064723968506, + "step": 9611 + }, + { + "epoch": 1.56, + "learning_rate": 1.2157805323289912e-06, + "logits/chosen": -1.41383695602417, + "logits/rejected": -1.3713874816894531, + "logps/chosen": -198.2191162109375, + "logps/rejected": -86.62864685058594, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.955264568328857, + "rewards/margins": 4.399806499481201, + "rewards/rejected": 2.5554580688476562, + "step": 9612 + }, + { + "epoch": 1.56, + "learning_rate": 1.2149216723505247e-06, + "logits/chosen": -1.2564777135849, + "logits/rejected": -1.1392416954040527, + "logps/chosen": -31.120193481445312, + "logps/rejected": -9.941852569580078, + "loss": 0.0713, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4671714305877686, + "rewards/margins": 1.8967392444610596, + "rewards/rejected": 0.570432186126709, + "step": 9613 + }, + { + "epoch": 1.56, + "learning_rate": 1.2140630738851544e-06, + "logits/chosen": -1.6291717290878296, + "logits/rejected": -1.6592599153518677, + "logps/chosen": -152.61265563964844, + "logps/rejected": -127.96633911132812, + "loss": 0.6951, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.286734580993652, + "rewards/margins": 1.2299489974975586, + "rewards/rejected": 7.056785583496094, + "step": 9614 + }, + { + "epoch": 1.56, + "learning_rate": 1.213204736992204e-06, + "logits/chosen": -0.9756706953048706, + "logits/rejected": -0.9955260157585144, + "logps/chosen": -54.19160461425781, + "logps/rejected": -40.2735595703125, + "loss": 0.4877, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5142822265625, + "rewards/margins": 0.7021437883377075, + "rewards/rejected": 1.8121384382247925, + "step": 9615 + }, + { + "epoch": 1.56, + "learning_rate": 1.2123466617309742e-06, + "logits/chosen": -1.1284757852554321, + "logits/rejected": -1.1284757852554321, + "logps/chosen": -22.317935943603516, + "logps/rejected": -22.317935943603516, + "loss": 0.6168, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4112491607666016, + "rewards/margins": 0.0, + "rewards/rejected": 3.4112491607666016, + "step": 9616 + }, + { + "epoch": 1.56, + "learning_rate": 1.2114888481607522e-06, + "logits/chosen": -1.6415941715240479, + "logits/rejected": -1.587881088256836, + "logps/chosen": -115.85739135742188, + "logps/rejected": -64.6938247680664, + "loss": 0.4763, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.0084075927734375, + "rewards/margins": 2.325763702392578, + "rewards/rejected": 3.6826438903808594, + "step": 9617 + }, + { + "epoch": 1.56, + "learning_rate": 1.2106312963408024e-06, + "logits/chosen": -1.09684157371521, + "logits/rejected": -1.0629695653915405, + "logps/chosen": -65.75068664550781, + "logps/rejected": -90.31163024902344, + "loss": 0.2664, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8005943298339844, + "rewards/margins": 1.0010932683944702, + "rewards/rejected": 1.7995010614395142, + "step": 9618 + }, + { + "epoch": 1.56, + "learning_rate": 1.2097740063303752e-06, + "logits/chosen": -1.3762142658233643, + "logits/rejected": -1.2336678504943848, + "logps/chosen": -34.671669006347656, + "logps/rejected": -6.11237096786499, + "loss": 0.0495, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2226455211639404, + "rewards/margins": 2.5029964447021484, + "rewards/rejected": 0.719649076461792, + "step": 9619 + }, + { + "epoch": 1.56, + "learning_rate": 1.2089169781887e-06, + "logits/chosen": -1.3767693042755127, + "logits/rejected": -1.3223737478256226, + "logps/chosen": -96.05811309814453, + "logps/rejected": -89.76852416992188, + "loss": 0.2287, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.082430362701416, + "rewards/margins": 1.4898688793182373, + "rewards/rejected": 2.5925614833831787, + "step": 9620 + }, + { + "epoch": 1.56, + "learning_rate": 1.2080602119749918e-06, + "logits/chosen": -1.4753026962280273, + "logits/rejected": -1.471559762954712, + "logps/chosen": -48.193748474121094, + "logps/rejected": -55.94990158081055, + "loss": 0.8268, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.540839433670044, + "rewards/margins": -1.2989940643310547, + "rewards/rejected": 3.8398334980010986, + "step": 9621 + }, + { + "epoch": 1.56, + "learning_rate": 1.2072037077484416e-06, + "logits/chosen": -1.3810865879058838, + "logits/rejected": -1.4205242395401, + "logps/chosen": -95.02214050292969, + "logps/rejected": -97.80540466308594, + "loss": 0.6354, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.005210876464844, + "rewards/margins": -0.06644439697265625, + "rewards/rejected": 6.0716552734375, + "step": 9622 + }, + { + "epoch": 1.56, + "learning_rate": 1.2063474655682284e-06, + "logits/chosen": -1.3801549673080444, + "logits/rejected": -1.378501057624817, + "logps/chosen": -21.495363235473633, + "logps/rejected": -31.753143310546875, + "loss": 0.5842, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9490448236465454, + "rewards/margins": 0.28062933683395386, + "rewards/rejected": 0.6684154868125916, + "step": 9623 + }, + { + "epoch": 1.56, + "learning_rate": 1.2054914854935086e-06, + "logits/chosen": -1.433223843574524, + "logits/rejected": -1.6303764581680298, + "logps/chosen": -155.25160217285156, + "logps/rejected": -174.448486328125, + "loss": 1.1407, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.292144775390625, + "rewards/margins": -2.1566343307495117, + "rewards/rejected": 8.448779106140137, + "step": 9624 + }, + { + "epoch": 1.56, + "learning_rate": 1.2046357675834241e-06, + "logits/chosen": -1.0361069440841675, + "logits/rejected": -0.9458004832267761, + "logps/chosen": -57.55970001220703, + "logps/rejected": -48.86780548095703, + "loss": 0.1989, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.150353193283081, + "rewards/margins": 0.7195258140563965, + "rewards/rejected": 2.4308273792266846, + "step": 9625 + }, + { + "epoch": 1.56, + "learning_rate": 1.2037803118970948e-06, + "logits/chosen": -1.620260238647461, + "logits/rejected": -1.5959923267364502, + "logps/chosen": -66.35639953613281, + "logps/rejected": -56.040714263916016, + "loss": 0.5803, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.981579542160034, + "rewards/margins": -0.7357900142669678, + "rewards/rejected": 3.717369556427002, + "step": 9626 + }, + { + "epoch": 1.56, + "learning_rate": 1.2029251184936275e-06, + "logits/chosen": -1.7204856872558594, + "logits/rejected": -1.6063613891601562, + "logps/chosen": -78.37910461425781, + "logps/rejected": -80.53083801269531, + "loss": 0.0975, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.225935459136963, + "rewards/margins": 2.168696403503418, + "rewards/rejected": 4.057239055633545, + "step": 9627 + }, + { + "epoch": 1.56, + "learning_rate": 1.2020701874321044e-06, + "logits/chosen": -1.4160231351852417, + "logits/rejected": -1.386297583580017, + "logps/chosen": -70.66477966308594, + "logps/rejected": -68.89728546142578, + "loss": 0.4957, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.028111457824707, + "rewards/margins": 1.506441593170166, + "rewards/rejected": 3.521669864654541, + "step": 9628 + }, + { + "epoch": 1.56, + "learning_rate": 1.2012155187715968e-06, + "logits/chosen": -1.1620038747787476, + "logits/rejected": -1.156336784362793, + "logps/chosen": -108.93055725097656, + "logps/rejected": -74.72250366210938, + "loss": 0.2263, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.810705661773682, + "rewards/margins": 0.9291367530822754, + "rewards/rejected": 4.881568908691406, + "step": 9629 + }, + { + "epoch": 1.56, + "learning_rate": 1.2003611125711507e-06, + "logits/chosen": -1.5013492107391357, + "logits/rejected": -1.4265207052230835, + "logps/chosen": -64.10887908935547, + "logps/rejected": -205.04302978515625, + "loss": 1.8324, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1455771923065186, + "rewards/margins": -3.591538190841675, + "rewards/rejected": 6.737115383148193, + "step": 9630 + }, + { + "epoch": 1.56, + "learning_rate": 1.1995069688898003e-06, + "logits/chosen": -1.3720637559890747, + "logits/rejected": -1.3777201175689697, + "logps/chosen": -71.2132797241211, + "logps/rejected": -66.9380874633789, + "loss": 1.1337, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4479637145996094, + "rewards/margins": -0.7979347705841064, + "rewards/rejected": 3.245898485183716, + "step": 9631 + }, + { + "epoch": 1.56, + "learning_rate": 1.1986530877865572e-06, + "logits/chosen": -1.425749659538269, + "logits/rejected": -1.5087575912475586, + "logps/chosen": -125.97908782958984, + "logps/rejected": -177.86328125, + "loss": 0.4657, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.635688304901123, + "rewards/margins": 1.7499065399169922, + "rewards/rejected": 4.885781764984131, + "step": 9632 + }, + { + "epoch": 1.56, + "learning_rate": 1.1977994693204176e-06, + "logits/chosen": -1.1079946756362915, + "logits/rejected": -1.042001724243164, + "logps/chosen": -34.91511154174805, + "logps/rejected": -47.649810791015625, + "loss": 0.2538, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7596920728683472, + "rewards/margins": 0.7475666999816895, + "rewards/rejected": 1.0121253728866577, + "step": 9633 + }, + { + "epoch": 1.56, + "learning_rate": 1.1969461135503573e-06, + "logits/chosen": -1.4278143644332886, + "logits/rejected": -1.3819526433944702, + "logps/chosen": -175.49777221679688, + "logps/rejected": -128.68612670898438, + "loss": 0.501, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.4545440673828125, + "rewards/margins": 1.4790067672729492, + "rewards/rejected": 4.975537300109863, + "step": 9634 + }, + { + "epoch": 1.56, + "learning_rate": 1.1960930205353366e-06, + "logits/chosen": -1.0668522119522095, + "logits/rejected": -1.1139265298843384, + "logps/chosen": -66.89173889160156, + "logps/rejected": -55.20814895629883, + "loss": 0.1967, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.350261688232422, + "rewards/margins": 0.7391955852508545, + "rewards/rejected": 1.6110661029815674, + "step": 9635 + }, + { + "epoch": 1.56, + "learning_rate": 1.1952401903342942e-06, + "logits/chosen": -1.0607417821884155, + "logits/rejected": -1.0607417821884155, + "logps/chosen": -76.94180297851562, + "logps/rejected": -76.94180297851562, + "loss": 0.7129, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5365097522735596, + "rewards/margins": 0.0, + "rewards/rejected": 2.5365097522735596, + "step": 9636 + }, + { + "epoch": 1.56, + "learning_rate": 1.1943876230061546e-06, + "logits/chosen": -0.6477435231208801, + "logits/rejected": -0.6776041388511658, + "logps/chosen": -11.868709564208984, + "logps/rejected": -45.56745910644531, + "loss": 0.6702, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.14256344735622406, + "rewards/margins": -1.0363378524780273, + "rewards/rejected": 1.1789013147354126, + "step": 9637 + }, + { + "epoch": 1.56, + "learning_rate": 1.1935353186098204e-06, + "logits/chosen": -1.556484580039978, + "logits/rejected": -1.4903484582901, + "logps/chosen": -81.0052261352539, + "logps/rejected": -70.11700439453125, + "loss": 0.27, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.5986857414245605, + "rewards/margins": 2.0105738639831543, + "rewards/rejected": 3.5881118774414062, + "step": 9638 + }, + { + "epoch": 1.56, + "learning_rate": 1.1926832772041797e-06, + "logits/chosen": -1.546520471572876, + "logits/rejected": -1.4997442960739136, + "logps/chosen": -77.50733947753906, + "logps/rejected": -41.50475311279297, + "loss": 1.4375, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9950889348983765, + "rewards/margins": 1.2188514471054077, + "rewards/rejected": 0.7762374877929688, + "step": 9639 + }, + { + "epoch": 1.56, + "learning_rate": 1.1918314988480978e-06, + "logits/chosen": -1.4200055599212646, + "logits/rejected": -1.443466067314148, + "logps/chosen": -70.34982299804688, + "logps/rejected": -61.175296783447266, + "loss": 0.3404, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.586402177810669, + "rewards/margins": 0.17075014114379883, + "rewards/rejected": 2.41565203666687, + "step": 9640 + }, + { + "epoch": 1.56, + "learning_rate": 1.1909799836004276e-06, + "logits/chosen": -1.320569634437561, + "logits/rejected": -1.2877706289291382, + "logps/chosen": -135.2127227783203, + "logps/rejected": -90.82003784179688, + "loss": 0.2287, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.997552394866943, + "rewards/margins": 3.3403494358062744, + "rewards/rejected": 2.657202959060669, + "step": 9641 + }, + { + "epoch": 1.57, + "learning_rate": 1.1901287315199977e-06, + "logits/chosen": -1.4465057849884033, + "logits/rejected": -1.4561538696289062, + "logps/chosen": -98.19854736328125, + "logps/rejected": -59.84969711303711, + "loss": 0.4417, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3454208374023438, + "rewards/margins": 0.1737903356552124, + "rewards/rejected": 1.1716305017471313, + "step": 9642 + }, + { + "epoch": 1.57, + "learning_rate": 1.1892777426656249e-06, + "logits/chosen": -1.2963579893112183, + "logits/rejected": -1.2963579893112183, + "logps/chosen": -67.77042388916016, + "logps/rejected": -67.77042388916016, + "loss": 0.4991, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6247856616973877, + "rewards/margins": 0.0, + "rewards/rejected": 3.6247856616973877, + "step": 9643 + }, + { + "epoch": 1.57, + "learning_rate": 1.188427017096101e-06, + "logits/chosen": -1.258123755455017, + "logits/rejected": -1.2144184112548828, + "logps/chosen": -113.64274597167969, + "logps/rejected": -56.01777267456055, + "loss": 0.208, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.615139961242676, + "rewards/margins": 2.033808708190918, + "rewards/rejected": 2.581331253051758, + "step": 9644 + }, + { + "epoch": 1.57, + "learning_rate": 1.1875765548702052e-06, + "logits/chosen": -0.8758833408355713, + "logits/rejected": -0.9002610445022583, + "logps/chosen": -4.1667561531066895, + "logps/rejected": -81.26994323730469, + "loss": 0.2501, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4466109871864319, + "rewards/margins": 0.449428528547287, + "rewards/rejected": -0.002817535540089011, + "step": 9645 + }, + { + "epoch": 1.57, + "learning_rate": 1.1867263560466968e-06, + "logits/chosen": -1.4294339418411255, + "logits/rejected": -1.3430253267288208, + "logps/chosen": -54.94820785522461, + "logps/rejected": -93.94229125976562, + "loss": 1.1013, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.343712329864502, + "rewards/margins": -1.4103648662567139, + "rewards/rejected": 3.754077196121216, + "step": 9646 + }, + { + "epoch": 1.57, + "learning_rate": 1.185876420684315e-06, + "logits/chosen": -1.3032728433609009, + "logits/rejected": -1.3032728433609009, + "logps/chosen": -51.40160369873047, + "logps/rejected": -51.40160369873047, + "loss": 0.6145, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4778029918670654, + "rewards/margins": 0.0, + "rewards/rejected": 2.4778029918670654, + "step": 9647 + }, + { + "epoch": 1.57, + "learning_rate": 1.1850267488417838e-06, + "logits/chosen": -1.03575599193573, + "logits/rejected": -1.098527193069458, + "logps/chosen": -71.26795959472656, + "logps/rejected": -68.92241668701172, + "loss": 0.4108, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.261082410812378, + "rewards/margins": 0.3127845525741577, + "rewards/rejected": 1.9482978582382202, + "step": 9648 + }, + { + "epoch": 1.57, + "learning_rate": 1.184177340577805e-06, + "logits/chosen": -0.9477642774581909, + "logits/rejected": -0.917464554309845, + "logps/chosen": -34.40080642700195, + "logps/rejected": -6.536084175109863, + "loss": 2.8536, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7073391079902649, + "rewards/margins": 0.07491618394851685, + "rewards/rejected": 0.632422924041748, + "step": 9649 + }, + { + "epoch": 1.57, + "learning_rate": 1.1833281959510684e-06, + "logits/chosen": -1.3169975280761719, + "logits/rejected": -1.2664064168930054, + "logps/chosen": -48.295013427734375, + "logps/rejected": -103.24221801757812, + "loss": 1.25, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.450585126876831, + "rewards/margins": -2.2393763065338135, + "rewards/rejected": 4.6899614334106445, + "step": 9650 + }, + { + "epoch": 1.57, + "learning_rate": 1.1824793150202379e-06, + "logits/chosen": -0.8166632056236267, + "logits/rejected": -0.8186457753181458, + "logps/chosen": -47.89643096923828, + "logps/rejected": -52.26569366455078, + "loss": 0.4807, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0197372436523438, + "rewards/margins": 0.7983261346817017, + "rewards/rejected": 1.221411108970642, + "step": 9651 + }, + { + "epoch": 1.57, + "learning_rate": 1.1816306978439668e-06, + "logits/chosen": -1.382089614868164, + "logits/rejected": -1.3201627731323242, + "logps/chosen": -120.97557067871094, + "logps/rejected": -72.82288360595703, + "loss": 0.5534, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.957366943359375, + "rewards/margins": 1.147179365158081, + "rewards/rejected": 1.810187578201294, + "step": 9652 + }, + { + "epoch": 1.57, + "learning_rate": 1.180782344480883e-06, + "logits/chosen": -1.393898844718933, + "logits/rejected": -1.4152072668075562, + "logps/chosen": -86.3463363647461, + "logps/rejected": -105.73829650878906, + "loss": 0.6133, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1705689430236816, + "rewards/margins": 0.7986626625061035, + "rewards/rejected": 1.3719062805175781, + "step": 9653 + }, + { + "epoch": 1.57, + "learning_rate": 1.1799342549896027e-06, + "logits/chosen": -1.4094399213790894, + "logits/rejected": -1.4112179279327393, + "logps/chosen": -84.72676086425781, + "logps/rejected": -60.363128662109375, + "loss": 0.5824, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0464844703674316, + "rewards/margins": 0.5481705665588379, + "rewards/rejected": 1.4983139038085938, + "step": 9654 + }, + { + "epoch": 1.57, + "learning_rate": 1.1790864294287186e-06, + "logits/chosen": -1.4270009994506836, + "logits/rejected": -1.4270009994506836, + "logps/chosen": -47.61670684814453, + "logps/rejected": -47.61670684814453, + "loss": 0.3884, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.8825554847717285, + "rewards/margins": 0.0, + "rewards/rejected": 3.8825554847717285, + "step": 9655 + }, + { + "epoch": 1.57, + "learning_rate": 1.1782388678568097e-06, + "logits/chosen": -1.1866371631622314, + "logits/rejected": -1.1887191534042358, + "logps/chosen": -82.51637268066406, + "logps/rejected": -77.6121826171875, + "loss": 0.1562, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.686142683029175, + "rewards/margins": 3.4039320945739746, + "rewards/rejected": 0.2822105586528778, + "step": 9656 + }, + { + "epoch": 1.57, + "learning_rate": 1.1773915703324317e-06, + "logits/chosen": -1.4744688272476196, + "logits/rejected": -1.4498523473739624, + "logps/chosen": -66.51362609863281, + "logps/rejected": -78.6787109375, + "loss": 1.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.569187879562378, + "rewards/margins": 1.1481170654296875, + "rewards/rejected": 2.4210708141326904, + "step": 9657 + }, + { + "epoch": 1.57, + "learning_rate": 1.1765445369141276e-06, + "logits/chosen": -1.060562252998352, + "logits/rejected": -1.0666648149490356, + "logps/chosen": -8.96683120727539, + "logps/rejected": -1.8514095544815063, + "loss": 0.5705, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2568739056587219, + "rewards/margins": -0.11835387349128723, + "rewards/rejected": 0.37522777915000916, + "step": 9658 + }, + { + "epoch": 1.57, + "learning_rate": 1.175697767660417e-06, + "logits/chosen": -1.6764562129974365, + "logits/rejected": -1.6736129522323608, + "logps/chosen": -146.19143676757812, + "logps/rejected": -88.13154602050781, + "loss": 0.078, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.9408860206604, + "rewards/margins": 2.724623203277588, + "rewards/rejected": 3.2162628173828125, + "step": 9659 + }, + { + "epoch": 1.57, + "learning_rate": 1.1748512626298058e-06, + "logits/chosen": -1.5667320489883423, + "logits/rejected": -1.436707854270935, + "logps/chosen": -131.4342498779297, + "logps/rejected": -81.31559753417969, + "loss": 0.0217, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.463624477386475, + "rewards/margins": 4.019506454467773, + "rewards/rejected": 1.444117784500122, + "step": 9660 + }, + { + "epoch": 1.57, + "learning_rate": 1.1740050218807775e-06, + "logits/chosen": -1.3591862916946411, + "logits/rejected": -1.3146823644638062, + "logps/chosen": -66.60400390625, + "logps/rejected": -77.19170379638672, + "loss": 2.7221, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9353607296943665, + "rewards/margins": 0.23786240816116333, + "rewards/rejected": 0.6974983215332031, + "step": 9661 + }, + { + "epoch": 1.57, + "learning_rate": 1.1731590454718012e-06, + "logits/chosen": -1.372219443321228, + "logits/rejected": -1.468801498413086, + "logps/chosen": -90.29286193847656, + "logps/rejected": -98.72818756103516, + "loss": 2.186, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.169766426086426, + "rewards/margins": -3.3110408782958984, + "rewards/rejected": 8.480807304382324, + "step": 9662 + }, + { + "epoch": 1.57, + "learning_rate": 1.172313333461324e-06, + "logits/chosen": -1.6974592208862305, + "logits/rejected": -1.5748584270477295, + "logps/chosen": -107.97129821777344, + "logps/rejected": -77.94529724121094, + "loss": 0.3693, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.524151802062988, + "rewards/margins": 1.5729050636291504, + "rewards/rejected": 4.951246738433838, + "step": 9663 + }, + { + "epoch": 1.57, + "learning_rate": 1.1714678859077788e-06, + "logits/chosen": -1.1176106929779053, + "logits/rejected": -1.0531671047210693, + "logps/chosen": -40.29158401489258, + "logps/rejected": -28.07386016845703, + "loss": 0.2038, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8498291373252869, + "rewards/margins": 0.7198317050933838, + "rewards/rejected": 0.12999744713306427, + "step": 9664 + }, + { + "epoch": 1.57, + "learning_rate": 1.1706227028695755e-06, + "logits/chosen": -1.1205593347549438, + "logits/rejected": -1.0924099683761597, + "logps/chosen": -35.09228515625, + "logps/rejected": -52.911739349365234, + "loss": 0.3252, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.654468059539795, + "rewards/margins": 0.7924361228942871, + "rewards/rejected": 3.862031936645508, + "step": 9665 + }, + { + "epoch": 1.57, + "learning_rate": 1.1697777844051105e-06, + "logits/chosen": -1.3922170400619507, + "logits/rejected": -1.287475824356079, + "logps/chosen": -104.71440124511719, + "logps/rejected": -42.096256256103516, + "loss": 0.129, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.004124641418457, + "rewards/margins": 1.6571753025054932, + "rewards/rejected": 2.346949338912964, + "step": 9666 + }, + { + "epoch": 1.57, + "learning_rate": 1.1689331305727574e-06, + "logits/chosen": -1.0105129480361938, + "logits/rejected": -1.0105129480361938, + "logps/chosen": -1.475705862045288, + "logps/rejected": -1.475705862045288, + "loss": 0.7159, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.24483107030391693, + "rewards/margins": 0.0, + "rewards/rejected": 0.24483107030391693, + "step": 9667 + }, + { + "epoch": 1.57, + "learning_rate": 1.1680887414308768e-06, + "logits/chosen": -1.4052914381027222, + "logits/rejected": -1.3136789798736572, + "logps/chosen": -59.328453063964844, + "logps/rejected": -48.25660705566406, + "loss": 0.5716, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.97773814201355, + "rewards/margins": 0.04771566390991211, + "rewards/rejected": 2.9300224781036377, + "step": 9668 + }, + { + "epoch": 1.57, + "learning_rate": 1.167244617037805e-06, + "logits/chosen": -0.9977054595947266, + "logits/rejected": -0.9977054595947266, + "logps/chosen": -0.6679317355155945, + "logps/rejected": -0.6679317355155945, + "loss": 0.4711, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2823023498058319, + "rewards/margins": 0.0, + "rewards/rejected": 0.2823023498058319, + "step": 9669 + }, + { + "epoch": 1.57, + "learning_rate": 1.1664007574518655e-06, + "logits/chosen": -1.694685935974121, + "logits/rejected": -1.5808624029159546, + "logps/chosen": -136.46920776367188, + "logps/rejected": -70.94286346435547, + "loss": 0.0509, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.701590061187744, + "rewards/margins": 2.7120537757873535, + "rewards/rejected": 1.9895362854003906, + "step": 9670 + }, + { + "epoch": 1.57, + "learning_rate": 1.1655571627313583e-06, + "logits/chosen": -1.2958816289901733, + "logits/rejected": -1.3566508293151855, + "logps/chosen": -81.25961303710938, + "logps/rejected": -159.5840301513672, + "loss": 1.1365, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.312969923019409, + "rewards/margins": -1.0784683227539062, + "rewards/rejected": 3.3914382457733154, + "step": 9671 + }, + { + "epoch": 1.57, + "learning_rate": 1.1647138329345709e-06, + "logits/chosen": -1.2375812530517578, + "logits/rejected": -1.2278343439102173, + "logps/chosen": -240.33465576171875, + "logps/rejected": -92.54856872558594, + "loss": 0.0514, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.19956111907959, + "rewards/margins": 3.822925090789795, + "rewards/rejected": 5.376636028289795, + "step": 9672 + }, + { + "epoch": 1.57, + "learning_rate": 1.1638707681197658e-06, + "logits/chosen": -1.0100340843200684, + "logits/rejected": -1.0100340843200684, + "logps/chosen": -36.92097091674805, + "logps/rejected": -36.92097091674805, + "loss": 0.4358, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8774913549423218, + "rewards/margins": 0.0, + "rewards/rejected": 1.8774913549423218, + "step": 9673 + }, + { + "epoch": 1.57, + "learning_rate": 1.163027968345195e-06, + "logits/chosen": -1.4246888160705566, + "logits/rejected": -1.4246888160705566, + "logps/chosen": -78.78336334228516, + "logps/rejected": -78.78336334228516, + "loss": 0.4582, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.122464179992676, + "rewards/margins": 0.0, + "rewards/rejected": 4.122464179992676, + "step": 9674 + }, + { + "epoch": 1.57, + "learning_rate": 1.162185433669084e-06, + "logits/chosen": -1.3778266906738281, + "logits/rejected": -1.32127046585083, + "logps/chosen": -66.20603942871094, + "logps/rejected": -42.02373504638672, + "loss": 0.4528, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8321045637130737, + "rewards/margins": -0.38557660579681396, + "rewards/rejected": 2.2176811695098877, + "step": 9675 + }, + { + "epoch": 1.57, + "learning_rate": 1.1613431641496475e-06, + "logits/chosen": -1.3969993591308594, + "logits/rejected": -1.3682252168655396, + "logps/chosen": -61.65602111816406, + "logps/rejected": -74.0092544555664, + "loss": 1.0237, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.002700090408325, + "rewards/margins": -1.054905652999878, + "rewards/rejected": 4.057605743408203, + "step": 9676 + }, + { + "epoch": 1.57, + "learning_rate": 1.160501159845075e-06, + "logits/chosen": -1.3901762962341309, + "logits/rejected": -1.389721155166626, + "logps/chosen": -81.10884857177734, + "logps/rejected": -65.95111846923828, + "loss": 0.2503, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.413828372955322, + "rewards/margins": 0.4768540859222412, + "rewards/rejected": 3.936974287033081, + "step": 9677 + }, + { + "epoch": 1.57, + "learning_rate": 1.1596594208135443e-06, + "logits/chosen": -1.3059463500976562, + "logits/rejected": -1.2657318115234375, + "logps/chosen": -114.45854949951172, + "logps/rejected": -102.80787658691406, + "loss": 0.9251, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.9555487632751465, + "rewards/margins": -0.899416446685791, + "rewards/rejected": 5.8549652099609375, + "step": 9678 + }, + { + "epoch": 1.57, + "learning_rate": 1.1588179471132083e-06, + "logits/chosen": -1.3190141916275024, + "logits/rejected": -1.3723442554473877, + "logps/chosen": -44.213619232177734, + "logps/rejected": -71.97087097167969, + "loss": 0.4278, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4466991424560547, + "rewards/margins": -0.2882411479949951, + "rewards/rejected": 2.73494029045105, + "step": 9679 + }, + { + "epoch": 1.57, + "learning_rate": 1.1579767388022068e-06, + "logits/chosen": -1.2621270418167114, + "logits/rejected": -1.1269670724868774, + "logps/chosen": -106.76497650146484, + "logps/rejected": -21.461668014526367, + "loss": 0.7462, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.652019500732422, + "rewards/margins": 2.2025585174560547, + "rewards/rejected": 0.4494609832763672, + "step": 9680 + }, + { + "epoch": 1.57, + "learning_rate": 1.1571357959386604e-06, + "logits/chosen": -1.4050790071487427, + "logits/rejected": -1.387594223022461, + "logps/chosen": -46.15394973754883, + "logps/rejected": -52.713356018066406, + "loss": 0.5495, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6680920124053955, + "rewards/margins": -0.3684208393096924, + "rewards/rejected": 3.036512851715088, + "step": 9681 + }, + { + "epoch": 1.57, + "learning_rate": 1.1562951185806675e-06, + "logits/chosen": -1.0125325918197632, + "logits/rejected": -0.8031079769134521, + "logps/chosen": -64.45146179199219, + "logps/rejected": -17.967010498046875, + "loss": 0.7643, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.418505907058716, + "rewards/margins": 1.7364394664764404, + "rewards/rejected": 0.6820663809776306, + "step": 9682 + }, + { + "epoch": 1.57, + "learning_rate": 1.1554547067863136e-06, + "logits/chosen": -1.1309521198272705, + "logits/rejected": -1.1032910346984863, + "logps/chosen": -76.23626708984375, + "logps/rejected": -82.89820861816406, + "loss": 0.7444, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1188125610351562, + "rewards/margins": -0.8530158996582031, + "rewards/rejected": 1.9718284606933594, + "step": 9683 + }, + { + "epoch": 1.57, + "learning_rate": 1.154614560613661e-06, + "logits/chosen": -1.1910423040390015, + "logits/rejected": -1.2318798303604126, + "logps/chosen": -142.12030029296875, + "logps/rejected": -103.7633056640625, + "loss": 0.0283, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.095086574554443, + "rewards/margins": 2.859194040298462, + "rewards/rejected": 1.2358925342559814, + "step": 9684 + }, + { + "epoch": 1.57, + "learning_rate": 1.1537746801207582e-06, + "logits/chosen": -1.063126564025879, + "logits/rejected": -1.0782159566879272, + "logps/chosen": -55.35567092895508, + "logps/rejected": -69.50978088378906, + "loss": 0.3929, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.503925800323486, + "rewards/margins": 1.3588955402374268, + "rewards/rejected": 3.1450302600860596, + "step": 9685 + }, + { + "epoch": 1.57, + "learning_rate": 1.1529350653656307e-06, + "logits/chosen": -1.2916138172149658, + "logits/rejected": -1.2918223142623901, + "logps/chosen": -86.59624481201172, + "logps/rejected": -53.68074035644531, + "loss": 0.4995, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.026287078857422, + "rewards/margins": -0.0844581127166748, + "rewards/rejected": 2.1107451915740967, + "step": 9686 + }, + { + "epoch": 1.57, + "learning_rate": 1.1520957164062896e-06, + "logits/chosen": -1.141946792602539, + "logits/rejected": -1.0774070024490356, + "logps/chosen": -26.20000457763672, + "logps/rejected": -8.49699592590332, + "loss": 0.3492, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.597095251083374, + "rewards/margins": 2.6745622158050537, + "rewards/rejected": 0.9225330352783203, + "step": 9687 + }, + { + "epoch": 1.57, + "learning_rate": 1.1512566333007247e-06, + "logits/chosen": -1.3791084289550781, + "logits/rejected": -1.4396250247955322, + "logps/chosen": -104.57632446289062, + "logps/rejected": -168.12831115722656, + "loss": 2.3165, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8384811878204346, + "rewards/margins": -4.068638801574707, + "rewards/rejected": 5.9071197509765625, + "step": 9688 + }, + { + "epoch": 1.57, + "learning_rate": 1.1504178161069102e-06, + "logits/chosen": -1.0542528629302979, + "logits/rejected": -0.7935264706611633, + "logps/chosen": -85.31327819824219, + "logps/rejected": -21.652299880981445, + "loss": 0.6596, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.69045877456665, + "rewards/margins": 4.385585308074951, + "rewards/rejected": 0.30487367510795593, + "step": 9689 + }, + { + "epoch": 1.57, + "learning_rate": 1.149579264882798e-06, + "logits/chosen": -1.4019577503204346, + "logits/rejected": -1.4591988325119019, + "logps/chosen": -56.52064514160156, + "logps/rejected": -82.85193634033203, + "loss": 0.7606, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3963654041290283, + "rewards/margins": -1.0831072330474854, + "rewards/rejected": 4.479472637176514, + "step": 9690 + }, + { + "epoch": 1.57, + "learning_rate": 1.1487409796863269e-06, + "logits/chosen": -1.3357512950897217, + "logits/rejected": -1.2461529970169067, + "logps/chosen": -45.55126953125, + "logps/rejected": -40.01974868774414, + "loss": 0.1114, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.64049232006073, + "rewards/margins": 1.3928357362747192, + "rewards/rejected": 0.24765662848949432, + "step": 9691 + }, + { + "epoch": 1.57, + "learning_rate": 1.1479029605754121e-06, + "logits/chosen": -1.030912160873413, + "logits/rejected": -0.957260251045227, + "logps/chosen": -35.480804443359375, + "logps/rejected": -35.42762756347656, + "loss": 0.6068, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2406280040740967, + "rewards/margins": 1.3663123846054077, + "rewards/rejected": 1.874315619468689, + "step": 9692 + }, + { + "epoch": 1.57, + "learning_rate": 1.1470652076079552e-06, + "logits/chosen": -1.3578914403915405, + "logits/rejected": -1.238207221031189, + "logps/chosen": -80.16543579101562, + "logps/rejected": -71.86994934082031, + "loss": 0.2399, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1214606761932373, + "rewards/margins": 1.2861281633377075, + "rewards/rejected": 1.8353325128555298, + "step": 9693 + }, + { + "epoch": 1.57, + "learning_rate": 1.1462277208418338e-06, + "logits/chosen": -1.4681460857391357, + "logits/rejected": -1.4058436155319214, + "logps/chosen": -49.456512451171875, + "logps/rejected": -65.3741226196289, + "loss": 0.9757, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2283363342285156, + "rewards/margins": -0.4671647548675537, + "rewards/rejected": 2.6955010890960693, + "step": 9694 + }, + { + "epoch": 1.57, + "learning_rate": 1.1453905003349142e-06, + "logits/chosen": -1.4876004457473755, + "logits/rejected": -1.486145257949829, + "logps/chosen": -137.60922241210938, + "logps/rejected": -88.13323211669922, + "loss": 1.242, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.451560020446777, + "rewards/margins": 2.433523178100586, + "rewards/rejected": 6.018036842346191, + "step": 9695 + }, + { + "epoch": 1.57, + "learning_rate": 1.1445535461450362e-06, + "logits/chosen": -1.6206392049789429, + "logits/rejected": -1.599360466003418, + "logps/chosen": -51.32060241699219, + "logps/rejected": -41.965335845947266, + "loss": 0.5193, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.497693657875061, + "rewards/margins": -0.42713892459869385, + "rewards/rejected": 1.9248325824737549, + "step": 9696 + }, + { + "epoch": 1.57, + "learning_rate": 1.1437168583300302e-06, + "logits/chosen": -1.6017448902130127, + "logits/rejected": -1.6395782232284546, + "logps/chosen": -196.49961853027344, + "logps/rejected": -89.31873321533203, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.518937587738037, + "rewards/margins": 4.495741844177246, + "rewards/rejected": 2.023195743560791, + "step": 9697 + }, + { + "epoch": 1.57, + "learning_rate": 1.1428804369476982e-06, + "logits/chosen": -1.3480898141860962, + "logits/rejected": -1.3317941427230835, + "logps/chosen": -54.396644592285156, + "logps/rejected": -39.893028259277344, + "loss": 0.2813, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1676299571990967, + "rewards/margins": 0.964401125907898, + "rewards/rejected": 1.2032288312911987, + "step": 9698 + }, + { + "epoch": 1.57, + "learning_rate": 1.1420442820558336e-06, + "logits/chosen": -1.073236346244812, + "logits/rejected": -1.0845080614089966, + "logps/chosen": -3.7229695320129395, + "logps/rejected": -4.645397186279297, + "loss": 0.7768, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.287008672952652, + "rewards/margins": -0.3992482125759125, + "rewards/rejected": 0.6862568855285645, + "step": 9699 + }, + { + "epoch": 1.57, + "learning_rate": 1.1412083937122031e-06, + "logits/chosen": -1.1372830867767334, + "logits/rejected": -1.1580874919891357, + "logps/chosen": -112.42842864990234, + "logps/rejected": -58.46253204345703, + "loss": 0.2665, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9250191450119019, + "rewards/margins": 0.40312957763671875, + "rewards/rejected": 1.521889567375183, + "step": 9700 + }, + { + "epoch": 1.57, + "learning_rate": 1.1403727719745622e-06, + "logits/chosen": -1.3418903350830078, + "logits/rejected": -1.3114354610443115, + "logps/chosen": -85.39838409423828, + "logps/rejected": -93.14427185058594, + "loss": 0.2152, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5288002490997314, + "rewards/margins": 0.944817304611206, + "rewards/rejected": 2.5839829444885254, + "step": 9701 + }, + { + "epoch": 1.57, + "learning_rate": 1.1395374169006407e-06, + "logits/chosen": -1.3892176151275635, + "logits/rejected": -1.1808327436447144, + "logps/chosen": -118.91088104248047, + "logps/rejected": -67.29949951171875, + "loss": 0.1847, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.213620662689209, + "rewards/margins": 0.9490299224853516, + "rewards/rejected": 5.264590740203857, + "step": 9702 + }, + { + "epoch": 1.57, + "learning_rate": 1.1387023285481573e-06, + "logits/chosen": -1.3916170597076416, + "logits/rejected": -1.2635173797607422, + "logps/chosen": -96.39314270019531, + "logps/rejected": -74.91778564453125, + "loss": 0.5047, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.721087694168091, + "rewards/margins": 0.9892860651016235, + "rewards/rejected": 1.7318016290664673, + "step": 9703 + }, + { + "epoch": 1.58, + "learning_rate": 1.1378675069748058e-06, + "logits/chosen": -1.386637806892395, + "logits/rejected": -1.3264167308807373, + "logps/chosen": -84.14776611328125, + "logps/rejected": -33.806976318359375, + "loss": 0.3499, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8194565773010254, + "rewards/margins": 0.1620471477508545, + "rewards/rejected": 2.657409429550171, + "step": 9704 + }, + { + "epoch": 1.58, + "learning_rate": 1.1370329522382672e-06, + "logits/chosen": -1.2103774547576904, + "logits/rejected": -1.0094079971313477, + "logps/chosen": -38.53068161010742, + "logps/rejected": -20.395248413085938, + "loss": 0.129, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.604922771453857, + "rewards/margins": 4.1054887771606445, + "rewards/rejected": 0.49943408370018005, + "step": 9705 + }, + { + "epoch": 1.58, + "learning_rate": 1.136198664396198e-06, + "logits/chosen": -1.3307631015777588, + "logits/rejected": -1.1812976598739624, + "logps/chosen": -77.50827026367188, + "logps/rejected": -68.89469909667969, + "loss": 0.2829, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5093629360198975, + "rewards/margins": 0.8562150001525879, + "rewards/rejected": 2.6531479358673096, + "step": 9706 + }, + { + "epoch": 1.58, + "learning_rate": 1.1353646435062431e-06, + "logits/chosen": -1.3908721208572388, + "logits/rejected": -1.253732681274414, + "logps/chosen": -97.48416137695312, + "logps/rejected": -23.730958938598633, + "loss": 0.505, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.4154510498046875, + "rewards/margins": 4.758253574371338, + "rewards/rejected": 1.65719735622406, + "step": 9707 + }, + { + "epoch": 1.58, + "learning_rate": 1.1345308896260237e-06, + "logits/chosen": -1.1029108762741089, + "logits/rejected": -1.1375991106033325, + "logps/chosen": -53.86431884765625, + "logps/rejected": -75.67920684814453, + "loss": 0.1618, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4276421070098877, + "rewards/margins": 1.7173919677734375, + "rewards/rejected": 0.7102500796318054, + "step": 9708 + }, + { + "epoch": 1.58, + "learning_rate": 1.1336974028131437e-06, + "logits/chosen": -1.1390639543533325, + "logits/rejected": -1.2684143781661987, + "logps/chosen": -39.414695739746094, + "logps/rejected": -131.56045532226562, + "loss": 1.474, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2474801540374756, + "rewards/margins": -2.566915273666382, + "rewards/rejected": 5.814395427703857, + "step": 9709 + }, + { + "epoch": 1.58, + "learning_rate": 1.1328641831251908e-06, + "logits/chosen": -1.2128419876098633, + "logits/rejected": -1.1455371379852295, + "logps/chosen": -109.29885864257812, + "logps/rejected": -56.09169006347656, + "loss": 0.8252, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9845703840255737, + "rewards/margins": 0.8850860595703125, + "rewards/rejected": 1.0994843244552612, + "step": 9710 + }, + { + "epoch": 1.58, + "learning_rate": 1.13203123061973e-06, + "logits/chosen": -1.4209071397781372, + "logits/rejected": -1.1500561237335205, + "logps/chosen": -146.18386840820312, + "logps/rejected": -80.88117980957031, + "loss": 0.0316, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.2580766677856445, + "rewards/margins": 3.172261953353882, + "rewards/rejected": 3.0858147144317627, + "step": 9711 + }, + { + "epoch": 1.58, + "learning_rate": 1.1311985453543134e-06, + "logits/chosen": -1.5035319328308105, + "logits/rejected": -1.2356181144714355, + "logps/chosen": -84.27803802490234, + "logps/rejected": -61.416465759277344, + "loss": 0.2751, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9201157093048096, + "rewards/margins": 0.5028693675994873, + "rewards/rejected": 2.4172463417053223, + "step": 9712 + }, + { + "epoch": 1.58, + "learning_rate": 1.1303661273864696e-06, + "logits/chosen": -1.2201712131500244, + "logits/rejected": -1.1295911073684692, + "logps/chosen": -83.8709716796875, + "logps/rejected": -6.052172660827637, + "loss": 2.9861, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6027694940567017, + "rewards/margins": 0.3426738977432251, + "rewards/rejected": 0.26009559631347656, + "step": 9713 + }, + { + "epoch": 1.58, + "learning_rate": 1.1295339767737125e-06, + "logits/chosen": -1.226013422012329, + "logits/rejected": -1.2153064012527466, + "logps/chosen": -70.87310028076172, + "logps/rejected": -121.65670776367188, + "loss": 0.4152, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6817092895507812, + "rewards/margins": -0.14798438549041748, + "rewards/rejected": 1.8296936750411987, + "step": 9714 + }, + { + "epoch": 1.58, + "learning_rate": 1.1287020935735338e-06, + "logits/chosen": -1.2945876121520996, + "logits/rejected": -1.1788204908370972, + "logps/chosen": -141.22621154785156, + "logps/rejected": -32.68693542480469, + "loss": 0.6524, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.930345296859741, + "rewards/margins": 0.9851059913635254, + "rewards/rejected": 1.9452393054962158, + "step": 9715 + }, + { + "epoch": 1.58, + "learning_rate": 1.1278704778434112e-06, + "logits/chosen": -1.3198012113571167, + "logits/rejected": -1.308059811592102, + "logps/chosen": -84.32406616210938, + "logps/rejected": -76.55570220947266, + "loss": 0.374, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.507820129394531, + "rewards/margins": 1.2018423080444336, + "rewards/rejected": 5.305977821350098, + "step": 9716 + }, + { + "epoch": 1.58, + "learning_rate": 1.1270391296407983e-06, + "logits/chosen": -1.4195116758346558, + "logits/rejected": -1.4654489755630493, + "logps/chosen": -75.93804168701172, + "logps/rejected": -45.867828369140625, + "loss": 0.5554, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9300575256347656, + "rewards/margins": 0.43311309814453125, + "rewards/rejected": 3.4969444274902344, + "step": 9717 + }, + { + "epoch": 1.58, + "learning_rate": 1.1262080490231375e-06, + "logits/chosen": -0.9672865271568298, + "logits/rejected": -0.7891252636909485, + "logps/chosen": -63.34962463378906, + "logps/rejected": -26.252843856811523, + "loss": 0.2637, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.603784084320068, + "rewards/margins": 3.972783088684082, + "rewards/rejected": 1.6310011148452759, + "step": 9718 + }, + { + "epoch": 1.58, + "learning_rate": 1.1253772360478443e-06, + "logits/chosen": -1.2636535167694092, + "logits/rejected": -1.0581661462783813, + "logps/chosen": -58.68157196044922, + "logps/rejected": -25.311668395996094, + "loss": 0.1339, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4545280933380127, + "rewards/margins": 2.771440267562866, + "rewards/rejected": -0.3169122636318207, + "step": 9719 + }, + { + "epoch": 1.58, + "learning_rate": 1.1245466907723235e-06, + "logits/chosen": -1.399336576461792, + "logits/rejected": -1.4124432802200317, + "logps/chosen": -85.84085083007812, + "logps/rejected": -69.99737548828125, + "loss": 1.755, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.631182909011841, + "rewards/margins": -3.244131326675415, + "rewards/rejected": 5.875314235687256, + "step": 9720 + }, + { + "epoch": 1.58, + "learning_rate": 1.1237164132539552e-06, + "logits/chosen": -1.256933569908142, + "logits/rejected": -1.094957709312439, + "logps/chosen": -57.789268493652344, + "logps/rejected": -37.63628005981445, + "loss": 0.3647, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.170307159423828, + "rewards/margins": 3.3017756938934326, + "rewards/rejected": -0.13146857917308807, + "step": 9721 + }, + { + "epoch": 1.58, + "learning_rate": 1.122886403550107e-06, + "logits/chosen": -1.3898553848266602, + "logits/rejected": -1.3665106296539307, + "logps/chosen": -75.8826904296875, + "logps/rejected": -78.59093475341797, + "loss": 0.1262, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.446669101715088, + "rewards/margins": 1.3859894275665283, + "rewards/rejected": 1.0606796741485596, + "step": 9722 + }, + { + "epoch": 1.58, + "learning_rate": 1.1220566617181205e-06, + "logits/chosen": -1.171450138092041, + "logits/rejected": -1.1400483846664429, + "logps/chosen": -79.83716583251953, + "logps/rejected": -119.02490234375, + "loss": 0.4884, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8149101734161377, + "rewards/margins": 1.7087318897247314, + "rewards/rejected": 1.1061782836914062, + "step": 9723 + }, + { + "epoch": 1.58, + "learning_rate": 1.1212271878153268e-06, + "logits/chosen": -1.176661491394043, + "logits/rejected": -1.0464398860931396, + "logps/chosen": -166.8359375, + "logps/rejected": -48.986087799072266, + "loss": 0.2615, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.972364902496338, + "rewards/margins": 6.192916393280029, + "rewards/rejected": 1.779448390007019, + "step": 9724 + }, + { + "epoch": 1.58, + "learning_rate": 1.1203979818990318e-06, + "logits/chosen": -1.1041722297668457, + "logits/rejected": -1.0848371982574463, + "logps/chosen": -35.40632629394531, + "logps/rejected": -40.27592468261719, + "loss": 1.3645, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.552467107772827, + "rewards/margins": -1.2236392498016357, + "rewards/rejected": 3.776106357574463, + "step": 9725 + }, + { + "epoch": 1.58, + "learning_rate": 1.1195690440265288e-06, + "logits/chosen": -1.038151741027832, + "logits/rejected": -1.0450388193130493, + "logps/chosen": -1.1650254726409912, + "logps/rejected": -2.28654408454895, + "loss": 0.4873, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3262687623500824, + "rewards/margins": 0.12501627206802368, + "rewards/rejected": 0.20125249028205872, + "step": 9726 + }, + { + "epoch": 1.58, + "learning_rate": 1.1187403742550862e-06, + "logits/chosen": -1.0795100927352905, + "logits/rejected": -1.386845588684082, + "logps/chosen": -42.6086311340332, + "logps/rejected": -30.999481201171875, + "loss": 0.3518, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5323803424835205, + "rewards/margins": 1.5525245666503906, + "rewards/rejected": 1.9798557758331299, + "step": 9727 + }, + { + "epoch": 1.58, + "learning_rate": 1.11791197264196e-06, + "logits/chosen": -1.3098918199539185, + "logits/rejected": -1.3098918199539185, + "logps/chosen": -40.28812026977539, + "logps/rejected": -40.28812026977539, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.857635974884033, + "rewards/margins": 0.0, + "rewards/rejected": 2.857635974884033, + "step": 9728 + }, + { + "epoch": 1.58, + "learning_rate": 1.1170838392443834e-06, + "logits/chosen": -1.2778022289276123, + "logits/rejected": -1.208868145942688, + "logps/chosen": -50.542449951171875, + "logps/rejected": -42.843963623046875, + "loss": 0.4782, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0141441822052, + "rewards/margins": -0.4657096862792969, + "rewards/rejected": 2.479853868484497, + "step": 9729 + }, + { + "epoch": 1.58, + "learning_rate": 1.1162559741195733e-06, + "logits/chosen": -1.3579490184783936, + "logits/rejected": -1.3547192811965942, + "logps/chosen": -54.09943389892578, + "logps/rejected": -55.75323486328125, + "loss": 0.8651, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7161026000976562, + "rewards/margins": -1.4573898315429688, + "rewards/rejected": 3.173492431640625, + "step": 9730 + }, + { + "epoch": 1.58, + "learning_rate": 1.1154283773247265e-06, + "logits/chosen": -1.4275662899017334, + "logits/rejected": -1.3479430675506592, + "logps/chosen": -51.83204650878906, + "logps/rejected": -36.64337158203125, + "loss": 0.41, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.006526231765747, + "rewards/margins": 0.22785186767578125, + "rewards/rejected": 2.778674364089966, + "step": 9731 + }, + { + "epoch": 1.58, + "learning_rate": 1.114601048917024e-06, + "logits/chosen": -1.8764029741287231, + "logits/rejected": -1.8613855838775635, + "logps/chosen": -87.80534362792969, + "logps/rejected": -149.34173583984375, + "loss": 1.7158, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2283387184143066, + "rewards/margins": -2.35601806640625, + "rewards/rejected": 5.584356784820557, + "step": 9732 + }, + { + "epoch": 1.58, + "learning_rate": 1.113773988953623e-06, + "logits/chosen": -1.3762761354446411, + "logits/rejected": -1.2403411865234375, + "logps/chosen": -46.604339599609375, + "logps/rejected": -21.8273868560791, + "loss": 0.0529, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1152284145355225, + "rewards/margins": 2.54097843170166, + "rewards/rejected": 0.5742498636245728, + "step": 9733 + }, + { + "epoch": 1.58, + "learning_rate": 1.1129471974916696e-06, + "logits/chosen": -1.1564513444900513, + "logits/rejected": -1.0520075559616089, + "logps/chosen": -32.28908920288086, + "logps/rejected": -40.1594352722168, + "loss": 0.4415, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4686535596847534, + "rewards/margins": 0.35169756412506104, + "rewards/rejected": 1.1169559955596924, + "step": 9734 + }, + { + "epoch": 1.58, + "learning_rate": 1.1121206745882834e-06, + "logits/chosen": -1.5688735246658325, + "logits/rejected": -1.4945544004440308, + "logps/chosen": -58.02760314941406, + "logps/rejected": -42.33789825439453, + "loss": 1.3371, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6484391689300537, + "rewards/margins": 0.3147447109222412, + "rewards/rejected": 2.3336944580078125, + "step": 9735 + }, + { + "epoch": 1.58, + "learning_rate": 1.111294420300571e-06, + "logits/chosen": -1.1609623432159424, + "logits/rejected": -1.1609623432159424, + "logps/chosen": -50.20559310913086, + "logps/rejected": -50.20559310913086, + "loss": 1.1573, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.587804079055786, + "rewards/margins": 0.0, + "rewards/rejected": 2.587804079055786, + "step": 9736 + }, + { + "epoch": 1.58, + "learning_rate": 1.110468434685621e-06, + "logits/chosen": -1.2377233505249023, + "logits/rejected": -1.314002513885498, + "logps/chosen": -56.70857238769531, + "logps/rejected": -114.25559997558594, + "loss": 2.2, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.213217258453369, + "rewards/margins": -4.384461879730225, + "rewards/rejected": 7.597679138183594, + "step": 9737 + }, + { + "epoch": 1.58, + "learning_rate": 1.109642717800497e-06, + "logits/chosen": -1.2113081216812134, + "logits/rejected": -1.1148791313171387, + "logps/chosen": -48.118492126464844, + "logps/rejected": -21.298093795776367, + "loss": 1.9606, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.685754299163818, + "rewards/margins": 2.834325075149536, + "rewards/rejected": 2.8514292240142822, + "step": 9738 + }, + { + "epoch": 1.58, + "learning_rate": 1.108817269702252e-06, + "logits/chosen": -1.6501954793930054, + "logits/rejected": -1.660713791847229, + "logps/chosen": -107.77851867675781, + "logps/rejected": -105.68236541748047, + "loss": 0.3769, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.323767185211182, + "rewards/margins": -0.1012430191040039, + "rewards/rejected": 6.4250102043151855, + "step": 9739 + }, + { + "epoch": 1.58, + "learning_rate": 1.1079920904479135e-06, + "logits/chosen": -1.5270681381225586, + "logits/rejected": -1.5332626104354858, + "logps/chosen": -122.77407836914062, + "logps/rejected": -138.37997436523438, + "loss": 0.144, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.110244750976562, + "rewards/margins": 1.1196975708007812, + "rewards/rejected": 6.990547180175781, + "step": 9740 + }, + { + "epoch": 1.58, + "learning_rate": 1.1071671800944962e-06, + "logits/chosen": -1.3031011819839478, + "logits/rejected": -1.412198781967163, + "logps/chosen": -61.17460632324219, + "logps/rejected": -106.53445434570312, + "loss": 1.6213, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0940797328948975, + "rewards/margins": -3.201677083969116, + "rewards/rejected": 5.295756816864014, + "step": 9741 + }, + { + "epoch": 1.58, + "learning_rate": 1.1063425386989913e-06, + "logits/chosen": -1.0118908882141113, + "logits/rejected": -0.9990748763084412, + "logps/chosen": -69.3502197265625, + "logps/rejected": -54.38862609863281, + "loss": 1.1931, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9764655828475952, + "rewards/margins": -2.281045436859131, + "rewards/rejected": 4.257511138916016, + "step": 9742 + }, + { + "epoch": 1.58, + "learning_rate": 1.1055181663183763e-06, + "logits/chosen": -1.4315489530563354, + "logits/rejected": -1.4315489530563354, + "logps/chosen": -37.06782531738281, + "logps/rejected": -37.06782531738281, + "loss": 0.379, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.129316806793213, + "rewards/margins": 0.0, + "rewards/rejected": 4.129316806793213, + "step": 9743 + }, + { + "epoch": 1.58, + "learning_rate": 1.1046940630096049e-06, + "logits/chosen": -1.4381825923919678, + "logits/rejected": -1.4163942337036133, + "logps/chosen": -98.07240295410156, + "logps/rejected": -91.47283172607422, + "loss": 0.3177, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.603621006011963, + "rewards/margins": 0.21428775787353516, + "rewards/rejected": 5.389333248138428, + "step": 9744 + }, + { + "epoch": 1.58, + "learning_rate": 1.1038702288296166e-06, + "logits/chosen": -1.3525663614273071, + "logits/rejected": -1.4881221055984497, + "logps/chosen": -73.59025573730469, + "logps/rejected": -143.28921508789062, + "loss": 1.7154, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.8455963134765625, + "rewards/margins": -3.396028518676758, + "rewards/rejected": 8.24162483215332, + "step": 9745 + }, + { + "epoch": 1.58, + "learning_rate": 1.1030466638353293e-06, + "logits/chosen": -1.2189780473709106, + "logits/rejected": -1.1245685815811157, + "logps/chosen": -75.0200424194336, + "logps/rejected": -44.669654846191406, + "loss": 0.1068, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8191277980804443, + "rewards/margins": 1.4659074544906616, + "rewards/rejected": 1.3532203435897827, + "step": 9746 + }, + { + "epoch": 1.58, + "learning_rate": 1.1022233680836452e-06, + "logits/chosen": -1.6179612874984741, + "logits/rejected": -1.5398547649383545, + "logps/chosen": -48.681453704833984, + "logps/rejected": -28.15760612487793, + "loss": 0.0666, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0227468013763428, + "rewards/margins": 2.53259015083313, + "rewards/rejected": -0.5098432898521423, + "step": 9747 + }, + { + "epoch": 1.58, + "learning_rate": 1.101400341631444e-06, + "logits/chosen": -1.277850866317749, + "logits/rejected": -1.134697675704956, + "logps/chosen": -36.424102783203125, + "logps/rejected": -22.82514762878418, + "loss": 0.0607, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.3390045166015625, + "rewards/margins": 3.2604875564575195, + "rewards/rejected": 1.078516960144043, + "step": 9748 + }, + { + "epoch": 1.58, + "learning_rate": 1.100577584535592e-06, + "logits/chosen": -1.1580432653427124, + "logits/rejected": -1.1594078540802002, + "logps/chosen": -37.30224609375, + "logps/rejected": -23.364065170288086, + "loss": 0.1756, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.128946304321289, + "rewards/margins": 1.0832290649414062, + "rewards/rejected": 0.04571723937988281, + "step": 9749 + }, + { + "epoch": 1.58, + "learning_rate": 1.0997550968529302e-06, + "logits/chosen": -1.2064193487167358, + "logits/rejected": -1.1747766733169556, + "logps/chosen": -100.25239562988281, + "logps/rejected": -128.47177124023438, + "loss": 0.8304, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0737533569335938, + "rewards/margins": 1.7244033813476562, + "rewards/rejected": 1.3493499755859375, + "step": 9750 + }, + { + "epoch": 1.58, + "learning_rate": 1.0989328786402887e-06, + "logits/chosen": -1.0998423099517822, + "logits/rejected": -1.0223904848098755, + "logps/chosen": -39.154075622558594, + "logps/rejected": -41.26093292236328, + "loss": 0.3469, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.466378927230835, + "rewards/margins": 1.048861026763916, + "rewards/rejected": 1.417517900466919, + "step": 9751 + }, + { + "epoch": 1.58, + "learning_rate": 1.0981109299544713e-06, + "logits/chosen": -1.3166011571884155, + "logits/rejected": -1.296722650527954, + "logps/chosen": -61.80158996582031, + "logps/rejected": -70.69232940673828, + "loss": 0.6269, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3264710903167725, + "rewards/margins": 0.7984750270843506, + "rewards/rejected": 1.5279960632324219, + "step": 9752 + }, + { + "epoch": 1.58, + "learning_rate": 1.0972892508522704e-06, + "logits/chosen": -1.0990595817565918, + "logits/rejected": -1.1426950693130493, + "logps/chosen": -12.162965774536133, + "logps/rejected": -40.16339111328125, + "loss": 0.4183, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8986963629722595, + "rewards/margins": -0.2334997057914734, + "rewards/rejected": 1.132196068763733, + "step": 9753 + }, + { + "epoch": 1.58, + "learning_rate": 1.0964678413904529e-06, + "logits/chosen": -1.2774124145507812, + "logits/rejected": -1.1821558475494385, + "logps/chosen": -110.82015991210938, + "logps/rejected": -111.67106628417969, + "loss": 0.5722, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.221164226531982, + "rewards/margins": 0.9500184059143066, + "rewards/rejected": 4.271145820617676, + "step": 9754 + }, + { + "epoch": 1.58, + "learning_rate": 1.0956467016257732e-06, + "logits/chosen": -1.5084887742996216, + "logits/rejected": -1.3383744955062866, + "logps/chosen": -149.9621124267578, + "logps/rejected": -23.771230697631836, + "loss": 0.1526, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.909816026687622, + "rewards/margins": 1.1463367938995361, + "rewards/rejected": 0.7634792327880859, + "step": 9755 + }, + { + "epoch": 1.58, + "learning_rate": 1.0948258316149619e-06, + "logits/chosen": -1.1232631206512451, + "logits/rejected": -0.9983776211738586, + "logps/chosen": -87.3812255859375, + "logps/rejected": -33.108970642089844, + "loss": 0.1534, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.684616804122925, + "rewards/margins": 1.0306086540222168, + "rewards/rejected": 2.654008150100708, + "step": 9756 + }, + { + "epoch": 1.58, + "learning_rate": 1.094005231414736e-06, + "logits/chosen": -1.0799440145492554, + "logits/rejected": -1.0799440145492554, + "logps/chosen": -38.19468307495117, + "logps/rejected": -38.19468307495117, + "loss": 0.3767, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2649716138839722, + "rewards/margins": 0.0, + "rewards/rejected": 1.2649716138839722, + "step": 9757 + }, + { + "epoch": 1.58, + "learning_rate": 1.093184901081788e-06, + "logits/chosen": -1.2170982360839844, + "logits/rejected": -1.220447063446045, + "logps/chosen": -73.2564697265625, + "logps/rejected": -57.56779861450195, + "loss": 0.2055, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.297217607498169, + "rewards/margins": 0.7518305778503418, + "rewards/rejected": 2.545387029647827, + "step": 9758 + }, + { + "epoch": 1.58, + "learning_rate": 1.0923648406727983e-06, + "logits/chosen": -1.4636808633804321, + "logits/rejected": -0.9600786566734314, + "logps/chosen": -49.69562911987305, + "logps/rejected": -100.88570404052734, + "loss": 0.1428, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.690880298614502, + "rewards/margins": 1.8768444061279297, + "rewards/rejected": 3.8140358924865723, + "step": 9759 + }, + { + "epoch": 1.58, + "learning_rate": 1.0915450502444226e-06, + "logits/chosen": -1.335201382637024, + "logits/rejected": -1.3270988464355469, + "logps/chosen": -82.51834106445312, + "logps/rejected": -67.08550262451172, + "loss": 0.8263, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5603349208831787, + "rewards/margins": -1.4142751693725586, + "rewards/rejected": 3.9746100902557373, + "step": 9760 + }, + { + "epoch": 1.58, + "learning_rate": 1.0907255298533026e-06, + "logits/chosen": -1.5215566158294678, + "logits/rejected": -1.376692771911621, + "logps/chosen": -77.63198852539062, + "logps/rejected": -59.670955657958984, + "loss": 0.0993, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.4709930419921875, + "rewards/margins": 2.1030619144439697, + "rewards/rejected": 2.3679311275482178, + "step": 9761 + }, + { + "epoch": 1.58, + "learning_rate": 1.0899062795560572e-06, + "logits/chosen": -1.2174758911132812, + "logits/rejected": -1.3043138980865479, + "logps/chosen": -62.034523010253906, + "logps/rejected": -56.56608963012695, + "loss": 0.731, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7399544715881348, + "rewards/margins": -0.9385097026824951, + "rewards/rejected": 3.67846417427063, + "step": 9762 + }, + { + "epoch": 1.58, + "learning_rate": 1.0890872994092921e-06, + "logits/chosen": -1.5219464302062988, + "logits/rejected": -1.6291645765304565, + "logps/chosen": -81.00949096679688, + "logps/rejected": -153.9276885986328, + "loss": 0.5508, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.026611328125, + "rewards/margins": -0.21595335006713867, + "rewards/rejected": 6.242564678192139, + "step": 9763 + }, + { + "epoch": 1.58, + "learning_rate": 1.0882685894695878e-06, + "logits/chosen": -1.0307117700576782, + "logits/rejected": -1.0811306238174438, + "logps/chosen": -70.65774536132812, + "logps/rejected": -65.1666488647461, + "loss": 0.5142, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.227983236312866, + "rewards/margins": 0.935152530670166, + "rewards/rejected": 2.2928307056427, + "step": 9764 + }, + { + "epoch": 1.58, + "learning_rate": 1.087450149793512e-06, + "logits/chosen": -1.2042039632797241, + "logits/rejected": -1.2301714420318604, + "logps/chosen": -45.41456604003906, + "logps/rejected": -128.65982055664062, + "loss": 1.1583, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.893702745437622, + "rewards/margins": 0.5108078718185425, + "rewards/rejected": 1.3828948736190796, + "step": 9765 + }, + { + "epoch": 1.59, + "learning_rate": 1.0866319804376086e-06, + "logits/chosen": -0.9168604612350464, + "logits/rejected": -0.9168604612350464, + "logps/chosen": -77.35060119628906, + "logps/rejected": -77.35060119628906, + "loss": 0.6684, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8354218006134033, + "rewards/margins": 0.0, + "rewards/rejected": 2.8354218006134033, + "step": 9766 + }, + { + "epoch": 1.59, + "learning_rate": 1.0858140814584083e-06, + "logits/chosen": -1.2845388650894165, + "logits/rejected": -1.2603033781051636, + "logps/chosen": -47.156044006347656, + "logps/rejected": -8.79317569732666, + "loss": 0.2291, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4176644086837769, + "rewards/margins": 0.7602741718292236, + "rewards/rejected": 0.6573902368545532, + "step": 9767 + }, + { + "epoch": 1.59, + "learning_rate": 1.084996452912417e-06, + "logits/chosen": -1.1772609949111938, + "logits/rejected": -1.123163104057312, + "logps/chosen": -66.45330810546875, + "logps/rejected": -54.08565139770508, + "loss": 0.4437, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.549903154373169, + "rewards/margins": 0.48711347579956055, + "rewards/rejected": 3.0627896785736084, + "step": 9768 + }, + { + "epoch": 1.59, + "learning_rate": 1.084179094856128e-06, + "logits/chosen": -1.398437261581421, + "logits/rejected": -1.3937972784042358, + "logps/chosen": -75.37480163574219, + "logps/rejected": -45.39885330200195, + "loss": 0.2479, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.966210961341858, + "rewards/margins": 0.7647507190704346, + "rewards/rejected": 1.2014602422714233, + "step": 9769 + }, + { + "epoch": 1.59, + "learning_rate": 1.0833620073460104e-06, + "logits/chosen": -1.155131220817566, + "logits/rejected": -1.155131220817566, + "logps/chosen": -19.361083984375, + "logps/rejected": -19.361083984375, + "loss": 0.3825, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.23891448974609375, + "rewards/margins": 0.0, + "rewards/rejected": 0.23891448974609375, + "step": 9770 + }, + { + "epoch": 1.59, + "learning_rate": 1.0825451904385198e-06, + "logits/chosen": -1.4715633392333984, + "logits/rejected": -1.1798114776611328, + "logps/chosen": -62.45814514160156, + "logps/rejected": -60.696346282958984, + "loss": 0.1127, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.837542772293091, + "rewards/margins": 1.6411099433898926, + "rewards/rejected": 2.1964328289031982, + "step": 9771 + }, + { + "epoch": 1.59, + "learning_rate": 1.0817286441900883e-06, + "logits/chosen": -1.7637956142425537, + "logits/rejected": -1.7384300231933594, + "logps/chosen": -98.6890869140625, + "logps/rejected": -166.84085083007812, + "loss": 1.4596, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.969119548797607, + "rewards/margins": -2.8482604026794434, + "rewards/rejected": 8.81737995147705, + "step": 9772 + }, + { + "epoch": 1.59, + "learning_rate": 1.080912368657132e-06, + "logits/chosen": -1.377266526222229, + "logits/rejected": -1.3857985734939575, + "logps/chosen": -101.45587158203125, + "logps/rejected": -118.27922058105469, + "loss": 0.2435, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.142068386077881, + "rewards/margins": 0.7020277976989746, + "rewards/rejected": 6.440040588378906, + "step": 9773 + }, + { + "epoch": 1.59, + "learning_rate": 1.08009636389605e-06, + "logits/chosen": -1.5777606964111328, + "logits/rejected": -1.5409053564071655, + "logps/chosen": -80.97784423828125, + "logps/rejected": -61.678466796875, + "loss": 0.0571, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.920543670654297, + "rewards/margins": 2.6831917762756348, + "rewards/rejected": 1.2373520135879517, + "step": 9774 + }, + { + "epoch": 1.59, + "learning_rate": 1.0792806299632174e-06, + "logits/chosen": -1.4539011716842651, + "logits/rejected": -1.4721782207489014, + "logps/chosen": -38.088897705078125, + "logps/rejected": -48.44381332397461, + "loss": 0.3325, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.408404588699341, + "rewards/margins": 0.2646160125732422, + "rewards/rejected": 3.1437885761260986, + "step": 9775 + }, + { + "epoch": 1.59, + "learning_rate": 1.0784651669149959e-06, + "logits/chosen": -1.008685827255249, + "logits/rejected": -0.8361435532569885, + "logps/chosen": -100.6036376953125, + "logps/rejected": -49.61909103393555, + "loss": 1.2398, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.308422803878784, + "rewards/margins": 1.3968554735183716, + "rewards/rejected": 0.9115673303604126, + "step": 9776 + }, + { + "epoch": 1.59, + "learning_rate": 1.0776499748077246e-06, + "logits/chosen": -1.4913562536239624, + "logits/rejected": -1.392991065979004, + "logps/chosen": -58.42618942260742, + "logps/rejected": -14.447517395019531, + "loss": 3.2434, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7289066314697266, + "rewards/margins": 0.7876485586166382, + "rewards/rejected": 0.9412580728530884, + "step": 9777 + }, + { + "epoch": 1.59, + "learning_rate": 1.076835053697728e-06, + "logits/chosen": -1.3606562614440918, + "logits/rejected": -1.2630382776260376, + "logps/chosen": -82.64376068115234, + "logps/rejected": -86.74300384521484, + "loss": 0.0397, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.482131481170654, + "rewards/margins": 2.5008256435394287, + "rewards/rejected": 3.9813058376312256, + "step": 9778 + }, + { + "epoch": 1.59, + "learning_rate": 1.0760204036413057e-06, + "logits/chosen": -1.6449050903320312, + "logits/rejected": -1.6294758319854736, + "logps/chosen": -135.09735107421875, + "logps/rejected": -125.73999786376953, + "loss": 0.627, + "rewards/accuracies": 0.0, + "rewards/chosen": 8.661128044128418, + "rewards/margins": -0.9134674072265625, + "rewards/rejected": 9.57459545135498, + "step": 9779 + }, + { + "epoch": 1.59, + "learning_rate": 1.0752060246947465e-06, + "logits/chosen": -1.2107864618301392, + "logits/rejected": -1.1933128833770752, + "logps/chosen": -106.05450439453125, + "logps/rejected": -59.399513244628906, + "loss": 0.2502, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.570275783538818, + "rewards/margins": 2.384840965270996, + "rewards/rejected": 2.1854348182678223, + "step": 9780 + }, + { + "epoch": 1.59, + "learning_rate": 1.0743919169143125e-06, + "logits/chosen": -1.1151838302612305, + "logits/rejected": -1.1611242294311523, + "logps/chosen": -61.62248611450195, + "logps/rejected": -65.23210144042969, + "loss": 0.7827, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1216824054718018, + "rewards/margins": -0.330844521522522, + "rewards/rejected": 1.4525269269943237, + "step": 9781 + }, + { + "epoch": 1.59, + "learning_rate": 1.073578080356254e-06, + "logits/chosen": -1.1970115900039673, + "logits/rejected": -1.2339061498641968, + "logps/chosen": -56.19270706176758, + "logps/rejected": -67.87140655517578, + "loss": 1.414, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9761570692062378, + "rewards/margins": -2.072411060333252, + "rewards/rejected": 4.048568248748779, + "step": 9782 + }, + { + "epoch": 1.59, + "learning_rate": 1.0727645150767967e-06, + "logits/chosen": -1.420664668083191, + "logits/rejected": -1.420664668083191, + "logps/chosen": -49.8905143737793, + "logps/rejected": -49.8905143737793, + "loss": 0.3767, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.861028671264648, + "rewards/margins": 0.0, + "rewards/rejected": 5.861028671264648, + "step": 9783 + }, + { + "epoch": 1.59, + "learning_rate": 1.071951221132153e-06, + "logits/chosen": -1.3907157182693481, + "logits/rejected": -1.1701816320419312, + "logps/chosen": -107.63863372802734, + "logps/rejected": -36.71241760253906, + "loss": 0.8979, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.585592031478882, + "rewards/margins": 2.000952959060669, + "rewards/rejected": 0.5846390128135681, + "step": 9784 + }, + { + "epoch": 1.59, + "learning_rate": 1.0711381985785113e-06, + "logits/chosen": -1.4242782592773438, + "logits/rejected": -1.375680685043335, + "logps/chosen": -196.4891357421875, + "logps/rejected": -190.6770477294922, + "loss": 0.0655, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.50107479095459, + "rewards/margins": 2.3797049522399902, + "rewards/rejected": 6.1213698387146, + "step": 9785 + }, + { + "epoch": 1.59, + "learning_rate": 1.070325447472046e-06, + "logits/chosen": -1.1511198282241821, + "logits/rejected": -1.1747727394104004, + "logps/chosen": -150.45974731445312, + "logps/rejected": -54.48374557495117, + "loss": 0.223, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.942665100097656, + "rewards/margins": 3.1528210639953613, + "rewards/rejected": 1.7898441553115845, + "step": 9786 + }, + { + "epoch": 1.59, + "learning_rate": 1.0695129678689076e-06, + "logits/chosen": -1.589640498161316, + "logits/rejected": -1.540601134300232, + "logps/chosen": -94.19485473632812, + "logps/rejected": -104.06072998046875, + "loss": 0.3057, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.820687770843506, + "rewards/margins": 0.18467235565185547, + "rewards/rejected": 5.63601541519165, + "step": 9787 + }, + { + "epoch": 1.59, + "learning_rate": 1.068700759825234e-06, + "logits/chosen": -1.243859887123108, + "logits/rejected": -1.2401399612426758, + "logps/chosen": -96.67021179199219, + "logps/rejected": -104.42084503173828, + "loss": 0.204, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.311875820159912, + "rewards/margins": 1.1543021202087402, + "rewards/rejected": 5.157573699951172, + "step": 9788 + }, + { + "epoch": 1.59, + "learning_rate": 1.0678888233971386e-06, + "logits/chosen": -1.5875635147094727, + "logits/rejected": -1.597784161567688, + "logps/chosen": -48.947723388671875, + "logps/rejected": -62.35026550292969, + "loss": 1.2801, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0123183727264404, + "rewards/margins": -1.5516068935394287, + "rewards/rejected": 3.563925266265869, + "step": 9789 + }, + { + "epoch": 1.59, + "learning_rate": 1.0670771586407208e-06, + "logits/chosen": -1.2069913148880005, + "logits/rejected": -1.2656559944152832, + "logps/chosen": -77.55602264404297, + "logps/rejected": -79.84441375732422, + "loss": 0.7092, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4901435375213623, + "rewards/margins": -0.936629056930542, + "rewards/rejected": 4.426772594451904, + "step": 9790 + }, + { + "epoch": 1.59, + "learning_rate": 1.0662657656120561e-06, + "logits/chosen": -1.2875360250473022, + "logits/rejected": -1.344794511795044, + "logps/chosen": -60.69544219970703, + "logps/rejected": -120.87913513183594, + "loss": 0.2655, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4602181911468506, + "rewards/margins": 0.596000075340271, + "rewards/rejected": 1.8642181158065796, + "step": 9791 + }, + { + "epoch": 1.59, + "learning_rate": 1.065454644367207e-06, + "logits/chosen": -1.5321067571640015, + "logits/rejected": -1.5807288885116577, + "logps/chosen": -32.40610122680664, + "logps/rejected": -74.9315414428711, + "loss": 1.5292, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3371798992156982, + "rewards/margins": 0.5502070188522339, + "rewards/rejected": 1.7869728803634644, + "step": 9792 + }, + { + "epoch": 1.59, + "learning_rate": 1.0646437949622118e-06, + "logits/chosen": -1.2098848819732666, + "logits/rejected": -1.2254456281661987, + "logps/chosen": -90.03153991699219, + "logps/rejected": -96.34321594238281, + "loss": 1.0831, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8579413890838623, + "rewards/margins": 1.7936309576034546, + "rewards/rejected": 1.0643104314804077, + "step": 9793 + }, + { + "epoch": 1.59, + "learning_rate": 1.0638332174530953e-06, + "logits/chosen": -1.1123608350753784, + "logits/rejected": -1.0769712924957275, + "logps/chosen": -35.91939926147461, + "logps/rejected": -34.24468994140625, + "loss": 1.4365, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.778258204460144, + "rewards/margins": -2.3024606704711914, + "rewards/rejected": 4.080718994140625, + "step": 9794 + }, + { + "epoch": 1.59, + "learning_rate": 1.0630229118958574e-06, + "logits/chosen": -1.0435349941253662, + "logits/rejected": -1.045530080795288, + "logps/chosen": -2.367431402206421, + "logps/rejected": -4.435301780700684, + "loss": 0.7392, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.24503672122955322, + "rewards/margins": -0.07429537177085876, + "rewards/rejected": 0.319332093000412, + "step": 9795 + }, + { + "epoch": 1.59, + "learning_rate": 1.0622128783464853e-06, + "logits/chosen": -1.4915854930877686, + "logits/rejected": -1.5606735944747925, + "logps/chosen": -39.14802169799805, + "logps/rejected": -70.98652648925781, + "loss": 2.6365, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.685469388961792, + "rewards/margins": 1.6650798320770264, + "rewards/rejected": 2.0203895568847656, + "step": 9796 + }, + { + "epoch": 1.59, + "learning_rate": 1.061403116860943e-06, + "logits/chosen": -1.3566728830337524, + "logits/rejected": -1.3566728830337524, + "logps/chosen": -55.14337158203125, + "logps/rejected": -55.14337158203125, + "loss": 0.3738, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.553229570388794, + "rewards/margins": 0.0, + "rewards/rejected": 3.553229570388794, + "step": 9797 + }, + { + "epoch": 1.59, + "learning_rate": 1.0605936274951783e-06, + "logits/chosen": -1.3331526517868042, + "logits/rejected": -1.3652178049087524, + "logps/chosen": -94.81474304199219, + "logps/rejected": -122.71702575683594, + "loss": 0.1277, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.135186672210693, + "rewards/margins": 2.4946699142456055, + "rewards/rejected": 4.640516757965088, + "step": 9798 + }, + { + "epoch": 1.59, + "learning_rate": 1.0597844103051186e-06, + "logits/chosen": -1.4491326808929443, + "logits/rejected": -1.4700417518615723, + "logps/chosen": -55.590614318847656, + "logps/rejected": -70.76528930664062, + "loss": 0.3553, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1100547313690186, + "rewards/margins": -0.03198719024658203, + "rewards/rejected": 3.1420419216156006, + "step": 9799 + }, + { + "epoch": 1.59, + "learning_rate": 1.0589754653466745e-06, + "logits/chosen": -1.1917285919189453, + "logits/rejected": -1.1326245069503784, + "logps/chosen": -78.16918182373047, + "logps/rejected": -44.24258041381836, + "loss": 0.2324, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8716140985488892, + "rewards/margins": 0.5541031360626221, + "rewards/rejected": 1.317510962486267, + "step": 9800 + }, + { + "epoch": 1.59, + "learning_rate": 1.0581667926757338e-06, + "logits/chosen": -1.3560266494750977, + "logits/rejected": -1.3141802549362183, + "logps/chosen": -105.99053955078125, + "logps/rejected": -99.88729858398438, + "loss": 0.1639, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.778451681137085, + "rewards/margins": 0.974746823310852, + "rewards/rejected": 1.803704857826233, + "step": 9801 + }, + { + "epoch": 1.59, + "learning_rate": 1.057358392348171e-06, + "logits/chosen": -1.3495824337005615, + "logits/rejected": -1.2589595317840576, + "logps/chosen": -69.8579330444336, + "logps/rejected": -44.6005973815918, + "loss": 0.8731, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.822293996810913, + "rewards/margins": 0.6551656723022461, + "rewards/rejected": 3.167128324508667, + "step": 9802 + }, + { + "epoch": 1.59, + "learning_rate": 1.0565502644198371e-06, + "logits/chosen": -1.4880902767181396, + "logits/rejected": -1.5662941932678223, + "logps/chosen": -169.45208740234375, + "logps/rejected": -78.78369140625, + "loss": 0.1524, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.506019592285156, + "rewards/margins": 1.2107176780700684, + "rewards/rejected": 6.295301914215088, + "step": 9803 + }, + { + "epoch": 1.59, + "learning_rate": 1.0557424089465674e-06, + "logits/chosen": -1.0909322500228882, + "logits/rejected": -1.0587570667266846, + "logps/chosen": -115.46153259277344, + "logps/rejected": -71.54470825195312, + "loss": 0.5734, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5185317993164062, + "rewards/margins": -0.4013230800628662, + "rewards/rejected": 2.9198548793792725, + "step": 9804 + }, + { + "epoch": 1.59, + "learning_rate": 1.0549348259841753e-06, + "logits/chosen": -1.360990047454834, + "logits/rejected": -1.3516924381256104, + "logps/chosen": -38.68890380859375, + "logps/rejected": -106.57241821289062, + "loss": 0.6944, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.087970018386841, + "rewards/margins": 0.4799621105194092, + "rewards/rejected": 2.6080079078674316, + "step": 9805 + }, + { + "epoch": 1.59, + "learning_rate": 1.0541275155884596e-06, + "logits/chosen": -1.1037038564682007, + "logits/rejected": -1.1165446043014526, + "logps/chosen": -43.52067565917969, + "logps/rejected": -92.4409408569336, + "loss": 0.3817, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.575183153152466, + "rewards/margins": 0.7815964221954346, + "rewards/rejected": 1.7935867309570312, + "step": 9806 + }, + { + "epoch": 1.59, + "learning_rate": 1.0533204778151945e-06, + "logits/chosen": -1.2625210285186768, + "logits/rejected": -1.2414577007293701, + "logps/chosen": -166.24525451660156, + "logps/rejected": -96.8963623046875, + "loss": 0.09, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.455268859863281, + "rewards/margins": 2.0503463745117188, + "rewards/rejected": 2.4049224853515625, + "step": 9807 + }, + { + "epoch": 1.59, + "learning_rate": 1.0525137127201407e-06, + "logits/chosen": -1.3533570766448975, + "logits/rejected": -1.3268941640853882, + "logps/chosen": -52.811256408691406, + "logps/rejected": -45.807586669921875, + "loss": 0.5664, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5039803981781006, + "rewards/margins": 0.4870903491973877, + "rewards/rejected": 2.016890048980713, + "step": 9808 + }, + { + "epoch": 1.59, + "learning_rate": 1.0517072203590394e-06, + "logits/chosen": -0.950043797492981, + "logits/rejected": -0.9707348346710205, + "logps/chosen": -25.826513290405273, + "logps/rejected": -22.902576446533203, + "loss": 0.392, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9392839670181274, + "rewards/margins": 0.04161149263381958, + "rewards/rejected": 0.8976724743843079, + "step": 9809 + }, + { + "epoch": 1.59, + "learning_rate": 1.0509010007876085e-06, + "logits/chosen": -1.0688639879226685, + "logits/rejected": -1.074276328086853, + "logps/chosen": -53.21687698364258, + "logps/rejected": -44.13950729370117, + "loss": 0.3223, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.795264959335327, + "rewards/margins": 0.28478479385375977, + "rewards/rejected": 2.5104801654815674, + "step": 9810 + }, + { + "epoch": 1.59, + "learning_rate": 1.0500950540615534e-06, + "logits/chosen": -1.634097695350647, + "logits/rejected": -1.6122382879257202, + "logps/chosen": -74.83121490478516, + "logps/rejected": -120.55271911621094, + "loss": 0.4624, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0139496326446533, + "rewards/margins": 0.9900345802307129, + "rewards/rejected": 2.0239150524139404, + "step": 9811 + }, + { + "epoch": 1.59, + "learning_rate": 1.0492893802365546e-06, + "logits/chosen": -1.0690099000930786, + "logits/rejected": -1.0393935441970825, + "logps/chosen": -38.48095703125, + "logps/rejected": -44.44159698486328, + "loss": 1.0046, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7187836170196533, + "rewards/margins": -0.5587646961212158, + "rewards/rejected": 3.277548313140869, + "step": 9812 + }, + { + "epoch": 1.59, + "learning_rate": 1.0484839793682783e-06, + "logits/chosen": -1.4103360176086426, + "logits/rejected": -1.4301998615264893, + "logps/chosen": -50.09273910522461, + "logps/rejected": -33.694034576416016, + "loss": 0.391, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9124462604522705, + "rewards/margins": -0.1694939136505127, + "rewards/rejected": 2.081940174102783, + "step": 9813 + }, + { + "epoch": 1.59, + "learning_rate": 1.0476788515123687e-06, + "logits/chosen": -1.601547122001648, + "logits/rejected": -1.593809962272644, + "logps/chosen": -108.68270874023438, + "logps/rejected": -147.55694580078125, + "loss": 1.0913, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.848565578460693, + "rewards/margins": -1.9610295295715332, + "rewards/rejected": 8.809595108032227, + "step": 9814 + }, + { + "epoch": 1.59, + "learning_rate": 1.0468739967244556e-06, + "logits/chosen": -1.0941411256790161, + "logits/rejected": -1.08393132686615, + "logps/chosen": -72.487548828125, + "logps/rejected": -27.551877975463867, + "loss": 0.4177, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.79995197057724, + "rewards/margins": -0.18405091762542725, + "rewards/rejected": 0.9840028882026672, + "step": 9815 + }, + { + "epoch": 1.59, + "learning_rate": 1.0460694150601418e-06, + "logits/chosen": -1.0038913488388062, + "logits/rejected": -1.0038913488388062, + "logps/chosen": -1.0826650857925415, + "logps/rejected": -1.0826650857925415, + "loss": 0.6943, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1685846894979477, + "rewards/margins": 0.0, + "rewards/rejected": 0.1685846894979477, + "step": 9816 + }, + { + "epoch": 1.59, + "learning_rate": 1.0452651065750202e-06, + "logits/chosen": -1.437524676322937, + "logits/rejected": -1.2930209636688232, + "logps/chosen": -71.91381072998047, + "logps/rejected": -55.04234313964844, + "loss": 0.036, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9950859546661377, + "rewards/margins": 2.651646375656128, + "rewards/rejected": 0.3434394896030426, + "step": 9817 + }, + { + "epoch": 1.59, + "learning_rate": 1.0444610713246589e-06, + "logits/chosen": -1.2002736330032349, + "logits/rejected": -1.1619409322738647, + "logps/chosen": -116.8187484741211, + "logps/rejected": -49.92138671875, + "loss": 0.0443, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.018442630767822, + "rewards/margins": 2.9147040843963623, + "rewards/rejected": 2.10373854637146, + "step": 9818 + }, + { + "epoch": 1.59, + "learning_rate": 1.0436573093646107e-06, + "logits/chosen": -0.8844409584999084, + "logits/rejected": -0.8770298361778259, + "logps/chosen": -3.040680408477783, + "logps/rejected": -6.18343448638916, + "loss": 0.3037, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40037187933921814, + "rewards/margins": 0.21778492629528046, + "rewards/rejected": 0.18258695304393768, + "step": 9819 + }, + { + "epoch": 1.59, + "learning_rate": 1.0428538207504057e-06, + "logits/chosen": -1.1717413663864136, + "logits/rejected": -0.9175766706466675, + "logps/chosen": -177.480712890625, + "logps/rejected": -70.34402465820312, + "loss": 0.0461, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.614044189453125, + "rewards/margins": 2.498941659927368, + "rewards/rejected": 2.115102529525757, + "step": 9820 + }, + { + "epoch": 1.59, + "learning_rate": 1.0420506055375606e-06, + "logits/chosen": -1.5889766216278076, + "logits/rejected": -1.586129903793335, + "logps/chosen": -84.13252258300781, + "logps/rejected": -64.54496002197266, + "loss": 0.9694, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2208480834960938, + "rewards/margins": -0.9064369201660156, + "rewards/rejected": 3.1272850036621094, + "step": 9821 + }, + { + "epoch": 1.59, + "learning_rate": 1.0412476637815667e-06, + "logits/chosen": -1.2042943239212036, + "logits/rejected": -1.0755670070648193, + "logps/chosen": -94.69340515136719, + "logps/rejected": -64.89005279541016, + "loss": 0.5038, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.081408977508545, + "rewards/margins": -0.038556575775146484, + "rewards/rejected": 4.119965553283691, + "step": 9822 + }, + { + "epoch": 1.59, + "learning_rate": 1.0404449955379026e-06, + "logits/chosen": -1.3502626419067383, + "logits/rejected": -1.321911334991455, + "logps/chosen": -89.721435546875, + "logps/rejected": -71.84683227539062, + "loss": 0.0684, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3388564586639404, + "rewards/margins": 2.1204657554626465, + "rewards/rejected": 1.218390703201294, + "step": 9823 + }, + { + "epoch": 1.59, + "learning_rate": 1.0396426008620226e-06, + "logits/chosen": -1.2900748252868652, + "logits/rejected": -1.2998818159103394, + "logps/chosen": -48.23888397216797, + "logps/rejected": -73.62736511230469, + "loss": 0.4036, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6257996559143066, + "rewards/margins": -0.17119741439819336, + "rewards/rejected": 2.7969970703125, + "step": 9824 + }, + { + "epoch": 1.59, + "learning_rate": 1.0388404798093666e-06, + "logits/chosen": -1.4846872091293335, + "logits/rejected": -1.3277231454849243, + "logps/chosen": -53.71159362792969, + "logps/rejected": -12.149330139160156, + "loss": 0.2746, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2488601207733154, + "rewards/margins": 2.8825771808624268, + "rewards/rejected": 0.36628302931785583, + "step": 9825 + }, + { + "epoch": 1.59, + "learning_rate": 1.0380386324353508e-06, + "logits/chosen": -1.440647840499878, + "logits/rejected": -1.4209895133972168, + "logps/chosen": -52.09736251831055, + "logps/rejected": -78.32707214355469, + "loss": 0.3829, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4234471321105957, + "rewards/margins": 1.4818158149719238, + "rewards/rejected": 1.9416313171386719, + "step": 9826 + }, + { + "epoch": 1.6, + "learning_rate": 1.0372370587953779e-06, + "logits/chosen": -1.4519790410995483, + "logits/rejected": -1.392796516418457, + "logps/chosen": -144.34088134765625, + "logps/rejected": -99.50120544433594, + "loss": 0.2054, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.0782060623168945, + "rewards/margins": 1.1392168998718262, + "rewards/rejected": 4.938989162445068, + "step": 9827 + }, + { + "epoch": 1.6, + "learning_rate": 1.0364357589448293e-06, + "logits/chosen": -0.844329297542572, + "logits/rejected": -0.9029417037963867, + "logps/chosen": -33.072837829589844, + "logps/rejected": -84.56795501708984, + "loss": 0.3144, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4464973211288452, + "rewards/margins": 0.1805328130722046, + "rewards/rejected": 1.2659645080566406, + "step": 9828 + }, + { + "epoch": 1.6, + "learning_rate": 1.0356347329390649e-06, + "logits/chosen": -1.307081937789917, + "logits/rejected": -1.2450172901153564, + "logps/chosen": -104.34759521484375, + "logps/rejected": -65.43890380859375, + "loss": 0.0686, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.564416408538818, + "rewards/margins": 2.087592840194702, + "rewards/rejected": 3.476823568344116, + "step": 9829 + }, + { + "epoch": 1.6, + "learning_rate": 1.03483398083343e-06, + "logits/chosen": -1.2444030046463013, + "logits/rejected": -1.2702083587646484, + "logps/chosen": -59.8916015625, + "logps/rejected": -57.16316604614258, + "loss": 0.2757, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2486854791641235, + "rewards/margins": 0.36235010623931885, + "rewards/rejected": 0.8863353729248047, + "step": 9830 + }, + { + "epoch": 1.6, + "learning_rate": 1.0340335026832476e-06, + "logits/chosen": -1.1773908138275146, + "logits/rejected": -1.1479182243347168, + "logps/chosen": -53.025474548339844, + "logps/rejected": -36.6854133605957, + "loss": 0.6159, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3453285694122314, + "rewards/margins": 1.5600204467773438, + "rewards/rejected": 0.7853080630302429, + "step": 9831 + }, + { + "epoch": 1.6, + "learning_rate": 1.0332332985438248e-06, + "logits/chosen": -1.3745777606964111, + "logits/rejected": -1.2946484088897705, + "logps/chosen": -91.6299819946289, + "logps/rejected": -24.98380470275879, + "loss": 0.736, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.964307427406311, + "rewards/margins": 2.2871909141540527, + "rewards/rejected": -0.3228834271430969, + "step": 9832 + }, + { + "epoch": 1.6, + "learning_rate": 1.0324333684704463e-06, + "logits/chosen": -1.0583388805389404, + "logits/rejected": -1.1816301345825195, + "logps/chosen": -72.17996215820312, + "logps/rejected": -74.95113372802734, + "loss": 1.4531, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9747238159179688, + "rewards/margins": -2.3547677993774414, + "rewards/rejected": 5.32949161529541, + "step": 9833 + }, + { + "epoch": 1.6, + "learning_rate": 1.0316337125183817e-06, + "logits/chosen": -1.2661150693893433, + "logits/rejected": -1.3231173753738403, + "logps/chosen": -75.51011657714844, + "logps/rejected": -82.76255798339844, + "loss": 0.6282, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0248305797576904, + "rewards/margins": -0.3409919738769531, + "rewards/rejected": 3.3658225536346436, + "step": 9834 + }, + { + "epoch": 1.6, + "learning_rate": 1.030834330742877e-06, + "logits/chosen": -1.5444087982177734, + "logits/rejected": -1.6551498174667358, + "logps/chosen": -63.961605072021484, + "logps/rejected": -99.20824432373047, + "loss": 1.6743, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.189731121063232, + "rewards/margins": -3.1962666511535645, + "rewards/rejected": 7.385997772216797, + "step": 9835 + }, + { + "epoch": 1.6, + "learning_rate": 1.030035223199165e-06, + "logits/chosen": -1.2235379219055176, + "logits/rejected": -1.2009872198104858, + "logps/chosen": -122.04434204101562, + "logps/rejected": -57.001304626464844, + "loss": 2.5413, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.904577732086182, + "rewards/margins": 0.6736564636230469, + "rewards/rejected": 4.230921268463135, + "step": 9836 + }, + { + "epoch": 1.6, + "learning_rate": 1.0292363899424535e-06, + "logits/chosen": -1.1480125188827515, + "logits/rejected": -1.0555901527404785, + "logps/chosen": -56.4827880859375, + "logps/rejected": -64.35327911376953, + "loss": 1.3317, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.008061170578003, + "rewards/margins": 2.3631527423858643, + "rewards/rejected": -0.35509148240089417, + "step": 9837 + }, + { + "epoch": 1.6, + "learning_rate": 1.028437831027937e-06, + "logits/chosen": -1.2840074300765991, + "logits/rejected": -1.2840074300765991, + "logps/chosen": -53.52655029296875, + "logps/rejected": -53.52655029296875, + "loss": 0.9537, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3797149658203125, + "rewards/margins": 0.0, + "rewards/rejected": 3.3797149658203125, + "step": 9838 + }, + { + "epoch": 1.6, + "learning_rate": 1.0276395465107859e-06, + "logits/chosen": -1.0302979946136475, + "logits/rejected": -0.7579957246780396, + "logps/chosen": -67.01020812988281, + "logps/rejected": -54.94818115234375, + "loss": 0.1507, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.301046848297119, + "rewards/margins": 2.6877591609954834, + "rewards/rejected": -0.38671228289604187, + "step": 9839 + }, + { + "epoch": 1.6, + "learning_rate": 1.0268415364461566e-06, + "logits/chosen": -1.2351943254470825, + "logits/rejected": -1.1786648035049438, + "logps/chosen": -101.42732238769531, + "logps/rejected": -109.04728698730469, + "loss": 0.2497, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4734864234924316, + "rewards/margins": 0.9764785766601562, + "rewards/rejected": 2.4970078468322754, + "step": 9840 + }, + { + "epoch": 1.6, + "learning_rate": 1.0260438008891816e-06, + "logits/chosen": -1.4509729146957397, + "logits/rejected": -1.4638373851776123, + "logps/chosen": -63.691001892089844, + "logps/rejected": -66.7323226928711, + "loss": 1.2323, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.113858222961426, + "rewards/margins": -1.8409619331359863, + "rewards/rejected": 5.954820156097412, + "step": 9841 + }, + { + "epoch": 1.6, + "learning_rate": 1.025246339894979e-06, + "logits/chosen": -1.2442231178283691, + "logits/rejected": -1.123184084892273, + "logps/chosen": -28.776058197021484, + "logps/rejected": -14.500307083129883, + "loss": 0.23, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9857590198516846, + "rewards/margins": 0.9666032791137695, + "rewards/rejected": 1.019155740737915, + "step": 9842 + }, + { + "epoch": 1.6, + "learning_rate": 1.0244491535186436e-06, + "logits/chosen": -1.4447602033615112, + "logits/rejected": -1.41176438331604, + "logps/chosen": -52.337562561035156, + "logps/rejected": -90.39619445800781, + "loss": 1.1735, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3429458141326904, + "rewards/margins": -0.5202744007110596, + "rewards/rejected": 2.86322021484375, + "step": 9843 + }, + { + "epoch": 1.6, + "learning_rate": 1.0236522418152562e-06, + "logits/chosen": -0.9863002300262451, + "logits/rejected": -0.9624730944633484, + "logps/chosen": -19.594139099121094, + "logps/rejected": -1.6097438335418701, + "loss": 1.0818, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.14699135720729828, + "rewards/margins": -0.2740612030029297, + "rewards/rejected": 0.4210525453090668, + "step": 9844 + }, + { + "epoch": 1.6, + "learning_rate": 1.022855604839873e-06, + "logits/chosen": -1.0017259120941162, + "logits/rejected": -0.9668186902999878, + "logps/chosen": -61.99527359008789, + "logps/rejected": -64.55043029785156, + "loss": 0.6821, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4992092847824097, + "rewards/margins": 0.04464459419250488, + "rewards/rejected": 1.4545646905899048, + "step": 9845 + }, + { + "epoch": 1.6, + "learning_rate": 1.0220592426475367e-06, + "logits/chosen": -1.0034974813461304, + "logits/rejected": -0.8629492521286011, + "logps/chosen": -47.435882568359375, + "logps/rejected": -16.095108032226562, + "loss": 0.4032, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9441757202148438, + "rewards/margins": 2.673650026321411, + "rewards/rejected": 0.2705257534980774, + "step": 9846 + }, + { + "epoch": 1.6, + "learning_rate": 1.0212631552932656e-06, + "logits/chosen": -1.3423010110855103, + "logits/rejected": -1.2543386220932007, + "logps/chosen": -56.87641143798828, + "logps/rejected": -14.018019676208496, + "loss": 0.0879, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.639585256576538, + "rewards/margins": 1.7228126525878906, + "rewards/rejected": 0.9167725443840027, + "step": 9847 + }, + { + "epoch": 1.6, + "learning_rate": 1.020467342832065e-06, + "logits/chosen": -1.230071783065796, + "logits/rejected": -1.239758849143982, + "logps/chosen": -36.912933349609375, + "logps/rejected": -68.31080627441406, + "loss": 1.1261, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.996335744857788, + "rewards/margins": -2.104177236557007, + "rewards/rejected": 5.100512981414795, + "step": 9848 + }, + { + "epoch": 1.6, + "learning_rate": 1.0196718053189147e-06, + "logits/chosen": -1.3768696784973145, + "logits/rejected": -1.0718425512313843, + "logps/chosen": -69.47354888916016, + "logps/rejected": -17.663793563842773, + "loss": 0.7878, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.321568965911865, + "rewards/margins": 6.442738056182861, + "rewards/rejected": 0.8788310885429382, + "step": 9849 + }, + { + "epoch": 1.6, + "learning_rate": 1.0188765428087815e-06, + "logits/chosen": -1.4234435558319092, + "logits/rejected": -1.444226622581482, + "logps/chosen": -157.46713256835938, + "logps/rejected": -45.837379455566406, + "loss": 1.6681, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.276052951812744, + "rewards/margins": 2.944439649581909, + "rewards/rejected": 3.331613302230835, + "step": 9850 + }, + { + "epoch": 1.6, + "learning_rate": 1.0180815553566087e-06, + "logits/chosen": -1.0734329223632812, + "logits/rejected": -0.9427247047424316, + "logps/chosen": -43.470680236816406, + "logps/rejected": -7.644716262817383, + "loss": 0.1025, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3941025733947754, + "rewards/margins": 1.4991339445114136, + "rewards/rejected": 0.8949686288833618, + "step": 9851 + }, + { + "epoch": 1.6, + "learning_rate": 1.0172868430173244e-06, + "logits/chosen": -1.8291523456573486, + "logits/rejected": -1.8113064765930176, + "logps/chosen": -54.81370544433594, + "logps/rejected": -33.80377960205078, + "loss": 0.379, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.269915819168091, + "rewards/margins": 2.7026736736297607, + "rewards/rejected": 0.5672420859336853, + "step": 9852 + }, + { + "epoch": 1.6, + "learning_rate": 1.0164924058458331e-06, + "logits/chosen": -1.2567338943481445, + "logits/rejected": -1.2567338943481445, + "logps/chosen": -29.85912322998047, + "logps/rejected": -29.85912322998047, + "loss": 0.6291, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.39664626121521, + "rewards/margins": 0.0, + "rewards/rejected": 2.39664626121521, + "step": 9853 + }, + { + "epoch": 1.6, + "learning_rate": 1.0156982438970254e-06, + "logits/chosen": -1.237927794456482, + "logits/rejected": -1.1896183490753174, + "logps/chosen": -66.54423522949219, + "logps/rejected": -52.861053466796875, + "loss": 1.5277, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2068299055099487, + "rewards/margins": -1.0158509016036987, + "rewards/rejected": 2.2226808071136475, + "step": 9854 + }, + { + "epoch": 1.6, + "learning_rate": 1.0149043572257678e-06, + "logits/chosen": -1.5101357698440552, + "logits/rejected": -1.4400851726531982, + "logps/chosen": -87.15003967285156, + "logps/rejected": -50.12251663208008, + "loss": 0.071, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.8495073318481445, + "rewards/margins": 2.4522268772125244, + "rewards/rejected": 2.39728045463562, + "step": 9855 + }, + { + "epoch": 1.6, + "learning_rate": 1.0141107458869132e-06, + "logits/chosen": -1.4227895736694336, + "logits/rejected": -1.449442982673645, + "logps/chosen": -58.12858581542969, + "logps/rejected": -95.17044830322266, + "loss": 0.6208, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7568390369415283, + "rewards/margins": -0.896425724029541, + "rewards/rejected": 3.6532647609710693, + "step": 9856 + }, + { + "epoch": 1.6, + "learning_rate": 1.01331740993529e-06, + "logits/chosen": -0.9890562891960144, + "logits/rejected": -0.9890562891960144, + "logps/chosen": -21.11430549621582, + "logps/rejected": -21.11430549621582, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.799717903137207, + "rewards/margins": 0.0, + "rewards/rejected": 2.799717903137207, + "step": 9857 + }, + { + "epoch": 1.6, + "learning_rate": 1.012524349425713e-06, + "logits/chosen": -1.056411862373352, + "logits/rejected": -1.0531208515167236, + "logps/chosen": -3.176539182662964, + "logps/rejected": -3.9238734245300293, + "loss": 0.2531, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7111517786979675, + "rewards/margins": 0.4628932774066925, + "rewards/rejected": 0.24825850129127502, + "step": 9858 + }, + { + "epoch": 1.6, + "learning_rate": 1.0117315644129721e-06, + "logits/chosen": -0.9917654395103455, + "logits/rejected": -1.0006505250930786, + "logps/chosen": -66.86424255371094, + "logps/rejected": -116.94548797607422, + "loss": 0.3617, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.86039662361145, + "rewards/margins": 0.4709610939025879, + "rewards/rejected": 2.3894355297088623, + "step": 9859 + }, + { + "epoch": 1.6, + "learning_rate": 1.0109390549518439e-06, + "logits/chosen": -1.2805901765823364, + "logits/rejected": -1.1113289594650269, + "logps/chosen": -117.6258773803711, + "logps/rejected": -36.73394775390625, + "loss": 0.0913, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1284706592559814, + "rewards/margins": 1.9113411903381348, + "rewards/rejected": 0.21712951362133026, + "step": 9860 + }, + { + "epoch": 1.6, + "learning_rate": 1.010146821097081e-06, + "logits/chosen": -1.6270138025283813, + "logits/rejected": -1.6206538677215576, + "logps/chosen": -115.87773895263672, + "logps/rejected": -46.06688690185547, + "loss": 0.3827, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.147678375244141, + "rewards/margins": 2.5912024974823, + "rewards/rejected": 3.556475877761841, + "step": 9861 + }, + { + "epoch": 1.6, + "learning_rate": 1.0093548629034216e-06, + "logits/chosen": -0.9144575595855713, + "logits/rejected": -1.0716378688812256, + "logps/chosen": -64.54277038574219, + "logps/rejected": -161.92660522460938, + "loss": 0.8652, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.540754795074463, + "rewards/margins": -1.4786958694458008, + "rewards/rejected": 4.019450664520264, + "step": 9862 + }, + { + "epoch": 1.6, + "learning_rate": 1.0085631804255803e-06, + "logits/chosen": -1.3037331104278564, + "logits/rejected": -1.0961192846298218, + "logps/chosen": -67.10067749023438, + "logps/rejected": -38.09746551513672, + "loss": 0.1253, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.086604356765747, + "rewards/margins": 1.2648781538009644, + "rewards/rejected": -0.1782737821340561, + "step": 9863 + }, + { + "epoch": 1.6, + "learning_rate": 1.0077717737182557e-06, + "logits/chosen": -1.3448094129562378, + "logits/rejected": -1.240601897239685, + "logps/chosen": -33.778656005859375, + "logps/rejected": -28.91053581237793, + "loss": 0.4782, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.996498942375183, + "rewards/margins": 1.1079108715057373, + "rewards/rejected": 0.8885881304740906, + "step": 9864 + }, + { + "epoch": 1.6, + "learning_rate": 1.0069806428361277e-06, + "logits/chosen": -1.258252501487732, + "logits/rejected": -1.3977371454238892, + "logps/chosen": -68.91593933105469, + "logps/rejected": -124.64828491210938, + "loss": 3.7637, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2733094692230225, + "rewards/margins": -3.3150970935821533, + "rewards/rejected": 5.588406562805176, + "step": 9865 + }, + { + "epoch": 1.6, + "learning_rate": 1.0061897878338545e-06, + "logits/chosen": -1.3682219982147217, + "logits/rejected": -1.3101292848587036, + "logps/chosen": -166.9036865234375, + "logps/rejected": -59.38825607299805, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.05826473236084, + "rewards/margins": 5.063887596130371, + "rewards/rejected": 2.9943768978118896, + "step": 9866 + }, + { + "epoch": 1.6, + "learning_rate": 1.005399208766078e-06, + "logits/chosen": -1.2066749334335327, + "logits/rejected": -1.2932144403457642, + "logps/chosen": -82.4649429321289, + "logps/rejected": -108.16239166259766, + "loss": 0.9968, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.118091583251953, + "rewards/margins": -1.7469208240509033, + "rewards/rejected": 3.8650124073028564, + "step": 9867 + }, + { + "epoch": 1.6, + "learning_rate": 1.0046089056874175e-06, + "logits/chosen": -1.4688478708267212, + "logits/rejected": -1.4426268339157104, + "logps/chosen": -78.44556427001953, + "logps/rejected": -77.7606430053711, + "loss": 0.9547, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8484718799591064, + "rewards/margins": 1.2809737920761108, + "rewards/rejected": 1.5674980878829956, + "step": 9868 + }, + { + "epoch": 1.6, + "learning_rate": 1.0038188786524783e-06, + "logits/chosen": -0.9852390885353088, + "logits/rejected": -0.9402517080307007, + "logps/chosen": -124.06381225585938, + "logps/rejected": -56.24364471435547, + "loss": 0.0941, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.085928440093994, + "rewards/margins": 1.6201698780059814, + "rewards/rejected": 1.4657585620880127, + "step": 9869 + }, + { + "epoch": 1.6, + "learning_rate": 1.003029127715841e-06, + "logits/chosen": -1.3233522176742554, + "logits/rejected": -1.3318058252334595, + "logps/chosen": -44.84632873535156, + "logps/rejected": -79.0438003540039, + "loss": 0.3422, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1531388759613037, + "rewards/margins": 1.1386215686798096, + "rewards/rejected": 2.014517307281494, + "step": 9870 + }, + { + "epoch": 1.6, + "learning_rate": 1.0022396529320727e-06, + "logits/chosen": -1.3611090183258057, + "logits/rejected": -1.2719115018844604, + "logps/chosen": -100.58326721191406, + "logps/rejected": -42.986656188964844, + "loss": 1.1868, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.513606548309326, + "rewards/margins": 3.0122416019439697, + "rewards/rejected": 1.5013649463653564, + "step": 9871 + }, + { + "epoch": 1.6, + "learning_rate": 1.0014504543557158e-06, + "logits/chosen": -1.604344367980957, + "logits/rejected": -1.5853078365325928, + "logps/chosen": -60.06556701660156, + "logps/rejected": -85.99058532714844, + "loss": 0.3907, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8083488941192627, + "rewards/margins": 0.3836212158203125, + "rewards/rejected": 1.4247276782989502, + "step": 9872 + }, + { + "epoch": 1.6, + "learning_rate": 1.0006615320412994e-06, + "logits/chosen": -1.0932031869888306, + "logits/rejected": -1.1312321424484253, + "logps/chosen": -35.118614196777344, + "logps/rejected": -36.19613265991211, + "loss": 0.5632, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8804359436035156, + "rewards/margins": -0.4446704387664795, + "rewards/rejected": 3.325106382369995, + "step": 9873 + }, + { + "epoch": 1.6, + "learning_rate": 9.998728860433277e-07, + "logits/chosen": -1.0646058320999146, + "logits/rejected": -1.131403923034668, + "logps/chosen": -51.84541320800781, + "logps/rejected": -63.569976806640625, + "loss": 2.0552, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8189666271209717, + "rewards/margins": -3.6643083095550537, + "rewards/rejected": 6.483274936676025, + "step": 9874 + }, + { + "epoch": 1.6, + "learning_rate": 9.99084516416291e-07, + "logits/chosen": -1.397290825843811, + "logits/rejected": -1.3223059177398682, + "logps/chosen": -70.55375671386719, + "logps/rejected": -18.67014503479004, + "loss": 0.1783, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1403839588165283, + "rewards/margins": 2.688896417617798, + "rewards/rejected": 0.45148754119873047, + "step": 9875 + }, + { + "epoch": 1.6, + "learning_rate": 9.982964232146564e-07, + "logits/chosen": -1.4885034561157227, + "logits/rejected": -1.402100682258606, + "logps/chosen": -48.37318420410156, + "logps/rejected": -80.36314392089844, + "loss": 2.019, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.6188507080078125, + "rewards/margins": 0.3396611213684082, + "rewards/rejected": 4.279189586639404, + "step": 9876 + }, + { + "epoch": 1.6, + "learning_rate": 9.975086064928752e-07, + "logits/chosen": -1.4593493938446045, + "logits/rejected": -1.4222592115402222, + "logps/chosen": -42.78209686279297, + "logps/rejected": -58.70423889160156, + "loss": 0.2774, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.374082326889038, + "rewards/margins": 0.36505818367004395, + "rewards/rejected": 3.009024143218994, + "step": 9877 + }, + { + "epoch": 1.6, + "learning_rate": 9.967210663053767e-07, + "logits/chosen": -1.2990689277648926, + "logits/rejected": -1.238502860069275, + "logps/chosen": -70.48191833496094, + "logps/rejected": -121.20248413085938, + "loss": 0.8393, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.453605651855469, + "rewards/margins": -0.5010299682617188, + "rewards/rejected": 6.9546356201171875, + "step": 9878 + }, + { + "epoch": 1.6, + "learning_rate": 9.95933802706574e-07, + "logits/chosen": -1.4056286811828613, + "logits/rejected": -1.466618299484253, + "logps/chosen": -89.92953491210938, + "logps/rejected": -88.476806640625, + "loss": 1.285, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.023664951324463, + "rewards/margins": -2.487043857574463, + "rewards/rejected": 5.510708808898926, + "step": 9879 + }, + { + "epoch": 1.6, + "learning_rate": 9.951468157508575e-07, + "logits/chosen": -1.30268394947052, + "logits/rejected": -1.2590116262435913, + "logps/chosen": -109.4496078491211, + "logps/rejected": -88.61103057861328, + "loss": 0.5088, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.830945611000061, + "rewards/margins": -0.50812828540802, + "rewards/rejected": 2.339073896408081, + "step": 9880 + }, + { + "epoch": 1.6, + "learning_rate": 9.943601054926028e-07, + "logits/chosen": -1.3894145488739014, + "logits/rejected": -1.391856074333191, + "logps/chosen": -80.67973327636719, + "logps/rejected": -76.90265655517578, + "loss": 0.4828, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7568435668945312, + "rewards/margins": -0.17525267601013184, + "rewards/rejected": 2.932096242904663, + "step": 9881 + }, + { + "epoch": 1.6, + "learning_rate": 9.935736719861621e-07, + "logits/chosen": -0.9820446372032166, + "logits/rejected": -0.9820446372032166, + "logps/chosen": -59.144073486328125, + "logps/rejected": -59.144073486328125, + "loss": 1.1612, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4167251586914062, + "rewards/margins": 0.0, + "rewards/rejected": 1.4167251586914062, + "step": 9882 + }, + { + "epoch": 1.6, + "learning_rate": 9.92787515285873e-07, + "logits/chosen": -0.748813807964325, + "logits/rejected": -0.7458904981613159, + "logps/chosen": -2.5480170249938965, + "logps/rejected": -4.1238112449646, + "loss": 0.5091, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.22528843581676483, + "rewards/margins": -0.563899040222168, + "rewards/rejected": 0.789187490940094, + "step": 9883 + }, + { + "epoch": 1.6, + "learning_rate": 9.920016354460483e-07, + "logits/chosen": -1.45530366897583, + "logits/rejected": -1.3143774271011353, + "logps/chosen": -136.6148681640625, + "logps/rejected": -32.01712417602539, + "loss": 0.0569, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.994659423828125, + "rewards/margins": 5.017584800720215, + "rewards/rejected": 3.9770748615264893, + "step": 9884 + }, + { + "epoch": 1.6, + "learning_rate": 9.91216032520988e-07, + "logits/chosen": -1.5647715330123901, + "logits/rejected": -1.3742772340774536, + "logps/chosen": -87.20287322998047, + "logps/rejected": -20.363285064697266, + "loss": 0.0287, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.7945475578308105, + "rewards/margins": 6.277886867523193, + "rewards/rejected": 0.5166605114936829, + "step": 9885 + }, + { + "epoch": 1.6, + "learning_rate": 9.90430706564967e-07, + "logits/chosen": -1.308517336845398, + "logits/rejected": -1.3523390293121338, + "logps/chosen": -198.92710876464844, + "logps/rejected": -178.86386108398438, + "loss": 0.8884, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.991002082824707, + "rewards/margins": 5.219362258911133, + "rewards/rejected": 1.7716400623321533, + "step": 9886 + }, + { + "epoch": 1.6, + "learning_rate": 9.896456576322471e-07, + "logits/chosen": -1.4321564435958862, + "logits/rejected": -1.3888682126998901, + "logps/chosen": -98.34647369384766, + "logps/rejected": -80.27851867675781, + "loss": 0.1778, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.741608619689941, + "rewards/margins": 1.4073586463928223, + "rewards/rejected": 6.334249973297119, + "step": 9887 + }, + { + "epoch": 1.6, + "learning_rate": 9.888608857770643e-07, + "logits/chosen": -1.338526725769043, + "logits/rejected": -1.3835543394088745, + "logps/chosen": -73.9887466430664, + "logps/rejected": -82.3779067993164, + "loss": 0.2819, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7298760414123535, + "rewards/margins": 1.8432656526565552, + "rewards/rejected": 1.8866103887557983, + "step": 9888 + }, + { + "epoch": 1.61, + "learning_rate": 9.880763910536417e-07, + "logits/chosen": -1.2219120264053345, + "logits/rejected": -1.2576580047607422, + "logps/chosen": -96.99750518798828, + "logps/rejected": -93.58709716796875, + "loss": 0.3606, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7306030988693237, + "rewards/margins": -0.019464850425720215, + "rewards/rejected": 1.750067949295044, + "step": 9889 + }, + { + "epoch": 1.61, + "learning_rate": 9.872921735161778e-07, + "logits/chosen": -1.4873050451278687, + "logits/rejected": -1.517561674118042, + "logps/chosen": -68.25853729248047, + "logps/rejected": -76.8856430053711, + "loss": 1.4056, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8738632202148438, + "rewards/margins": 0.6234304904937744, + "rewards/rejected": 2.2504327297210693, + "step": 9890 + }, + { + "epoch": 1.61, + "learning_rate": 9.86508233218858e-07, + "logits/chosen": -1.38046395778656, + "logits/rejected": -1.38046395778656, + "logps/chosen": -14.61817741394043, + "logps/rejected": -14.61817741394043, + "loss": 0.4389, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.363629102706909, + "rewards/margins": 0.0, + "rewards/rejected": 2.363629102706909, + "step": 9891 + }, + { + "epoch": 1.61, + "learning_rate": 9.857245702158413e-07, + "logits/chosen": -1.1661560535430908, + "logits/rejected": -1.0768775939941406, + "logps/chosen": -211.77879333496094, + "logps/rejected": -35.36284255981445, + "loss": 0.0849, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.198564052581787, + "rewards/margins": 5.7743611335754395, + "rewards/rejected": 0.42420274019241333, + "step": 9892 + }, + { + "epoch": 1.61, + "learning_rate": 9.84941184561275e-07, + "logits/chosen": -1.3295397758483887, + "logits/rejected": -1.3894766569137573, + "logps/chosen": -154.43421936035156, + "logps/rejected": -222.4219970703125, + "loss": 0.1853, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.719632148742676, + "rewards/margins": 0.8043608665466309, + "rewards/rejected": 5.915271282196045, + "step": 9893 + }, + { + "epoch": 1.61, + "learning_rate": 9.841580763092812e-07, + "logits/chosen": -1.4346535205841064, + "logits/rejected": -1.4338674545288086, + "logps/chosen": -51.34072494506836, + "logps/rejected": -37.732513427734375, + "loss": 0.8187, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9119465351104736, + "rewards/margins": -1.0328631401062012, + "rewards/rejected": 2.944809675216675, + "step": 9894 + }, + { + "epoch": 1.61, + "learning_rate": 9.833752455139667e-07, + "logits/chosen": -1.0719032287597656, + "logits/rejected": -1.12617826461792, + "logps/chosen": -44.430419921875, + "logps/rejected": -100.68228149414062, + "loss": 0.0822, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.316333770751953, + "rewards/margins": 2.0718865394592285, + "rewards/rejected": 0.24444733560085297, + "step": 9895 + }, + { + "epoch": 1.61, + "learning_rate": 9.82592692229416e-07, + "logits/chosen": -1.636752963066101, + "logits/rejected": -1.696523666381836, + "logps/chosen": -164.0656280517578, + "logps/rejected": -166.91348266601562, + "loss": 0.7669, + "rewards/accuracies": 0.0, + "rewards/chosen": 8.5762939453125, + "rewards/margins": -0.4158477783203125, + "rewards/rejected": 8.992141723632812, + "step": 9896 + }, + { + "epoch": 1.61, + "learning_rate": 9.818104165096986e-07, + "logits/chosen": -1.1705087423324585, + "logits/rejected": -1.1321088075637817, + "logps/chosen": -47.302574157714844, + "logps/rejected": -36.43212127685547, + "loss": 0.5096, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8168457746505737, + "rewards/margins": -0.5532439947128296, + "rewards/rejected": 2.3700897693634033, + "step": 9897 + }, + { + "epoch": 1.61, + "learning_rate": 9.810284184088592e-07, + "logits/chosen": -1.1624042987823486, + "logits/rejected": -1.1624042987823486, + "logps/chosen": -51.579898834228516, + "logps/rejected": -51.579898834228516, + "loss": 0.3701, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5276352167129517, + "rewards/margins": 0.0, + "rewards/rejected": 0.5276352167129517, + "step": 9898 + }, + { + "epoch": 1.61, + "learning_rate": 9.802466979809288e-07, + "logits/chosen": -1.5540835857391357, + "logits/rejected": -1.577234148979187, + "logps/chosen": -84.3882064819336, + "logps/rejected": -111.6738510131836, + "loss": 1.2897, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5103065967559814, + "rewards/margins": -1.0238966941833496, + "rewards/rejected": 3.534203290939331, + "step": 9899 + }, + { + "epoch": 1.61, + "learning_rate": 9.794652552799172e-07, + "logits/chosen": -1.3172855377197266, + "logits/rejected": -1.476780891418457, + "logps/chosen": -47.04920959472656, + "logps/rejected": -104.01597595214844, + "loss": 1.0763, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.9629006385803223, + "rewards/margins": -1.8605127334594727, + "rewards/rejected": 5.823413372039795, + "step": 9900 + }, + { + "epoch": 1.61, + "learning_rate": 9.786840903598128e-07, + "logits/chosen": -1.0739672183990479, + "logits/rejected": -1.0755678415298462, + "logps/chosen": -3.3654463291168213, + "logps/rejected": -1.549445629119873, + "loss": 0.4922, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0994313508272171, + "rewards/margins": -0.24287094175815582, + "rewards/rejected": 0.3423022925853729, + "step": 9901 + }, + { + "epoch": 1.61, + "learning_rate": 9.779032032745889e-07, + "logits/chosen": -1.285990595817566, + "logits/rejected": -1.2947012186050415, + "logps/chosen": -60.4791145324707, + "logps/rejected": -70.10203552246094, + "loss": 0.2901, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.625883102416992, + "rewards/margins": 0.24054837226867676, + "rewards/rejected": 2.3853347301483154, + "step": 9902 + }, + { + "epoch": 1.61, + "learning_rate": 9.771225940781948e-07, + "logits/chosen": -1.530808687210083, + "logits/rejected": -1.5613130331039429, + "logps/chosen": -68.0096435546875, + "logps/rejected": -102.79081726074219, + "loss": 1.6087, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5315277576446533, + "rewards/margins": -0.48647308349609375, + "rewards/rejected": 2.018000841140747, + "step": 9903 + }, + { + "epoch": 1.61, + "learning_rate": 9.76342262824566e-07, + "logits/chosen": -0.9757201671600342, + "logits/rejected": -0.9131961464881897, + "logps/chosen": -33.43661117553711, + "logps/rejected": -37.08660888671875, + "loss": 0.4002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2252933979034424, + "rewards/margins": 0.1153721809387207, + "rewards/rejected": 2.1099212169647217, + "step": 9904 + }, + { + "epoch": 1.61, + "learning_rate": 9.755622095676138e-07, + "logits/chosen": -1.2392228841781616, + "logits/rejected": -1.1969945430755615, + "logps/chosen": -60.418758392333984, + "logps/rejected": -78.97239685058594, + "loss": 0.3624, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9413105249404907, + "rewards/margins": -0.04442858695983887, + "rewards/rejected": 1.9857391119003296, + "step": 9905 + }, + { + "epoch": 1.61, + "learning_rate": 9.74782434361234e-07, + "logits/chosen": -1.0892438888549805, + "logits/rejected": -1.0940966606140137, + "logps/chosen": -4.616824626922607, + "logps/rejected": -2.2605931758880615, + "loss": 0.4879, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.45657554268836975, + "rewards/margins": -0.12160107493400574, + "rewards/rejected": 0.5781766176223755, + "step": 9906 + }, + { + "epoch": 1.61, + "learning_rate": 9.740029372593002e-07, + "logits/chosen": -1.4882984161376953, + "logits/rejected": -1.5026556253433228, + "logps/chosen": -62.442352294921875, + "logps/rejected": -112.08203125, + "loss": 0.3372, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8305656909942627, + "rewards/margins": 0.038188934326171875, + "rewards/rejected": 1.7923767566680908, + "step": 9907 + }, + { + "epoch": 1.61, + "learning_rate": 9.7322371831567e-07, + "logits/chosen": -1.0562331676483154, + "logits/rejected": -1.1101025342941284, + "logps/chosen": -58.725006103515625, + "logps/rejected": -68.40367126464844, + "loss": 0.5213, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.623132348060608, + "rewards/margins": -0.33850252628326416, + "rewards/rejected": 1.961634874343872, + "step": 9908 + }, + { + "epoch": 1.61, + "learning_rate": 9.724447775841784e-07, + "logits/chosen": -1.1631778478622437, + "logits/rejected": -1.172629952430725, + "logps/chosen": -23.263675689697266, + "logps/rejected": -23.04183006286621, + "loss": 0.8149, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6650797128677368, + "rewards/margins": -0.13703042268753052, + "rewards/rejected": 0.8021101355552673, + "step": 9909 + }, + { + "epoch": 1.61, + "learning_rate": 9.71666115118644e-07, + "logits/chosen": -0.9678080677986145, + "logits/rejected": -0.9696592688560486, + "logps/chosen": -44.313232421875, + "logps/rejected": -13.985755920410156, + "loss": 0.4177, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.625689685344696, + "rewards/margins": -0.2131357192993164, + "rewards/rejected": 0.8388254046440125, + "step": 9910 + }, + { + "epoch": 1.61, + "learning_rate": 9.708877309728638e-07, + "logits/chosen": -1.0282909870147705, + "logits/rejected": -1.0282909870147705, + "logps/chosen": -10.015327453613281, + "logps/rejected": -10.015327453613281, + "loss": 0.4058, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6679712533950806, + "rewards/margins": 0.0, + "rewards/rejected": 0.6679712533950806, + "step": 9911 + }, + { + "epoch": 1.61, + "learning_rate": 9.701096252006192e-07, + "logits/chosen": -1.325113296508789, + "logits/rejected": -1.0987557172775269, + "logps/chosen": -81.59536743164062, + "logps/rejected": -27.48756980895996, + "loss": 0.028, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.709761142730713, + "rewards/margins": 2.878154754638672, + "rewards/rejected": 0.8316065073013306, + "step": 9912 + }, + { + "epoch": 1.61, + "learning_rate": 9.693317978556666e-07, + "logits/chosen": -1.1734265089035034, + "logits/rejected": -1.089401125907898, + "logps/chosen": -70.0030288696289, + "logps/rejected": -28.320953369140625, + "loss": 0.4574, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5813621282577515, + "rewards/margins": 0.13913488388061523, + "rewards/rejected": 1.4422272443771362, + "step": 9913 + }, + { + "epoch": 1.61, + "learning_rate": 9.685542489917494e-07, + "logits/chosen": -1.0583784580230713, + "logits/rejected": -1.0559847354888916, + "logps/chosen": -2.342977523803711, + "logps/rejected": -1.9866390228271484, + "loss": 1.1432, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37811312079429626, + "rewards/margins": 0.13541679084300995, + "rewards/rejected": 0.24269632995128632, + "step": 9914 + }, + { + "epoch": 1.61, + "learning_rate": 9.677769786625869e-07, + "logits/chosen": -1.1327801942825317, + "logits/rejected": -0.9911814332008362, + "logps/chosen": -46.693634033203125, + "logps/rejected": -39.75431823730469, + "loss": 0.2859, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3080177307128906, + "rewards/margins": 0.3530135154724121, + "rewards/rejected": 2.9550042152404785, + "step": 9915 + }, + { + "epoch": 1.61, + "learning_rate": 9.669999869218827e-07, + "logits/chosen": -1.2878844738006592, + "logits/rejected": -1.2536388635635376, + "logps/chosen": -55.87051010131836, + "logps/rejected": -71.65316009521484, + "loss": 2.252, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.40386700630188, + "rewards/margins": -0.6311571598052979, + "rewards/rejected": 4.035024166107178, + "step": 9916 + }, + { + "epoch": 1.61, + "learning_rate": 9.66223273823318e-07, + "logits/chosen": -1.5966389179229736, + "logits/rejected": -1.5684810876846313, + "logps/chosen": -127.86839294433594, + "logps/rejected": -104.36780548095703, + "loss": 0.7739, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.517468214035034, + "rewards/margins": -0.09531784057617188, + "rewards/rejected": 2.612786054611206, + "step": 9917 + }, + { + "epoch": 1.61, + "learning_rate": 9.654468394205579e-07, + "logits/chosen": -1.2634278535842896, + "logits/rejected": -1.2410995960235596, + "logps/chosen": -2.0412063598632812, + "logps/rejected": -31.364696502685547, + "loss": 0.7043, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.520322859287262, + "rewards/margins": -0.3547557592391968, + "rewards/rejected": 0.8750786185264587, + "step": 9918 + }, + { + "epoch": 1.61, + "learning_rate": 9.646706837672449e-07, + "logits/chosen": -1.1308444738388062, + "logits/rejected": -1.0927109718322754, + "logps/chosen": -39.02757263183594, + "logps/rejected": -57.25086975097656, + "loss": 1.1298, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7776116132736206, + "rewards/margins": -2.0539894104003906, + "rewards/rejected": 3.8316009044647217, + "step": 9919 + }, + { + "epoch": 1.61, + "learning_rate": 9.638948069170061e-07, + "logits/chosen": -1.0777088403701782, + "logits/rejected": -1.0712164640426636, + "logps/chosen": -90.36976623535156, + "logps/rejected": -87.3779296875, + "loss": 0.3056, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.098155498504639, + "rewards/margins": 0.24130582809448242, + "rewards/rejected": 3.8568496704101562, + "step": 9920 + }, + { + "epoch": 1.61, + "learning_rate": 9.631192089234465e-07, + "logits/chosen": -1.563675880432129, + "logits/rejected": -1.3122644424438477, + "logps/chosen": -117.57731628417969, + "logps/rejected": -45.56742858886719, + "loss": 0.0282, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.5565643310546875, + "rewards/margins": 4.641775608062744, + "rewards/rejected": 1.914788842201233, + "step": 9921 + }, + { + "epoch": 1.61, + "learning_rate": 9.62343889840151e-07, + "logits/chosen": -1.0726630687713623, + "logits/rejected": -1.1264804601669312, + "logps/chosen": -70.18630981445312, + "logps/rejected": -59.57170486450195, + "loss": 0.7899, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.678236484527588, + "rewards/margins": -0.9083309173583984, + "rewards/rejected": 5.586567401885986, + "step": 9922 + }, + { + "epoch": 1.61, + "learning_rate": 9.615688497206893e-07, + "logits/chosen": -1.2301486730575562, + "logits/rejected": -1.324206829071045, + "logps/chosen": -48.238853454589844, + "logps/rejected": -83.82634735107422, + "loss": 0.6769, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5826728343963623, + "rewards/margins": -0.5696570873260498, + "rewards/rejected": 4.152329921722412, + "step": 9923 + }, + { + "epoch": 1.61, + "learning_rate": 9.607940886186063e-07, + "logits/chosen": -1.3100379705429077, + "logits/rejected": -1.3723530769348145, + "logps/chosen": -152.1751708984375, + "logps/rejected": -83.1910400390625, + "loss": 0.2481, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.533163547515869, + "rewards/margins": 0.7524199485778809, + "rewards/rejected": 4.780743598937988, + "step": 9924 + }, + { + "epoch": 1.61, + "learning_rate": 9.60019606587434e-07, + "logits/chosen": -1.50163996219635, + "logits/rejected": -1.6066334247589111, + "logps/chosen": -41.56543731689453, + "logps/rejected": -130.8655242919922, + "loss": 1.6337, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.924480438232422, + "rewards/margins": -3.2238364219665527, + "rewards/rejected": 6.148316860198975, + "step": 9925 + }, + { + "epoch": 1.61, + "learning_rate": 9.592454036806792e-07, + "logits/chosen": -1.405847430229187, + "logits/rejected": -1.290081262588501, + "logps/chosen": -48.941444396972656, + "logps/rejected": -28.54073715209961, + "loss": 0.1093, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.249281406402588, + "rewards/margins": 3.0876166820526123, + "rewards/rejected": 0.16166476905345917, + "step": 9926 + }, + { + "epoch": 1.61, + "learning_rate": 9.58471479951834e-07, + "logits/chosen": -1.7311118841171265, + "logits/rejected": -1.6528315544128418, + "logps/chosen": -112.520751953125, + "logps/rejected": -109.86758422851562, + "loss": 0.4578, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0907883644104004, + "rewards/margins": 1.0471749305725098, + "rewards/rejected": 1.0436134338378906, + "step": 9927 + }, + { + "epoch": 1.61, + "learning_rate": 9.57697835454367e-07, + "logits/chosen": -1.5184292793273926, + "logits/rejected": -1.4515997171401978, + "logps/chosen": -75.30520629882812, + "logps/rejected": -36.80487060546875, + "loss": 0.1898, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.303198337554932, + "rewards/margins": 3.7163655757904053, + "rewards/rejected": 0.5868328213691711, + "step": 9928 + }, + { + "epoch": 1.61, + "learning_rate": 9.569244702417323e-07, + "logits/chosen": -1.0350154638290405, + "logits/rejected": -1.0233287811279297, + "logps/chosen": -88.49736022949219, + "logps/rejected": -67.760986328125, + "loss": 0.7402, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.39263916015625, + "rewards/margins": -1.069467306137085, + "rewards/rejected": 2.462106466293335, + "step": 9929 + }, + { + "epoch": 1.61, + "learning_rate": 9.561513843673598e-07, + "logits/chosen": -1.3849495649337769, + "logits/rejected": -1.4155782461166382, + "logps/chosen": -38.69221496582031, + "logps/rejected": -69.99423217773438, + "loss": 0.2149, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6818851232528687, + "rewards/margins": 0.8977946639060974, + "rewards/rejected": 0.7840904593467712, + "step": 9930 + }, + { + "epoch": 1.61, + "learning_rate": 9.553785778846646e-07, + "logits/chosen": -1.0741698741912842, + "logits/rejected": -0.9770917892456055, + "logps/chosen": -130.03709411621094, + "logps/rejected": -67.64900207519531, + "loss": 1.9636, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1314163208007812, + "rewards/margins": 1.771569848060608, + "rewards/rejected": 1.3598464727401733, + "step": 9931 + }, + { + "epoch": 1.61, + "learning_rate": 9.546060508470372e-07, + "logits/chosen": -1.261099100112915, + "logits/rejected": -1.294193148612976, + "logps/chosen": -44.426612854003906, + "logps/rejected": -49.346553802490234, + "loss": 0.4245, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0728843212127686, + "rewards/margins": -0.22527742385864258, + "rewards/rejected": 3.298161745071411, + "step": 9932 + }, + { + "epoch": 1.61, + "learning_rate": 9.538338033078558e-07, + "logits/chosen": -1.0559790134429932, + "logits/rejected": -0.677973210811615, + "logps/chosen": -32.0656623840332, + "logps/rejected": -108.19778442382812, + "loss": 3.9102, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.020734429359436, + "rewards/margins": -3.366588592529297, + "rewards/rejected": 4.387322902679443, + "step": 9933 + }, + { + "epoch": 1.61, + "learning_rate": 9.530618353204718e-07, + "logits/chosen": -1.3051915168762207, + "logits/rejected": -1.2435895204544067, + "logps/chosen": -89.15165710449219, + "logps/rejected": -55.88276672363281, + "loss": 0.5689, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4618561267852783, + "rewards/margins": -0.7283751964569092, + "rewards/rejected": 2.1902313232421875, + "step": 9934 + }, + { + "epoch": 1.61, + "learning_rate": 9.52290146938224e-07, + "logits/chosen": -1.6348114013671875, + "logits/rejected": -1.770777702331543, + "logps/chosen": -181.68753051757812, + "logps/rejected": -83.71917724609375, + "loss": 0.513, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.9447479248046875, + "rewards/margins": 0.07534170150756836, + "rewards/rejected": 6.869406223297119, + "step": 9935 + }, + { + "epoch": 1.61, + "learning_rate": 9.515187382144259e-07, + "logits/chosen": -1.0938419103622437, + "logits/rejected": -1.0914469957351685, + "logps/chosen": -0.5149670839309692, + "logps/rejected": -35.817386627197266, + "loss": 0.4349, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.10805132240056992, + "rewards/margins": -0.10486791282892227, + "rewards/rejected": 0.2129192352294922, + "step": 9936 + }, + { + "epoch": 1.61, + "learning_rate": 9.507476092023771e-07, + "logits/chosen": -1.3488870859146118, + "logits/rejected": -1.3306455612182617, + "logps/chosen": -53.45491027832031, + "logps/rejected": -55.735130310058594, + "loss": 3.0669, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.723935842514038, + "rewards/margins": 1.0895638465881348, + "rewards/rejected": 1.6343719959259033, + "step": 9937 + }, + { + "epoch": 1.61, + "learning_rate": 9.499767599553528e-07, + "logits/chosen": -1.4527405500411987, + "logits/rejected": -1.3854471445083618, + "logps/chosen": -95.55802154541016, + "logps/rejected": -82.04432678222656, + "loss": 0.2417, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7693963050842285, + "rewards/margins": 3.180194854736328, + "rewards/rejected": 0.5892013907432556, + "step": 9938 + }, + { + "epoch": 1.61, + "learning_rate": 9.492061905266137e-07, + "logits/chosen": -1.4915590286254883, + "logits/rejected": -1.4115527868270874, + "logps/chosen": -58.511802673339844, + "logps/rejected": -54.674652099609375, + "loss": 0.3034, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.465195417404175, + "rewards/margins": 0.2021322250366211, + "rewards/rejected": 2.2630631923675537, + "step": 9939 + }, + { + "epoch": 1.61, + "learning_rate": 9.484359009693972e-07, + "logits/chosen": -1.1129029989242554, + "logits/rejected": -1.1129029989242554, + "logps/chosen": -37.971435546875, + "logps/rejected": -37.971435546875, + "loss": 0.8678, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2071785926818848, + "rewards/margins": 0.0, + "rewards/rejected": 3.2071785926818848, + "step": 9940 + }, + { + "epoch": 1.61, + "learning_rate": 9.476658913369247e-07, + "logits/chosen": -1.167955994606018, + "logits/rejected": -1.149123191833496, + "logps/chosen": -79.1134033203125, + "logps/rejected": -76.8639907836914, + "loss": 0.8817, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.736814260482788, + "rewards/margins": -1.518498182296753, + "rewards/rejected": 4.255312442779541, + "step": 9941 + }, + { + "epoch": 1.61, + "learning_rate": 9.46896161682394e-07, + "logits/chosen": -1.5650197267532349, + "logits/rejected": -1.2988643646240234, + "logps/chosen": -147.11634826660156, + "logps/rejected": -14.25781536102295, + "loss": 0.0678, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.706251621246338, + "rewards/margins": 3.70646071434021, + "rewards/rejected": 0.9997908473014832, + "step": 9942 + }, + { + "epoch": 1.61, + "learning_rate": 9.461267120589895e-07, + "logits/chosen": -1.4242602586746216, + "logits/rejected": -1.4415602684020996, + "logps/chosen": -54.15024185180664, + "logps/rejected": -70.36073303222656, + "loss": 0.2947, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7550106048583984, + "rewards/margins": 1.255698800086975, + "rewards/rejected": 1.4993118047714233, + "step": 9943 + }, + { + "epoch": 1.61, + "learning_rate": 9.453575425198691e-07, + "logits/chosen": -1.3238023519515991, + "logits/rejected": -1.310511589050293, + "logps/chosen": -82.35233306884766, + "logps/rejected": -53.8831787109375, + "loss": 2.7713, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6290955543518066, + "rewards/margins": 0.03065347671508789, + "rewards/rejected": 2.5984420776367188, + "step": 9944 + }, + { + "epoch": 1.61, + "learning_rate": 9.445886531181791e-07, + "logits/chosen": -1.2569690942764282, + "logits/rejected": -1.2865195274353027, + "logps/chosen": -102.66336059570312, + "logps/rejected": -106.4986572265625, + "loss": 1.2119, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8637382984161377, + "rewards/margins": -2.316526174545288, + "rewards/rejected": 4.180264472961426, + "step": 9945 + }, + { + "epoch": 1.61, + "learning_rate": 9.438200439070388e-07, + "logits/chosen": -1.4126571416854858, + "logits/rejected": -1.3644026517868042, + "logps/chosen": -54.04840850830078, + "logps/rejected": -75.83389282226562, + "loss": 1.2945, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.9304497241973877, + "rewards/margins": -1.2341759204864502, + "rewards/rejected": 5.164625644683838, + "step": 9946 + }, + { + "epoch": 1.61, + "learning_rate": 9.430517149395552e-07, + "logits/chosen": -1.4128285646438599, + "logits/rejected": -1.1741868257522583, + "logps/chosen": -130.56796264648438, + "logps/rejected": -26.880794525146484, + "loss": 0.408, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.28395414352417, + "rewards/margins": 3.079505681991577, + "rewards/rejected": 3.2044484615325928, + "step": 9947 + }, + { + "epoch": 1.61, + "learning_rate": 9.422836662688095e-07, + "logits/chosen": -1.085029125213623, + "logits/rejected": -1.0726877450942993, + "logps/chosen": -75.29978942871094, + "logps/rejected": -88.10519409179688, + "loss": 1.2439, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0562407970428467, + "rewards/margins": 2.7690956592559814, + "rewards/rejected": 0.2871452271938324, + "step": 9948 + }, + { + "epoch": 1.61, + "learning_rate": 9.415158979478689e-07, + "logits/chosen": -1.2574965953826904, + "logits/rejected": -1.2417258024215698, + "logps/chosen": -51.41587829589844, + "logps/rejected": -28.329185485839844, + "loss": 1.4959, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2563987970352173, + "rewards/margins": -0.742584228515625, + "rewards/rejected": 1.9989830255508423, + "step": 9949 + }, + { + "epoch": 1.61, + "learning_rate": 9.40748410029777e-07, + "logits/chosen": -1.0105057954788208, + "logits/rejected": -1.0379886627197266, + "logps/chosen": -69.49076843261719, + "logps/rejected": -44.00091552734375, + "loss": 0.2481, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.104494571685791, + "rewards/margins": 0.44634556770324707, + "rewards/rejected": 1.658149003982544, + "step": 9950 + }, + { + "epoch": 1.62, + "learning_rate": 9.399812025675626e-07, + "logits/chosen": -1.6369121074676514, + "logits/rejected": -1.5862435102462769, + "logps/chosen": -98.55217742919922, + "logps/rejected": -173.6314697265625, + "loss": 0.8786, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.968076229095459, + "rewards/margins": -0.8671455383300781, + "rewards/rejected": 6.835221767425537, + "step": 9951 + }, + { + "epoch": 1.62, + "learning_rate": 9.392142756142292e-07, + "logits/chosen": -1.2091772556304932, + "logits/rejected": -1.2337926626205444, + "logps/chosen": -24.230554580688477, + "logps/rejected": -50.312232971191406, + "loss": 0.6313, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0253461599349976, + "rewards/margins": -0.2711362838745117, + "rewards/rejected": 1.2964824438095093, + "step": 9952 + }, + { + "epoch": 1.62, + "learning_rate": 9.384476292227673e-07, + "logits/chosen": -0.8726718425750732, + "logits/rejected": -0.8348633646965027, + "logps/chosen": -64.92362976074219, + "logps/rejected": -45.94664764404297, + "loss": 0.1698, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.38533091545105, + "rewards/margins": 0.9854903221130371, + "rewards/rejected": 2.3998405933380127, + "step": 9953 + }, + { + "epoch": 1.62, + "learning_rate": 9.376812634461418e-07, + "logits/chosen": -1.6221215724945068, + "logits/rejected": -1.5925935506820679, + "logps/chosen": -97.26870727539062, + "logps/rejected": -27.219799041748047, + "loss": 0.5792, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9043152332305908, + "rewards/margins": 1.5509140491485596, + "rewards/rejected": 0.35340118408203125, + "step": 9954 + }, + { + "epoch": 1.62, + "learning_rate": 9.369151783373032e-07, + "logits/chosen": -1.2652980089187622, + "logits/rejected": -1.2652980089187622, + "logps/chosen": -35.37736129760742, + "logps/rejected": -35.37736129760742, + "loss": 0.44, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8808685541152954, + "rewards/margins": 0.0, + "rewards/rejected": 0.8808685541152954, + "step": 9955 + }, + { + "epoch": 1.62, + "learning_rate": 9.361493739491817e-07, + "logits/chosen": -1.3618824481964111, + "logits/rejected": -1.3940922021865845, + "logps/chosen": -78.408447265625, + "logps/rejected": -60.46742248535156, + "loss": 1.303, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8232522010803223, + "rewards/margins": -2.483717918395996, + "rewards/rejected": 5.306970119476318, + "step": 9956 + }, + { + "epoch": 1.62, + "learning_rate": 9.353838503346851e-07, + "logits/chosen": -1.059618353843689, + "logits/rejected": -1.059618353843689, + "logps/chosen": -44.85237503051758, + "logps/rejected": -44.85237503051758, + "loss": 0.3511, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3907607793807983, + "rewards/margins": 0.0, + "rewards/rejected": 1.3907607793807983, + "step": 9957 + }, + { + "epoch": 1.62, + "learning_rate": 9.346186075467056e-07, + "logits/chosen": -1.2616767883300781, + "logits/rejected": -1.3368514776229858, + "logps/chosen": -69.37187194824219, + "logps/rejected": -128.22946166992188, + "loss": 0.2532, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9194557666778564, + "rewards/margins": 0.41710591316223145, + "rewards/rejected": 2.502349853515625, + "step": 9958 + }, + { + "epoch": 1.62, + "learning_rate": 9.33853645638112e-07, + "logits/chosen": -1.6197983026504517, + "logits/rejected": -1.505018711090088, + "logps/chosen": -75.41173553466797, + "logps/rejected": -56.60205078125, + "loss": 0.111, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1912286281585693, + "rewards/margins": 2.3524200916290283, + "rewards/rejected": 0.8388084769248962, + "step": 9959 + }, + { + "epoch": 1.62, + "learning_rate": 9.330889646617586e-07, + "logits/chosen": -1.4839121103286743, + "logits/rejected": -1.4839121103286743, + "logps/chosen": -86.09507751464844, + "logps/rejected": -86.09507751464844, + "loss": 0.4739, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.370375156402588, + "rewards/margins": 0.0, + "rewards/rejected": 5.370375156402588, + "step": 9960 + }, + { + "epoch": 1.62, + "learning_rate": 9.323245646704754e-07, + "logits/chosen": -1.366831660270691, + "logits/rejected": -1.3498778343200684, + "logps/chosen": -22.978004455566406, + "logps/rejected": -4.810516357421875, + "loss": 0.2849, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6504192352294922, + "rewards/margins": 0.28387174010276794, + "rewards/rejected": 0.36654749512672424, + "step": 9961 + }, + { + "epoch": 1.62, + "learning_rate": 9.315604457170768e-07, + "logits/chosen": -1.3185783624649048, + "logits/rejected": -1.2770678997039795, + "logps/chosen": -67.76343536376953, + "logps/rejected": -71.80802917480469, + "loss": 0.4711, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.081990957260132, + "rewards/margins": 2.2600364685058594, + "rewards/rejected": 0.8219543695449829, + "step": 9962 + }, + { + "epoch": 1.62, + "learning_rate": 9.307966078543545e-07, + "logits/chosen": -0.8892055153846741, + "logits/rejected": -0.9177555441856384, + "logps/chosen": -23.58034324645996, + "logps/rejected": -55.75938415527344, + "loss": 0.475, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6895360946655273, + "rewards/margins": -0.3249887228012085, + "rewards/rejected": 1.0145248174667358, + "step": 9963 + }, + { + "epoch": 1.62, + "learning_rate": 9.300330511350841e-07, + "logits/chosen": -1.1389378309249878, + "logits/rejected": -1.1263240575790405, + "logps/chosen": -21.93489646911621, + "logps/rejected": -27.542011260986328, + "loss": 0.2075, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0553327798843384, + "rewards/margins": 0.7905870676040649, + "rewards/rejected": 0.26474571228027344, + "step": 9964 + }, + { + "epoch": 1.62, + "learning_rate": 9.292697756120189e-07, + "logits/chosen": -1.2471675872802734, + "logits/rejected": -1.2411760091781616, + "logps/chosen": -37.2323112487793, + "logps/rejected": -83.77207946777344, + "loss": 0.1824, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1664974689483643, + "rewards/margins": 0.8208446502685547, + "rewards/rejected": 1.3456528186798096, + "step": 9965 + }, + { + "epoch": 1.62, + "learning_rate": 9.285067813378956e-07, + "logits/chosen": -1.2422071695327759, + "logits/rejected": -1.2070276737213135, + "logps/chosen": -66.5816650390625, + "logps/rejected": -58.27286911010742, + "loss": 0.333, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.646620273590088, + "rewards/margins": 0.3200681209564209, + "rewards/rejected": 2.326552152633667, + "step": 9966 + }, + { + "epoch": 1.62, + "learning_rate": 9.277440683654276e-07, + "logits/chosen": -1.3707225322723389, + "logits/rejected": -1.4055618047714233, + "logps/chosen": -86.47796630859375, + "logps/rejected": -88.25689697265625, + "loss": 0.3433, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7517197132110596, + "rewards/margins": 0.33579325675964355, + "rewards/rejected": 2.415926456451416, + "step": 9967 + }, + { + "epoch": 1.62, + "learning_rate": 9.269816367473139e-07, + "logits/chosen": -1.3239383697509766, + "logits/rejected": -1.302562952041626, + "logps/chosen": -49.048614501953125, + "logps/rejected": -56.69941711425781, + "loss": 0.6161, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.314830780029297, + "rewards/margins": -0.2660484313964844, + "rewards/rejected": 2.5808792114257812, + "step": 9968 + }, + { + "epoch": 1.62, + "learning_rate": 9.262194865362284e-07, + "logits/chosen": -1.0499998331069946, + "logits/rejected": -1.1020783185958862, + "logps/chosen": -110.33846282958984, + "logps/rejected": -72.39568328857422, + "loss": 0.6204, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7252998352050781, + "rewards/margins": -0.7866501808166504, + "rewards/rejected": 2.5119500160217285, + "step": 9969 + }, + { + "epoch": 1.62, + "learning_rate": 9.254576177848313e-07, + "logits/chosen": -1.1796543598175049, + "logits/rejected": -1.1580328941345215, + "logps/chosen": -196.7796173095703, + "logps/rejected": -42.3147087097168, + "loss": 0.0753, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.420958042144775, + "rewards/margins": 4.261636734008789, + "rewards/rejected": 2.1593213081359863, + "step": 9970 + }, + { + "epoch": 1.62, + "learning_rate": 9.246960305457581e-07, + "logits/chosen": -0.9054762125015259, + "logits/rejected": -0.8291473388671875, + "logps/chosen": -59.77850341796875, + "logps/rejected": -30.838720321655273, + "loss": 0.0816, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.756312608718872, + "rewards/margins": 2.1789631843566895, + "rewards/rejected": 0.5773493051528931, + "step": 9971 + }, + { + "epoch": 1.62, + "learning_rate": 9.23934724871629e-07, + "logits/chosen": -1.2307367324829102, + "logits/rejected": -1.2827855348587036, + "logps/chosen": -88.94856262207031, + "logps/rejected": -91.51860809326172, + "loss": 0.8783, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.818006992340088, + "rewards/margins": 0.3983602523803711, + "rewards/rejected": 4.419646739959717, + "step": 9972 + }, + { + "epoch": 1.62, + "learning_rate": 9.231737008150416e-07, + "logits/chosen": -1.158697247505188, + "logits/rejected": -1.1395041942596436, + "logps/chosen": -104.78704833984375, + "logps/rejected": -109.91240692138672, + "loss": 0.4308, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.597613573074341, + "rewards/margins": 1.6961236000061035, + "rewards/rejected": 0.9014900326728821, + "step": 9973 + }, + { + "epoch": 1.62, + "learning_rate": 9.224129584285768e-07, + "logits/chosen": -1.0136741399765015, + "logits/rejected": -1.0136741399765015, + "logps/chosen": -12.835044860839844, + "logps/rejected": -12.835044860839844, + "loss": 0.3614, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.565786838531494, + "rewards/margins": 0.0, + "rewards/rejected": 2.565786838531494, + "step": 9974 + }, + { + "epoch": 1.62, + "learning_rate": 9.216524977647934e-07, + "logits/chosen": -1.5678619146347046, + "logits/rejected": -1.4646120071411133, + "logps/chosen": -62.52721405029297, + "logps/rejected": -14.69485855102539, + "loss": 0.0732, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2410104274749756, + "rewards/margins": 2.3336241245269775, + "rewards/rejected": 0.9073862433433533, + "step": 9975 + }, + { + "epoch": 1.62, + "learning_rate": 9.208923188762337e-07, + "logits/chosen": -1.3428314924240112, + "logits/rejected": -1.2758023738861084, + "logps/chosen": -188.59329223632812, + "logps/rejected": -69.20682525634766, + "loss": 0.0526, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.213799953460693, + "rewards/margins": 3.763462781906128, + "rewards/rejected": 3.4503371715545654, + "step": 9976 + }, + { + "epoch": 1.62, + "learning_rate": 9.201324218154168e-07, + "logits/chosen": -1.4909049272537231, + "logits/rejected": -1.4545196294784546, + "logps/chosen": -52.0561637878418, + "logps/rejected": -49.09789276123047, + "loss": 0.3826, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0993993282318115, + "rewards/margins": 0.12217223644256592, + "rewards/rejected": 1.9772270917892456, + "step": 9977 + }, + { + "epoch": 1.62, + "learning_rate": 9.193728066348467e-07, + "logits/chosen": -1.3871781826019287, + "logits/rejected": -1.364823341369629, + "logps/chosen": -70.10231018066406, + "logps/rejected": -65.32597351074219, + "loss": 1.0467, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.945629835128784, + "rewards/margins": -0.6007568836212158, + "rewards/rejected": 3.54638671875, + "step": 9978 + }, + { + "epoch": 1.62, + "learning_rate": 9.186134733870028e-07, + "logits/chosen": -1.0608346462249756, + "logits/rejected": -0.9981947541236877, + "logps/chosen": -77.65431213378906, + "logps/rejected": -72.32801818847656, + "loss": 1.021, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.1046366691589355, + "rewards/margins": -0.9032773971557617, + "rewards/rejected": 5.007914066314697, + "step": 9979 + }, + { + "epoch": 1.62, + "learning_rate": 9.178544221243513e-07, + "logits/chosen": -1.321732997894287, + "logits/rejected": -1.2679167985916138, + "logps/chosen": -88.52471923828125, + "logps/rejected": -90.06759643554688, + "loss": 0.5748, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.772212266921997, + "rewards/margins": 0.4773697853088379, + "rewards/rejected": 2.294842481613159, + "step": 9980 + }, + { + "epoch": 1.62, + "learning_rate": 9.170956528993319e-07, + "logits/chosen": -1.4933967590332031, + "logits/rejected": -1.4408345222473145, + "logps/chosen": -89.67192840576172, + "logps/rejected": -111.9352035522461, + "loss": 0.2209, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.26452112197876, + "rewards/margins": 2.3990888595581055, + "rewards/rejected": 4.865432262420654, + "step": 9981 + }, + { + "epoch": 1.62, + "learning_rate": 9.163371657643716e-07, + "logits/chosen": -0.9872975945472717, + "logits/rejected": -0.9872975945472717, + "logps/chosen": -27.601215362548828, + "logps/rejected": -27.601215362548828, + "loss": 0.71, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3274803161621094, + "rewards/margins": 0.0, + "rewards/rejected": 3.3274803161621094, + "step": 9982 + }, + { + "epoch": 1.62, + "learning_rate": 9.155789607718718e-07, + "logits/chosen": -1.2466213703155518, + "logits/rejected": -1.1844439506530762, + "logps/chosen": -75.3941650390625, + "logps/rejected": -61.61511993408203, + "loss": 0.5872, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1418190002441406, + "rewards/margins": 0.16790771484375, + "rewards/rejected": 1.9739112854003906, + "step": 9983 + }, + { + "epoch": 1.62, + "learning_rate": 9.1482103797422e-07, + "logits/chosen": -1.2119258642196655, + "logits/rejected": -1.0597596168518066, + "logps/chosen": -72.58383178710938, + "logps/rejected": -46.286468505859375, + "loss": 0.1911, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9982101917266846, + "rewards/margins": 1.2315514087677002, + "rewards/rejected": 0.7666587829589844, + "step": 9984 + }, + { + "epoch": 1.62, + "learning_rate": 9.140633974237789e-07, + "logits/chosen": -1.3971176147460938, + "logits/rejected": -1.3971176147460938, + "logps/chosen": -45.7537727355957, + "logps/rejected": -45.7537727355957, + "loss": 0.3643, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.3373517990112305, + "rewards/margins": 0.0, + "rewards/rejected": 4.3373517990112305, + "step": 9985 + }, + { + "epoch": 1.62, + "learning_rate": 9.133060391728965e-07, + "logits/chosen": -0.8254514932632446, + "logits/rejected": -0.8202889561653137, + "logps/chosen": -5.919097423553467, + "logps/rejected": -5.752982139587402, + "loss": 1.9392, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3792198598384857, + "rewards/margins": 0.24234728515148163, + "rewards/rejected": 0.1368725746870041, + "step": 9986 + }, + { + "epoch": 1.62, + "learning_rate": 9.125489632738971e-07, + "logits/chosen": -1.4096068143844604, + "logits/rejected": -0.9951494932174683, + "logps/chosen": -73.13560485839844, + "logps/rejected": -93.05632019042969, + "loss": 1.9026, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.599538564682007, + "rewards/margins": -0.8759315013885498, + "rewards/rejected": 3.4754700660705566, + "step": 9987 + }, + { + "epoch": 1.62, + "learning_rate": 9.117921697790899e-07, + "logits/chosen": -1.2543352842330933, + "logits/rejected": -1.2229100465774536, + "logps/chosen": -70.26220703125, + "logps/rejected": -52.39161682128906, + "loss": 0.423, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7082382440567017, + "rewards/margins": 0.35242462158203125, + "rewards/rejected": 1.3558136224746704, + "step": 9988 + }, + { + "epoch": 1.62, + "learning_rate": 9.110356587407592e-07, + "logits/chosen": -1.5362091064453125, + "logits/rejected": -1.4446355104446411, + "logps/chosen": -116.90614318847656, + "logps/rejected": -25.568603515625, + "loss": 1.2416, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3630433082580566, + "rewards/margins": 1.5678120851516724, + "rewards/rejected": 1.7952312231063843, + "step": 9989 + }, + { + "epoch": 1.62, + "learning_rate": 9.102794302111751e-07, + "logits/chosen": -1.0772840976715088, + "logits/rejected": -1.0998151302337646, + "logps/chosen": -62.2520751953125, + "logps/rejected": -85.14476013183594, + "loss": 2.9035, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7912276983261108, + "rewards/margins": -1.0843697786331177, + "rewards/rejected": 2.8755974769592285, + "step": 9990 + }, + { + "epoch": 1.62, + "learning_rate": 9.095234842425854e-07, + "logits/chosen": -0.9635093212127686, + "logits/rejected": -1.0466282367706299, + "logps/chosen": -75.6641845703125, + "logps/rejected": -93.63148498535156, + "loss": 0.5679, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7286102771759033, + "rewards/margins": -0.38820338249206543, + "rewards/rejected": 2.1168136596679688, + "step": 9991 + }, + { + "epoch": 1.62, + "learning_rate": 9.087678208872175e-07, + "logits/chosen": -1.410170078277588, + "logits/rejected": -1.3529707193374634, + "logps/chosen": -81.96549987792969, + "logps/rejected": -74.86181640625, + "loss": 0.245, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7276253700256348, + "rewards/margins": 2.708651065826416, + "rewards/rejected": 0.01897430419921875, + "step": 9992 + }, + { + "epoch": 1.62, + "learning_rate": 9.080124401972834e-07, + "logits/chosen": -1.1334558725357056, + "logits/rejected": -1.162509560585022, + "logps/chosen": -70.53082275390625, + "logps/rejected": -107.60791778564453, + "loss": 0.8028, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9940757751464844, + "rewards/margins": -1.3583712577819824, + "rewards/rejected": 4.352447032928467, + "step": 9993 + }, + { + "epoch": 1.62, + "learning_rate": 9.072573422249692e-07, + "logits/chosen": -1.2447367906570435, + "logits/rejected": -1.2277663946151733, + "logps/chosen": -120.93380737304688, + "logps/rejected": -88.39753723144531, + "loss": 0.091, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.740527629852295, + "rewards/margins": 1.6480560302734375, + "rewards/rejected": 4.092471599578857, + "step": 9994 + }, + { + "epoch": 1.62, + "learning_rate": 9.065025270224482e-07, + "logits/chosen": -1.1420749425888062, + "logits/rejected": -1.069559097290039, + "logps/chosen": -31.300819396972656, + "logps/rejected": -37.805747985839844, + "loss": 0.3089, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7809128761291504, + "rewards/margins": 0.1638507843017578, + "rewards/rejected": 2.6170620918273926, + "step": 9995 + }, + { + "epoch": 1.62, + "learning_rate": 9.057479946418679e-07, + "logits/chosen": -0.9676486253738403, + "logits/rejected": -0.9371559619903564, + "logps/chosen": -94.67385864257812, + "logps/rejected": -62.134647369384766, + "loss": 0.1842, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8840240240097046, + "rewards/margins": 0.8530802726745605, + "rewards/rejected": 1.030943751335144, + "step": 9996 + }, + { + "epoch": 1.62, + "learning_rate": 9.049937451353624e-07, + "logits/chosen": -1.3021433353424072, + "logits/rejected": -1.373584508895874, + "logps/chosen": -142.2631378173828, + "logps/rejected": -117.97198486328125, + "loss": 2.0265, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.3012542724609375, + "rewards/margins": -0.8676695823669434, + "rewards/rejected": 7.168923854827881, + "step": 9997 + }, + { + "epoch": 1.62, + "learning_rate": 9.042397785550405e-07, + "logits/chosen": -1.55170738697052, + "logits/rejected": -1.5583022832870483, + "logps/chosen": -77.11007690429688, + "logps/rejected": -88.00325012207031, + "loss": 0.2168, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3145196437835693, + "rewards/margins": 0.9736207723617554, + "rewards/rejected": 1.340898871421814, + "step": 9998 + }, + { + "epoch": 1.62, + "learning_rate": 9.034860949529973e-07, + "logits/chosen": -1.0640336275100708, + "logits/rejected": -1.0510714054107666, + "logps/chosen": -49.029258728027344, + "logps/rejected": -79.44585418701172, + "loss": 0.9422, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.580568790435791, + "rewards/margins": -1.2868354320526123, + "rewards/rejected": 3.8674042224884033, + "step": 9999 + }, + { + "epoch": 1.62, + "learning_rate": 9.027326943813014e-07, + "logits/chosen": -1.3934836387634277, + "logits/rejected": -1.3672435283660889, + "logps/chosen": -158.072021484375, + "logps/rejected": -153.6425323486328, + "loss": 0.1624, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.49896240234375, + "rewards/margins": 1.2748823165893555, + "rewards/rejected": 6.2240800857543945, + "step": 10000 + }, + { + "epoch": 1.62, + "learning_rate": 9.019795768920093e-07, + "logits/chosen": -1.27476167678833, + "logits/rejected": -1.1791332960128784, + "logps/chosen": -72.27471923828125, + "logps/rejected": -25.72934341430664, + "loss": 0.5298, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.605207920074463, + "rewards/margins": 0.8149278163909912, + "rewards/rejected": 2.7902801036834717, + "step": 10001 + }, + { + "epoch": 1.62, + "learning_rate": 9.012267425371513e-07, + "logits/chosen": -1.190946340560913, + "logits/rejected": -1.1205235719680786, + "logps/chosen": -216.92684936523438, + "logps/rejected": -10.550159454345703, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.026754856109619, + "rewards/margins": 3.700822591781616, + "rewards/rejected": 0.3259323239326477, + "step": 10002 + }, + { + "epoch": 1.62, + "learning_rate": 9.004741913687432e-07, + "logits/chosen": -1.2995221614837646, + "logits/rejected": -1.2943581342697144, + "logps/chosen": -74.20291137695312, + "logps/rejected": -93.76972961425781, + "loss": 0.4195, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.602276563644409, + "rewards/margins": 0.8778609037399292, + "rewards/rejected": 1.72441565990448, + "step": 10003 + }, + { + "epoch": 1.62, + "learning_rate": 8.997219234387777e-07, + "logits/chosen": -1.0298594236373901, + "logits/rejected": -1.0367043018341064, + "logps/chosen": -4.689672470092773, + "logps/rejected": -1.672872543334961, + "loss": 0.7378, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.13845978677272797, + "rewards/margins": -0.17207859456539154, + "rewards/rejected": 0.3105383813381195, + "step": 10004 + }, + { + "epoch": 1.62, + "learning_rate": 8.989699387992313e-07, + "logits/chosen": -1.1804910898208618, + "logits/rejected": -1.1591500043869019, + "logps/chosen": -72.76363372802734, + "logps/rejected": -110.83683776855469, + "loss": 0.4165, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.514347791671753, + "rewards/margins": 1.7216750383377075, + "rewards/rejected": 0.7926727533340454, + "step": 10005 + }, + { + "epoch": 1.62, + "learning_rate": 8.982182375020565e-07, + "logits/chosen": -1.0766206979751587, + "logits/rejected": -1.0569695234298706, + "logps/chosen": -40.086143493652344, + "logps/rejected": -61.55458068847656, + "loss": 0.7296, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.919519066810608, + "rewards/margins": -0.9610642194747925, + "rewards/rejected": 2.8805832862854004, + "step": 10006 + }, + { + "epoch": 1.62, + "learning_rate": 8.974668195991909e-07, + "logits/chosen": -1.109819769859314, + "logits/rejected": -1.043903112411499, + "logps/chosen": -38.405494689941406, + "logps/rejected": -68.54598999023438, + "loss": 0.6458, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.333223819732666, + "rewards/margins": -0.9505438804626465, + "rewards/rejected": 3.2837677001953125, + "step": 10007 + }, + { + "epoch": 1.62, + "learning_rate": 8.96715685142549e-07, + "logits/chosen": -1.0852164030075073, + "logits/rejected": -0.9373549222946167, + "logps/chosen": -72.74859619140625, + "logps/rejected": -54.0662841796875, + "loss": 0.288, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.525224447250366, + "rewards/margins": 1.5376291275024414, + "rewards/rejected": 0.9875953793525696, + "step": 10008 + }, + { + "epoch": 1.62, + "learning_rate": 8.959648341840283e-07, + "logits/chosen": -1.331838846206665, + "logits/rejected": -1.3560949563980103, + "logps/chosen": -79.49512481689453, + "logps/rejected": -81.16913604736328, + "loss": 0.4283, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.323223114013672, + "rewards/margins": 1.3554458618164062, + "rewards/rejected": 0.9677772521972656, + "step": 10009 + }, + { + "epoch": 1.62, + "learning_rate": 8.952142667755038e-07, + "logits/chosen": -1.1564719676971436, + "logits/rejected": -1.1477911472320557, + "logps/chosen": -61.87774658203125, + "logps/rejected": -79.3502426147461, + "loss": 0.47, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2881743907928467, + "rewards/margins": 1.101891279220581, + "rewards/rejected": 1.1862831115722656, + "step": 10010 + }, + { + "epoch": 1.62, + "learning_rate": 8.944639829688351e-07, + "logits/chosen": -1.5679134130477905, + "logits/rejected": -1.586172342300415, + "logps/chosen": -54.48451232910156, + "logps/rejected": -76.24040222167969, + "loss": 0.9642, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7058465480804443, + "rewards/margins": -1.722923994064331, + "rewards/rejected": 4.428770542144775, + "step": 10011 + }, + { + "epoch": 1.63, + "learning_rate": 8.93713982815857e-07, + "logits/chosen": -1.042846918106079, + "logits/rejected": -1.042846918106079, + "logps/chosen": -5.888750076293945, + "logps/rejected": -5.888750076293945, + "loss": 0.5941, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.665147602558136, + "rewards/margins": 0.0, + "rewards/rejected": 0.665147602558136, + "step": 10012 + }, + { + "epoch": 1.63, + "learning_rate": 8.929642663683896e-07, + "logits/chosen": -0.9424176216125488, + "logits/rejected": -0.9137106537818909, + "logps/chosen": -47.69813537597656, + "logps/rejected": -18.049436569213867, + "loss": 0.239, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5955883264541626, + "rewards/margins": 1.03994619846344, + "rewards/rejected": 0.5556421279907227, + "step": 10013 + }, + { + "epoch": 1.63, + "learning_rate": 8.922148336782288e-07, + "logits/chosen": -1.6633281707763672, + "logits/rejected": -1.6879847049713135, + "logps/chosen": -77.0799560546875, + "logps/rejected": -73.10408020019531, + "loss": 2.1978, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.1669816970825195, + "rewards/margins": -3.257049560546875, + "rewards/rejected": 7.4240312576293945, + "step": 10014 + }, + { + "epoch": 1.63, + "learning_rate": 8.914656847971565e-07, + "logits/chosen": -1.4494733810424805, + "logits/rejected": -1.4000638723373413, + "logps/chosen": -87.03703308105469, + "logps/rejected": -57.9337158203125, + "loss": 0.7204, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.435168504714966, + "rewards/margins": -0.7960395812988281, + "rewards/rejected": 3.231208086013794, + "step": 10015 + }, + { + "epoch": 1.63, + "learning_rate": 8.907168197769284e-07, + "logits/chosen": -0.9842506647109985, + "logits/rejected": -1.159825325012207, + "logps/chosen": -46.29462432861328, + "logps/rejected": -119.3863525390625, + "loss": 1.2884, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.923386335372925, + "rewards/margins": -2.288738489151001, + "rewards/rejected": 6.212124824523926, + "step": 10016 + }, + { + "epoch": 1.63, + "learning_rate": 8.89968238669287e-07, + "logits/chosen": -0.910233199596405, + "logits/rejected": -0.8179110884666443, + "logps/chosen": -47.6195068359375, + "logps/rejected": -62.465538024902344, + "loss": 1.8157, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.761991262435913, + "rewards/margins": 1.479989767074585, + "rewards/rejected": 1.2820014953613281, + "step": 10017 + }, + { + "epoch": 1.63, + "learning_rate": 8.892199415259501e-07, + "logits/chosen": -1.2660753726959229, + "logits/rejected": -1.2358267307281494, + "logps/chosen": -142.61090087890625, + "logps/rejected": -90.07481384277344, + "loss": 1.1028, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.422256469726562, + "rewards/margins": 0.23502349853515625, + "rewards/rejected": 8.187232971191406, + "step": 10018 + }, + { + "epoch": 1.63, + "learning_rate": 8.884719283986193e-07, + "logits/chosen": -1.3710606098175049, + "logits/rejected": -1.5410040616989136, + "logps/chosen": -57.79898452758789, + "logps/rejected": -34.563194274902344, + "loss": 0.0405, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9240224361419678, + "rewards/margins": 2.649291753768921, + "rewards/rejected": 0.2747306823730469, + "step": 10019 + }, + { + "epoch": 1.63, + "learning_rate": 8.877241993389735e-07, + "logits/chosen": -1.1408796310424805, + "logits/rejected": -1.225770354270935, + "logps/chosen": -171.07864379882812, + "logps/rejected": -113.24591064453125, + "loss": 0.133, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.4546356201171875, + "rewards/margins": 1.1954374313354492, + "rewards/rejected": 4.259198188781738, + "step": 10020 + }, + { + "epoch": 1.63, + "learning_rate": 8.869767543986763e-07, + "logits/chosen": -1.3540383577346802, + "logits/rejected": -1.3343968391418457, + "logps/chosen": -87.50827026367188, + "logps/rejected": -90.50729370117188, + "loss": 0.6458, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6773117780685425, + "rewards/margins": 0.10856020450592041, + "rewards/rejected": 1.568751573562622, + "step": 10021 + }, + { + "epoch": 1.63, + "learning_rate": 8.862295936293658e-07, + "logits/chosen": -1.3202879428863525, + "logits/rejected": -1.3175722360610962, + "logps/chosen": -46.09201431274414, + "logps/rejected": -68.25581359863281, + "loss": 1.9713, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2362592220306396, + "rewards/margins": -2.5392444133758545, + "rewards/rejected": 4.775503635406494, + "step": 10022 + }, + { + "epoch": 1.63, + "learning_rate": 8.854827170826675e-07, + "logits/chosen": -1.181091547012329, + "logits/rejected": -1.1838852167129517, + "logps/chosen": -52.88762283325195, + "logps/rejected": -86.39790344238281, + "loss": 0.3458, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9722225666046143, + "rewards/margins": 0.9051357507705688, + "rewards/rejected": 1.0670868158340454, + "step": 10023 + }, + { + "epoch": 1.63, + "learning_rate": 8.847361248101799e-07, + "logits/chosen": -1.1346973180770874, + "logits/rejected": -1.1608846187591553, + "logps/chosen": -72.82007598876953, + "logps/rejected": -85.71314239501953, + "loss": 2.1473, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4421653747558594, + "rewards/margins": -2.7613067626953125, + "rewards/rejected": 5.203472137451172, + "step": 10024 + }, + { + "epoch": 1.63, + "learning_rate": 8.839898168634881e-07, + "logits/chosen": -1.0848987102508545, + "logits/rejected": -0.9998413324356079, + "logps/chosen": -37.83935546875, + "logps/rejected": -44.45466995239258, + "loss": 0.2441, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.101140022277832, + "rewards/margins": 1.1068120002746582, + "rewards/rejected": 3.994328022003174, + "step": 10025 + }, + { + "epoch": 1.63, + "learning_rate": 8.832437932941528e-07, + "logits/chosen": -1.1922099590301514, + "logits/rejected": -1.0855056047439575, + "logps/chosen": -94.91382598876953, + "logps/rejected": -57.72352981567383, + "loss": 0.116, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6161766052246094, + "rewards/margins": 1.8525692224502563, + "rewards/rejected": 1.763607382774353, + "step": 10026 + }, + { + "epoch": 1.63, + "learning_rate": 8.824980541537187e-07, + "logits/chosen": -1.329343557357788, + "logits/rejected": -1.293007254600525, + "logps/chosen": -36.854736328125, + "logps/rejected": -26.191492080688477, + "loss": 0.7257, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9980369806289673, + "rewards/margins": 0.341930627822876, + "rewards/rejected": 1.6561063528060913, + "step": 10027 + }, + { + "epoch": 1.63, + "learning_rate": 8.817525994937109e-07, + "logits/chosen": -1.4336098432540894, + "logits/rejected": -1.4325129985809326, + "logps/chosen": -61.38311767578125, + "logps/rejected": -77.06770324707031, + "loss": 2.1827, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4884071350097656, + "rewards/margins": 0.6155111789703369, + "rewards/rejected": 2.8728959560394287, + "step": 10028 + }, + { + "epoch": 1.63, + "learning_rate": 8.81007429365629e-07, + "logits/chosen": -0.9922753572463989, + "logits/rejected": -0.9922753572463989, + "logps/chosen": -53.45396423339844, + "logps/rejected": -53.45396423339844, + "loss": 0.8278, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4491899013519287, + "rewards/margins": 0.0, + "rewards/rejected": 2.4491899013519287, + "step": 10029 + }, + { + "epoch": 1.63, + "learning_rate": 8.802625438209606e-07, + "logits/chosen": -1.4339392185211182, + "logits/rejected": -1.299918293952942, + "logps/chosen": -155.63343811035156, + "logps/rejected": -157.64122009277344, + "loss": 0.0668, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.270951747894287, + "rewards/margins": 1.9932584762573242, + "rewards/rejected": 5.277693271636963, + "step": 10030 + }, + { + "epoch": 1.63, + "learning_rate": 8.795179429111677e-07, + "logits/chosen": -1.427682876586914, + "logits/rejected": -1.4152655601501465, + "logps/chosen": -69.66749572753906, + "logps/rejected": -91.97749328613281, + "loss": 0.1784, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.442340135574341, + "rewards/margins": 0.8779335021972656, + "rewards/rejected": 1.5644066333770752, + "step": 10031 + }, + { + "epoch": 1.63, + "learning_rate": 8.787736266876984e-07, + "logits/chosen": -1.3761249780654907, + "logits/rejected": -1.3336762189865112, + "logps/chosen": -140.87840270996094, + "logps/rejected": -39.306549072265625, + "loss": 0.0973, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.046245098114014, + "rewards/margins": 6.135109901428223, + "rewards/rejected": 0.9111351370811462, + "step": 10032 + }, + { + "epoch": 1.63, + "learning_rate": 8.78029595201974e-07, + "logits/chosen": -0.7204486727714539, + "logits/rejected": -0.7228398323059082, + "logps/chosen": -2.203235387802124, + "logps/rejected": -5.644232273101807, + "loss": 1.2159, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2940874397754669, + "rewards/margins": -0.039623767137527466, + "rewards/rejected": 0.3337112069129944, + "step": 10033 + }, + { + "epoch": 1.63, + "learning_rate": 8.772858485054042e-07, + "logits/chosen": -1.18136727809906, + "logits/rejected": -1.1839349269866943, + "logps/chosen": -70.70804595947266, + "logps/rejected": -74.0560302734375, + "loss": 0.9169, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.999021291732788, + "rewards/margins": -1.4261300563812256, + "rewards/rejected": 4.425151348114014, + "step": 10034 + }, + { + "epoch": 1.63, + "learning_rate": 8.76542386649371e-07, + "logits/chosen": -1.1123383045196533, + "logits/rejected": -1.1557555198669434, + "logps/chosen": -56.90681457519531, + "logps/rejected": -91.03404235839844, + "loss": 0.0368, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4788193702697754, + "rewards/margins": 2.609555244445801, + "rewards/rejected": -0.13073578476905823, + "step": 10035 + }, + { + "epoch": 1.63, + "learning_rate": 8.757992096852441e-07, + "logits/chosen": -1.1839736700057983, + "logits/rejected": -0.9810270667076111, + "logps/chosen": -83.47026062011719, + "logps/rejected": -25.894540786743164, + "loss": 0.6331, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.167773485183716, + "rewards/margins": 3.0060317516326904, + "rewards/rejected": 0.16174183785915375, + "step": 10036 + }, + { + "epoch": 1.63, + "learning_rate": 8.750563176643667e-07, + "logits/chosen": -1.1072405576705933, + "logits/rejected": -1.0979559421539307, + "logps/chosen": -76.96160888671875, + "logps/rejected": -59.94192123413086, + "loss": 0.1074, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3628082275390625, + "rewards/margins": 1.45146906375885, + "rewards/rejected": 1.9113391637802124, + "step": 10037 + }, + { + "epoch": 1.63, + "learning_rate": 8.743137106380683e-07, + "logits/chosen": -0.9324187636375427, + "logits/rejected": -0.8581836819648743, + "logps/chosen": -33.953590393066406, + "logps/rejected": -60.93495178222656, + "loss": 0.2674, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.066716432571411, + "rewards/margins": 0.4435398578643799, + "rewards/rejected": 1.6231765747070312, + "step": 10038 + }, + { + "epoch": 1.63, + "learning_rate": 8.735713886576536e-07, + "logits/chosen": -1.321166753768921, + "logits/rejected": -1.3159468173980713, + "logps/chosen": -60.035980224609375, + "logps/rejected": -92.30857849121094, + "loss": 1.6203, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4837663173675537, + "rewards/margins": 2.6882708072662354, + "rewards/rejected": -0.20450440049171448, + "step": 10039 + }, + { + "epoch": 1.63, + "learning_rate": 8.72829351774413e-07, + "logits/chosen": -1.2989071607589722, + "logits/rejected": -1.4847520589828491, + "logps/chosen": -86.9520492553711, + "logps/rejected": -151.44866943359375, + "loss": 2.2659, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.744384288787842, + "rewards/margins": -4.1678595542907715, + "rewards/rejected": 8.912243843078613, + "step": 10040 + }, + { + "epoch": 1.63, + "learning_rate": 8.720876000396105e-07, + "logits/chosen": -1.428012728691101, + "logits/rejected": -1.353177547454834, + "logps/chosen": -220.8335418701172, + "logps/rejected": -57.591461181640625, + "loss": 0.5392, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.210432529449463, + "rewards/margins": 4.018324375152588, + "rewards/rejected": 1.192108154296875, + "step": 10041 + }, + { + "epoch": 1.63, + "learning_rate": 8.713461335044981e-07, + "logits/chosen": -1.1181743144989014, + "logits/rejected": -1.0001829862594604, + "logps/chosen": -67.466796875, + "logps/rejected": -22.804170608520508, + "loss": 0.5385, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2065765857696533, + "rewards/margins": 0.7594887018203735, + "rewards/rejected": 0.4470878541469574, + "step": 10042 + }, + { + "epoch": 1.63, + "learning_rate": 8.706049522203008e-07, + "logits/chosen": -1.1023081541061401, + "logits/rejected": -1.1128830909729004, + "logps/chosen": -1.8675525188446045, + "logps/rejected": -13.960376739501953, + "loss": 1.0687, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.29058870673179626, + "rewards/margins": -0.4840258061885834, + "rewards/rejected": 0.7746145129203796, + "step": 10043 + }, + { + "epoch": 1.63, + "learning_rate": 8.698640562382299e-07, + "logits/chosen": -1.4215471744537354, + "logits/rejected": -1.546526551246643, + "logps/chosen": -115.28688049316406, + "logps/rejected": -184.01937866210938, + "loss": 1.1657, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1666839122772217, + "rewards/margins": -0.6930756568908691, + "rewards/rejected": 2.859759569168091, + "step": 10044 + }, + { + "epoch": 1.63, + "learning_rate": 8.691234456094716e-07, + "logits/chosen": -1.2935839891433716, + "logits/rejected": -1.3616321086883545, + "logps/chosen": -246.6318359375, + "logps/rejected": -119.96755981445312, + "loss": 0.6696, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.268365383148193, + "rewards/margins": -0.1369156837463379, + "rewards/rejected": 7.405281066894531, + "step": 10045 + }, + { + "epoch": 1.63, + "learning_rate": 8.683831203851967e-07, + "logits/chosen": -1.2209899425506592, + "logits/rejected": -1.2209899425506592, + "logps/chosen": -3.3613533973693848, + "logps/rejected": -3.3613533973693848, + "loss": 0.5159, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.47658583521842957, + "rewards/margins": 0.0, + "rewards/rejected": 0.47658583521842957, + "step": 10046 + }, + { + "epoch": 1.63, + "learning_rate": 8.676430806165553e-07, + "logits/chosen": -1.4869354963302612, + "logits/rejected": -1.4824018478393555, + "logps/chosen": -54.01045608520508, + "logps/rejected": -44.531192779541016, + "loss": 0.3303, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.857938766479492, + "rewards/margins": 0.13230657577514648, + "rewards/rejected": 2.7256321907043457, + "step": 10047 + }, + { + "epoch": 1.63, + "learning_rate": 8.669033263546756e-07, + "logits/chosen": -1.3714553117752075, + "logits/rejected": -1.465178370475769, + "logps/chosen": -224.4015350341797, + "logps/rejected": -78.29135131835938, + "loss": 0.0467, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.539250373840332, + "rewards/margins": 3.2960145473480225, + "rewards/rejected": 2.2432358264923096, + "step": 10048 + }, + { + "epoch": 1.63, + "learning_rate": 8.661638576506693e-07, + "logits/chosen": -1.2385214567184448, + "logits/rejected": -1.256192922592163, + "logps/chosen": -40.235076904296875, + "logps/rejected": -85.38397216796875, + "loss": 0.2348, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.36700439453125, + "rewards/margins": 1.5764023065567017, + "rewards/rejected": 1.7906020879745483, + "step": 10049 + }, + { + "epoch": 1.63, + "learning_rate": 8.654246745556244e-07, + "logits/chosen": -1.3083851337432861, + "logits/rejected": -1.3356376886367798, + "logps/chosen": -65.11196899414062, + "logps/rejected": -94.36326599121094, + "loss": 0.4602, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.261481523513794, + "rewards/margins": 0.6636643409729004, + "rewards/rejected": 2.5978171825408936, + "step": 10050 + }, + { + "epoch": 1.63, + "learning_rate": 8.64685777120614e-07, + "logits/chosen": -1.3723828792572021, + "logits/rejected": -1.1962655782699585, + "logps/chosen": -102.16276550292969, + "logps/rejected": -62.05174255371094, + "loss": 0.0244, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.109139919281006, + "rewards/margins": 3.198622703552246, + "rewards/rejected": 2.9105172157287598, + "step": 10051 + }, + { + "epoch": 1.63, + "learning_rate": 8.639471653966869e-07, + "logits/chosen": -1.0044665336608887, + "logits/rejected": -1.0057940483093262, + "logps/chosen": -56.55364227294922, + "logps/rejected": -86.32067108154297, + "loss": 0.5021, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6917915344238281, + "rewards/margins": 0.16673052310943604, + "rewards/rejected": 1.525061011314392, + "step": 10052 + }, + { + "epoch": 1.63, + "learning_rate": 8.632088394348759e-07, + "logits/chosen": -0.9351184368133545, + "logits/rejected": -0.989916205406189, + "logps/chosen": -25.98946762084961, + "logps/rejected": -47.04447937011719, + "loss": 0.6716, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7732013463974, + "rewards/margins": -0.4198261499404907, + "rewards/rejected": 2.1930274963378906, + "step": 10053 + }, + { + "epoch": 1.63, + "learning_rate": 8.624707992861897e-07, + "logits/chosen": -1.0924310684204102, + "logits/rejected": -1.0731563568115234, + "logps/chosen": -16.086700439453125, + "logps/rejected": -3.9306440353393555, + "loss": 0.6089, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7473911643028259, + "rewards/margins": 0.43346211314201355, + "rewards/rejected": 0.3139290511608124, + "step": 10054 + }, + { + "epoch": 1.63, + "learning_rate": 8.617330450016237e-07, + "logits/chosen": -1.1355388164520264, + "logits/rejected": -1.0755434036254883, + "logps/chosen": -64.67720794677734, + "logps/rejected": -50.32489013671875, + "loss": 1.2948, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6823891401290894, + "rewards/margins": -0.8909400701522827, + "rewards/rejected": 2.573329210281372, + "step": 10055 + }, + { + "epoch": 1.63, + "learning_rate": 8.609955766321459e-07, + "logits/chosen": -1.2495688199996948, + "logits/rejected": -1.1823838949203491, + "logps/chosen": -81.16941833496094, + "logps/rejected": -105.1209945678711, + "loss": 0.0478, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.89302396774292, + "rewards/margins": 2.3120462894439697, + "rewards/rejected": 2.58097767829895, + "step": 10056 + }, + { + "epoch": 1.63, + "learning_rate": 8.602583942287113e-07, + "logits/chosen": -1.4611085653305054, + "logits/rejected": -1.301719307899475, + "logps/chosen": -104.2001724243164, + "logps/rejected": -60.606136322021484, + "loss": 0.8281, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.047634124755859, + "rewards/margins": 4.497752666473389, + "rewards/rejected": 2.5498814582824707, + "step": 10057 + }, + { + "epoch": 1.63, + "learning_rate": 8.5952149784225e-07, + "logits/chosen": -1.1919955015182495, + "logits/rejected": -1.1615358591079712, + "logps/chosen": -32.09868621826172, + "logps/rejected": -10.330195426940918, + "loss": 0.2592, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.059924840927124, + "rewards/margins": 1.0254334211349487, + "rewards/rejected": 1.0344914197921753, + "step": 10058 + }, + { + "epoch": 1.63, + "learning_rate": 8.58784887523677e-07, + "logits/chosen": -1.2467409372329712, + "logits/rejected": -1.2072666883468628, + "logps/chosen": -101.6127700805664, + "logps/rejected": -58.115196228027344, + "loss": 0.5694, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3628425598144531, + "rewards/margins": -0.6972367763519287, + "rewards/rejected": 2.060079336166382, + "step": 10059 + }, + { + "epoch": 1.63, + "learning_rate": 8.580485633238822e-07, + "logits/chosen": -1.182481288909912, + "logits/rejected": -1.1623390913009644, + "logps/chosen": -110.76654815673828, + "logps/rejected": -51.93264389038086, + "loss": 1.9292, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4338905811309814, + "rewards/margins": 1.1173923015594482, + "rewards/rejected": 2.316498279571533, + "step": 10060 + }, + { + "epoch": 1.63, + "learning_rate": 8.573125252937414e-07, + "logits/chosen": -0.9799601435661316, + "logits/rejected": -1.0405573844909668, + "logps/chosen": -50.99772644042969, + "logps/rejected": -88.22685241699219, + "loss": 0.0271, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.275575160980225, + "rewards/margins": 2.9435629844665527, + "rewards/rejected": 1.3320121765136719, + "step": 10061 + }, + { + "epoch": 1.63, + "learning_rate": 8.565767734841057e-07, + "logits/chosen": -0.7182515263557434, + "logits/rejected": -0.8084880709648132, + "logps/chosen": -42.40228271484375, + "logps/rejected": -67.54560089111328, + "loss": 0.3402, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8221702575683594, + "rewards/margins": 0.073638916015625, + "rewards/rejected": 2.7485313415527344, + "step": 10062 + }, + { + "epoch": 1.63, + "learning_rate": 8.558413079458106e-07, + "logits/chosen": -1.3968950510025024, + "logits/rejected": -1.2416023015975952, + "logps/chosen": -84.2965316772461, + "logps/rejected": -82.6378402709961, + "loss": 0.2326, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.877028942108154, + "rewards/margins": 2.231489896774292, + "rewards/rejected": 3.6455390453338623, + "step": 10063 + }, + { + "epoch": 1.63, + "learning_rate": 8.551061287296675e-07, + "logits/chosen": -0.9015197157859802, + "logits/rejected": -0.9015197157859802, + "logps/chosen": -41.821922302246094, + "logps/rejected": -41.821922302246094, + "loss": 0.3525, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7165745496749878, + "rewards/margins": 0.0, + "rewards/rejected": 1.7165745496749878, + "step": 10064 + }, + { + "epoch": 1.63, + "learning_rate": 8.543712358864726e-07, + "logits/chosen": -1.480512261390686, + "logits/rejected": -1.485971212387085, + "logps/chosen": -164.06719970703125, + "logps/rejected": -94.22720336914062, + "loss": 0.0197, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.512251377105713, + "rewards/margins": 3.5298798084259033, + "rewards/rejected": 1.9823715686798096, + "step": 10065 + }, + { + "epoch": 1.63, + "learning_rate": 8.536366294669979e-07, + "logits/chosen": -1.0009199380874634, + "logits/rejected": -0.9043998122215271, + "logps/chosen": -118.47976684570312, + "logps/rejected": -35.57292938232422, + "loss": 0.3953, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.686496257781982, + "rewards/margins": 2.2793474197387695, + "rewards/rejected": 2.407148838043213, + "step": 10066 + }, + { + "epoch": 1.63, + "learning_rate": 8.529023095220002e-07, + "logits/chosen": -1.5786068439483643, + "logits/rejected": -1.4999531507492065, + "logps/chosen": -51.41246032714844, + "logps/rejected": -31.597293853759766, + "loss": 0.3347, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.735998511314392, + "rewards/margins": 1.125393271446228, + "rewards/rejected": 0.6106052398681641, + "step": 10067 + }, + { + "epoch": 1.63, + "learning_rate": 8.521682761022116e-07, + "logits/chosen": -1.2994098663330078, + "logits/rejected": -1.244083285331726, + "logps/chosen": -86.92461395263672, + "logps/rejected": -61.06170654296875, + "loss": 0.1615, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.062110185623169, + "rewards/margins": 1.0216255187988281, + "rewards/rejected": 1.0404846668243408, + "step": 10068 + }, + { + "epoch": 1.63, + "learning_rate": 8.514345292583487e-07, + "logits/chosen": -1.392541766166687, + "logits/rejected": -1.339341640472412, + "logps/chosen": -98.29812622070312, + "logps/rejected": -15.167946815490723, + "loss": 0.4486, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5217392444610596, + "rewards/margins": 2.1163113117218018, + "rewards/rejected": 0.405428022146225, + "step": 10069 + }, + { + "epoch": 1.63, + "learning_rate": 8.50701069041105e-07, + "logits/chosen": -1.092885136604309, + "logits/rejected": -1.1182806491851807, + "logps/chosen": -90.3935546875, + "logps/rejected": -55.2352294921875, + "loss": 0.2569, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2791221141815186, + "rewards/margins": 0.3997321128845215, + "rewards/rejected": 1.879390001296997, + "step": 10070 + }, + { + "epoch": 1.63, + "learning_rate": 8.49967895501157e-07, + "logits/chosen": -1.253341555595398, + "logits/rejected": -1.095356822013855, + "logps/chosen": -49.18775939941406, + "logps/rejected": -36.97328567504883, + "loss": 1.0666, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6761474609375, + "rewards/margins": -0.793381929397583, + "rewards/rejected": 3.469529390335083, + "step": 10071 + }, + { + "epoch": 1.63, + "learning_rate": 8.492350086891587e-07, + "logits/chosen": -1.0941805839538574, + "logits/rejected": -1.0881074666976929, + "logps/chosen": -15.462911605834961, + "logps/rejected": -3.141045570373535, + "loss": 2.4065, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4338949918746948, + "rewards/margins": 1.088934302330017, + "rewards/rejected": 0.34496065974235535, + "step": 10072 + }, + { + "epoch": 1.63, + "learning_rate": 8.485024086557476e-07, + "logits/chosen": -0.9524809122085571, + "logits/rejected": -0.9647219777107239, + "logps/chosen": -57.0623779296875, + "logps/rejected": -37.31663513183594, + "loss": 0.1167, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6320786476135254, + "rewards/margins": 1.3703300952911377, + "rewards/rejected": 1.2617485523223877, + "step": 10073 + }, + { + "epoch": 1.64, + "learning_rate": 8.477700954515372e-07, + "logits/chosen": -0.8582333326339722, + "logits/rejected": -0.8582333326339722, + "logps/chosen": -15.22552490234375, + "logps/rejected": -15.22552490234375, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7142812609672546, + "rewards/margins": 0.0, + "rewards/rejected": 0.7142812609672546, + "step": 10074 + }, + { + "epoch": 1.64, + "learning_rate": 8.470380691271252e-07, + "logits/chosen": -1.6771023273468018, + "logits/rejected": -1.725628137588501, + "logps/chosen": -100.82032775878906, + "logps/rejected": -120.00540161132812, + "loss": 0.584, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4932861328125, + "rewards/margins": 2.503213405609131, + "rewards/rejected": 0.9900726675987244, + "step": 10075 + }, + { + "epoch": 1.64, + "learning_rate": 8.463063297330859e-07, + "logits/chosen": -0.8801660537719727, + "logits/rejected": -0.8776847124099731, + "logps/chosen": -3.474893808364868, + "logps/rejected": -24.46792984008789, + "loss": 0.7271, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.37242183089256287, + "rewards/margins": -0.43918976187705994, + "rewards/rejected": 0.8116115927696228, + "step": 10076 + }, + { + "epoch": 1.64, + "learning_rate": 8.455748773199778e-07, + "logits/chosen": -1.3446073532104492, + "logits/rejected": -1.0912574529647827, + "logps/chosen": -135.83938598632812, + "logps/rejected": -48.57991027832031, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.682446479797363, + "rewards/margins": 5.049951553344727, + "rewards/rejected": 0.6324947476387024, + "step": 10077 + }, + { + "epoch": 1.64, + "learning_rate": 8.448437119383352e-07, + "logits/chosen": -0.87131667137146, + "logits/rejected": -0.8403470516204834, + "logps/chosen": -21.454673767089844, + "logps/rejected": -3.8779008388519287, + "loss": 0.2179, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7896913886070251, + "rewards/margins": 0.6346578598022461, + "rewards/rejected": 0.15503351390361786, + "step": 10078 + }, + { + "epoch": 1.64, + "learning_rate": 8.441128336386767e-07, + "logits/chosen": -1.0851850509643555, + "logits/rejected": -1.0837762355804443, + "logps/chosen": -1.9826340675354004, + "logps/rejected": -4.059864044189453, + "loss": 0.3506, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3517208993434906, + "rewards/margins": -0.0051978230476379395, + "rewards/rejected": 0.35691872239112854, + "step": 10079 + }, + { + "epoch": 1.64, + "learning_rate": 8.433822424714966e-07, + "logits/chosen": -1.4720786809921265, + "logits/rejected": -1.4720786809921265, + "logps/chosen": -45.45656967163086, + "logps/rejected": -45.45656967163086, + "loss": 0.5088, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8162811994552612, + "rewards/margins": 0.0, + "rewards/rejected": 1.8162811994552612, + "step": 10080 + }, + { + "epoch": 1.64, + "learning_rate": 8.426519384872733e-07, + "logits/chosen": -1.2805726528167725, + "logits/rejected": -1.3359812498092651, + "logps/chosen": -84.48680114746094, + "logps/rejected": -104.00881958007812, + "loss": 2.7706, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7243187427520752, + "rewards/margins": -1.6287941932678223, + "rewards/rejected": 3.3531129360198975, + "step": 10081 + }, + { + "epoch": 1.64, + "learning_rate": 8.419219217364654e-07, + "logits/chosen": -1.4812419414520264, + "logits/rejected": -1.4409934282302856, + "logps/chosen": -91.35330200195312, + "logps/rejected": -56.885536193847656, + "loss": 0.286, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6075668334960938, + "rewards/margins": 0.7189499139785767, + "rewards/rejected": 1.888616919517517, + "step": 10082 + }, + { + "epoch": 1.64, + "learning_rate": 8.411921922695071e-07, + "logits/chosen": -1.2819463014602661, + "logits/rejected": -1.2730592489242554, + "logps/chosen": -120.73824310302734, + "logps/rejected": -98.85716247558594, + "loss": 0.3154, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.030136823654175, + "rewards/margins": 0.39843666553497314, + "rewards/rejected": 1.6317001581192017, + "step": 10083 + }, + { + "epoch": 1.64, + "learning_rate": 8.40462750136819e-07, + "logits/chosen": -1.1599540710449219, + "logits/rejected": -1.1268420219421387, + "logps/chosen": -75.97003936767578, + "logps/rejected": -103.32557678222656, + "loss": 0.6787, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5690438747406006, + "rewards/margins": 1.744814395904541, + "rewards/rejected": 0.8242294192314148, + "step": 10084 + }, + { + "epoch": 1.64, + "learning_rate": 8.397335953887953e-07, + "logits/chosen": -1.4811264276504517, + "logits/rejected": -1.5049936771392822, + "logps/chosen": -91.38996887207031, + "logps/rejected": -93.81981658935547, + "loss": 1.9375, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.945533752441406, + "rewards/margins": -3.7637691497802734, + "rewards/rejected": 8.70930290222168, + "step": 10085 + }, + { + "epoch": 1.64, + "learning_rate": 8.390047280758163e-07, + "logits/chosen": -1.3723063468933105, + "logits/rejected": -1.3708332777023315, + "logps/chosen": -30.118566513061523, + "logps/rejected": -56.20696258544922, + "loss": 1.6439, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.17956805229187, + "rewards/margins": -0.6097135543823242, + "rewards/rejected": 3.7892816066741943, + "step": 10086 + }, + { + "epoch": 1.64, + "learning_rate": 8.382761482482382e-07, + "logits/chosen": -1.1281812191009521, + "logits/rejected": -1.0391610860824585, + "logps/chosen": -52.281227111816406, + "logps/rejected": -53.03617477416992, + "loss": 0.162, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2131233215332031, + "rewards/margins": 1.0483005046844482, + "rewards/rejected": 0.1648227721452713, + "step": 10087 + }, + { + "epoch": 1.64, + "learning_rate": 8.375478559564004e-07, + "logits/chosen": -1.3780547380447388, + "logits/rejected": -1.1716264486312866, + "logps/chosen": -98.19808959960938, + "logps/rejected": -49.34174346923828, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.729470729827881, + "rewards/margins": 4.515353202819824, + "rewards/rejected": 2.2141175270080566, + "step": 10088 + }, + { + "epoch": 1.64, + "learning_rate": 8.368198512506192e-07, + "logits/chosen": -1.0681343078613281, + "logits/rejected": -1.1349900960922241, + "logps/chosen": -43.79207229614258, + "logps/rejected": -236.62747192382812, + "loss": 1.8854, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5155887603759766, + "rewards/margins": -3.74619722366333, + "rewards/rejected": 7.261785984039307, + "step": 10089 + }, + { + "epoch": 1.64, + "learning_rate": 8.360921341811956e-07, + "logits/chosen": -1.0586843490600586, + "logits/rejected": -0.9717673659324646, + "logps/chosen": -128.98037719726562, + "logps/rejected": -45.183834075927734, + "loss": 0.1519, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7140228748321533, + "rewards/margins": 1.942846655845642, + "rewards/rejected": 1.7711762189865112, + "step": 10090 + }, + { + "epoch": 1.64, + "learning_rate": 8.353647047984048e-07, + "logits/chosen": -0.708316445350647, + "logits/rejected": -0.708316445350647, + "logps/chosen": -1.392509937286377, + "logps/rejected": -1.392509937286377, + "loss": 1.1128, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.19809184968471527, + "rewards/margins": 0.0, + "rewards/rejected": 0.19809184968471527, + "step": 10091 + }, + { + "epoch": 1.64, + "learning_rate": 8.346375631525078e-07, + "logits/chosen": -1.418776035308838, + "logits/rejected": -1.006953477859497, + "logps/chosen": -81.89512634277344, + "logps/rejected": -141.44882202148438, + "loss": 0.367, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.942147731781006, + "rewards/margins": 0.05166149139404297, + "rewards/rejected": 4.890486240386963, + "step": 10092 + }, + { + "epoch": 1.64, + "learning_rate": 8.339107092937409e-07, + "logits/chosen": -1.4716620445251465, + "logits/rejected": -1.4566869735717773, + "logps/chosen": -130.521484375, + "logps/rejected": -102.05548095703125, + "loss": 1.4836, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.321311950683594, + "rewards/margins": -2.8625478744506836, + "rewards/rejected": 9.183859825134277, + "step": 10093 + }, + { + "epoch": 1.64, + "learning_rate": 8.331841432723253e-07, + "logits/chosen": -1.1751418113708496, + "logits/rejected": -1.1751418113708496, + "logps/chosen": -70.67507934570312, + "logps/rejected": -70.67507934570312, + "loss": 0.6613, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9620163440704346, + "rewards/margins": 0.0, + "rewards/rejected": 2.9620163440704346, + "step": 10094 + }, + { + "epoch": 1.64, + "learning_rate": 8.324578651384574e-07, + "logits/chosen": -1.3886882066726685, + "logits/rejected": -1.3886882066726685, + "logps/chosen": -45.93613052368164, + "logps/rejected": -45.93613052368164, + "loss": 0.4025, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.025547504425049, + "rewards/margins": 0.0, + "rewards/rejected": 2.025547504425049, + "step": 10095 + }, + { + "epoch": 1.64, + "learning_rate": 8.317318749423192e-07, + "logits/chosen": -1.2082502841949463, + "logits/rejected": -1.2107701301574707, + "logps/chosen": -72.4337387084961, + "logps/rejected": -78.12924194335938, + "loss": 1.1034, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.613162517547607, + "rewards/margins": -0.6199736595153809, + "rewards/rejected": 5.233136177062988, + "step": 10096 + }, + { + "epoch": 1.64, + "learning_rate": 8.310061727340657e-07, + "logits/chosen": -1.393220067024231, + "logits/rejected": -1.3777165412902832, + "logps/chosen": -90.33856201171875, + "logps/rejected": -76.4710693359375, + "loss": 0.227, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0386109352111816, + "rewards/margins": 0.5805435180664062, + "rewards/rejected": 2.4580674171447754, + "step": 10097 + }, + { + "epoch": 1.64, + "learning_rate": 8.302807585638401e-07, + "logits/chosen": -1.6103535890579224, + "logits/rejected": -1.7114309072494507, + "logps/chosen": -71.66415405273438, + "logps/rejected": -187.07168579101562, + "loss": 1.4673, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.593348979949951, + "rewards/margins": -2.879805088043213, + "rewards/rejected": 9.473154067993164, + "step": 10098 + }, + { + "epoch": 1.64, + "learning_rate": 8.295556324817588e-07, + "logits/chosen": -1.1720246076583862, + "logits/rejected": -1.143370509147644, + "logps/chosen": -40.440616607666016, + "logps/rejected": -85.30038452148438, + "loss": 1.388, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.843778610229492, + "rewards/margins": -1.0546131134033203, + "rewards/rejected": 3.8983917236328125, + "step": 10099 + }, + { + "epoch": 1.64, + "learning_rate": 8.288307945379231e-07, + "logits/chosen": -1.4263410568237305, + "logits/rejected": -1.4263410568237305, + "logps/chosen": -30.689212799072266, + "logps/rejected": -30.689212799072266, + "loss": 0.5817, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6341958045959473, + "rewards/margins": 0.0, + "rewards/rejected": 2.6341958045959473, + "step": 10100 + }, + { + "epoch": 1.64, + "learning_rate": 8.281062447824101e-07, + "logits/chosen": -1.3190674781799316, + "logits/rejected": -1.313387155532837, + "logps/chosen": -88.47338104248047, + "logps/rejected": -68.75015258789062, + "loss": 0.2962, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4535179138183594, + "rewards/margins": 0.24669945240020752, + "rewards/rejected": 1.2068184614181519, + "step": 10101 + }, + { + "epoch": 1.64, + "learning_rate": 8.273819832652824e-07, + "logits/chosen": -1.1700663566589355, + "logits/rejected": -1.154329538345337, + "logps/chosen": -81.396240234375, + "logps/rejected": -78.88076782226562, + "loss": 0.8121, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0639100074768066, + "rewards/margins": 2.1589813232421875, + "rewards/rejected": 0.9049286246299744, + "step": 10102 + }, + { + "epoch": 1.64, + "learning_rate": 8.266580100365762e-07, + "logits/chosen": -1.5642670392990112, + "logits/rejected": -1.4609912633895874, + "logps/chosen": -70.92990112304688, + "logps/rejected": -17.03531837463379, + "loss": 0.3457, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7130882143974304, + "rewards/margins": 0.22880896925926208, + "rewards/rejected": 0.48427924513816833, + "step": 10103 + }, + { + "epoch": 1.64, + "learning_rate": 8.259343251463148e-07, + "logits/chosen": -1.4259655475616455, + "logits/rejected": -1.3998003005981445, + "logps/chosen": -35.31082534790039, + "logps/rejected": -15.828216552734375, + "loss": 1.9377, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8631504774093628, + "rewards/margins": 1.067211627960205, + "rewards/rejected": 0.7959389090538025, + "step": 10104 + }, + { + "epoch": 1.64, + "learning_rate": 8.252109286444948e-07, + "logits/chosen": -1.2332724332809448, + "logits/rejected": -1.2729483842849731, + "logps/chosen": -115.44529724121094, + "logps/rejected": -128.2806396484375, + "loss": 0.4202, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.342652797698975, + "rewards/margins": -0.2478165626525879, + "rewards/rejected": 5.5904693603515625, + "step": 10105 + }, + { + "epoch": 1.64, + "learning_rate": 8.244878205810986e-07, + "logits/chosen": -1.4641578197479248, + "logits/rejected": -1.4003971815109253, + "logps/chosen": -156.73020935058594, + "logps/rejected": -93.89749145507812, + "loss": 0.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.894902229309082, + "rewards/margins": 4.027674674987793, + "rewards/rejected": 2.86722731590271, + "step": 10106 + }, + { + "epoch": 1.64, + "learning_rate": 8.237650010060832e-07, + "logits/chosen": -1.3036712408065796, + "logits/rejected": -1.3639214038848877, + "logps/chosen": -58.44161605834961, + "logps/rejected": -72.46028137207031, + "loss": 0.304, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4290928840637207, + "rewards/margins": 0.6899311542510986, + "rewards/rejected": 2.739161729812622, + "step": 10107 + }, + { + "epoch": 1.64, + "learning_rate": 8.230424699693923e-07, + "logits/chosen": -1.2286427021026611, + "logits/rejected": -1.1399283409118652, + "logps/chosen": -90.51829528808594, + "logps/rejected": -57.98369598388672, + "loss": 0.7162, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.505073547363281, + "rewards/margins": 1.6674885749816895, + "rewards/rejected": 4.837584972381592, + "step": 10108 + }, + { + "epoch": 1.64, + "learning_rate": 8.223202275209425e-07, + "logits/chosen": -1.087380051612854, + "logits/rejected": -1.26331627368927, + "logps/chosen": -57.43267822265625, + "logps/rejected": -132.71200561523438, + "loss": 1.0762, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9892349243164062, + "rewards/margins": -1.8551440238952637, + "rewards/rejected": 4.84437894821167, + "step": 10109 + }, + { + "epoch": 1.64, + "learning_rate": 8.215982737106365e-07, + "logits/chosen": -1.186757206916809, + "logits/rejected": -1.186757206916809, + "logps/chosen": -1.2362544536590576, + "logps/rejected": -1.2362544536590576, + "loss": 0.4965, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.22277610003948212, + "rewards/margins": 0.0, + "rewards/rejected": 0.22277610003948212, + "step": 10110 + }, + { + "epoch": 1.64, + "learning_rate": 8.208766085883524e-07, + "logits/chosen": -1.1760084629058838, + "logits/rejected": -1.1760084629058838, + "logps/chosen": -24.615703582763672, + "logps/rejected": -24.615703582763672, + "loss": 0.7662, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4664103984832764, + "rewards/margins": 0.0, + "rewards/rejected": 2.4664103984832764, + "step": 10111 + }, + { + "epoch": 1.64, + "learning_rate": 8.201552322039524e-07, + "logits/chosen": -1.5884525775909424, + "logits/rejected": -1.6027629375457764, + "logps/chosen": -117.27249908447266, + "logps/rejected": -124.54060363769531, + "loss": 0.454, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0257012844085693, + "rewards/margins": -0.2721061706542969, + "rewards/rejected": 2.297807455062866, + "step": 10112 + }, + { + "epoch": 1.64, + "learning_rate": 8.194341446072746e-07, + "logits/chosen": -1.1373406648635864, + "logits/rejected": -1.0427632331848145, + "logps/chosen": -61.118839263916016, + "logps/rejected": -27.793087005615234, + "loss": 0.068, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.923604965209961, + "rewards/margins": 1.9880199432373047, + "rewards/rejected": 0.9355850219726562, + "step": 10113 + }, + { + "epoch": 1.64, + "learning_rate": 8.187133458481416e-07, + "logits/chosen": -1.779535174369812, + "logits/rejected": -1.8481625318527222, + "logps/chosen": -182.3848876953125, + "logps/rejected": -201.205810546875, + "loss": 1.2052, + "rewards/accuracies": 0.0, + "rewards/chosen": 8.424285888671875, + "rewards/margins": -0.8490362167358398, + "rewards/rejected": 9.273322105407715, + "step": 10114 + }, + { + "epoch": 1.64, + "learning_rate": 8.179928359763517e-07, + "logits/chosen": -1.245120882987976, + "logits/rejected": -1.3122625350952148, + "logps/chosen": -176.28567504882812, + "logps/rejected": -86.66472625732422, + "loss": 0.0676, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.056338787078857, + "rewards/margins": 2.24617862701416, + "rewards/rejected": 3.8101601600646973, + "step": 10115 + }, + { + "epoch": 1.64, + "learning_rate": 8.172726150416876e-07, + "logits/chosen": -1.525571584701538, + "logits/rejected": -1.4548778533935547, + "logps/chosen": -47.9218635559082, + "logps/rejected": -14.954079627990723, + "loss": 0.1296, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.174316883087158, + "rewards/margins": 2.212272882461548, + "rewards/rejected": 0.9620440602302551, + "step": 10116 + }, + { + "epoch": 1.64, + "learning_rate": 8.165526830939069e-07, + "logits/chosen": -1.2726956605911255, + "logits/rejected": -1.2272924184799194, + "logps/chosen": -115.658447265625, + "logps/rejected": -56.51736068725586, + "loss": 0.2249, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5201995372772217, + "rewards/margins": 0.5991442203521729, + "rewards/rejected": 2.921055316925049, + "step": 10117 + }, + { + "epoch": 1.64, + "learning_rate": 8.158330401827524e-07, + "logits/chosen": -1.3847371339797974, + "logits/rejected": -1.3893249034881592, + "logps/chosen": -9.642877578735352, + "logps/rejected": -1.2282482385635376, + "loss": 0.3548, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.17865896224975586, + "rewards/margins": -0.01651427149772644, + "rewards/rejected": 0.1951732337474823, + "step": 10118 + }, + { + "epoch": 1.64, + "learning_rate": 8.151136863579445e-07, + "logits/chosen": -1.5673061609268188, + "logits/rejected": -1.5958746671676636, + "logps/chosen": -77.52595520019531, + "logps/rejected": -19.408607482910156, + "loss": 0.4083, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.123003363609314, + "rewards/margins": 0.46760880947113037, + "rewards/rejected": 0.6553945541381836, + "step": 10119 + }, + { + "epoch": 1.64, + "learning_rate": 8.143946216691817e-07, + "logits/chosen": -1.027368426322937, + "logits/rejected": -0.9774701595306396, + "logps/chosen": -50.20079040527344, + "logps/rejected": -12.790700912475586, + "loss": 0.2148, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.970458984375, + "rewards/margins": 1.1518834829330444, + "rewards/rejected": 0.8185755014419556, + "step": 10120 + }, + { + "epoch": 1.64, + "learning_rate": 8.136758461661476e-07, + "logits/chosen": -1.0385780334472656, + "logits/rejected": -1.096340537071228, + "logps/chosen": -53.18305206298828, + "logps/rejected": -53.94080352783203, + "loss": 0.9436, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7700531482696533, + "rewards/margins": -1.3749992847442627, + "rewards/rejected": 3.145052433013916, + "step": 10121 + }, + { + "epoch": 1.64, + "learning_rate": 8.129573598984996e-07, + "logits/chosen": -1.4649637937545776, + "logits/rejected": -1.1528903245925903, + "logps/chosen": -138.4064178466797, + "logps/rejected": -31.86231231689453, + "loss": 0.2237, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.153018474578857, + "rewards/margins": 3.4996187686920166, + "rewards/rejected": 3.653399705886841, + "step": 10122 + }, + { + "epoch": 1.64, + "learning_rate": 8.122391629158816e-07, + "logits/chosen": -1.4032785892486572, + "logits/rejected": -1.4109735488891602, + "logps/chosen": -133.38658142089844, + "logps/rejected": -116.65640258789062, + "loss": 0.1271, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.290589809417725, + "rewards/margins": 1.315659999847412, + "rewards/rejected": 4.9749298095703125, + "step": 10123 + }, + { + "epoch": 1.64, + "learning_rate": 8.115212552679108e-07, + "logits/chosen": -1.4317843914031982, + "logits/rejected": -1.3838467597961426, + "logps/chosen": -71.69869995117188, + "logps/rejected": -47.72636032104492, + "loss": 0.1601, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.32102370262146, + "rewards/margins": 1.9721875190734863, + "rewards/rejected": 0.34883615374565125, + "step": 10124 + }, + { + "epoch": 1.64, + "learning_rate": 8.108036370041911e-07, + "logits/chosen": -1.6842843294143677, + "logits/rejected": -1.552160382270813, + "logps/chosen": -260.38525390625, + "logps/rejected": -33.99408721923828, + "loss": 0.236, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.147741794586182, + "rewards/margins": 5.041783809661865, + "rewards/rejected": 0.10595779865980148, + "step": 10125 + }, + { + "epoch": 1.64, + "learning_rate": 8.100863081742999e-07, + "logits/chosen": -1.1937607526779175, + "logits/rejected": -1.1189817190170288, + "logps/chosen": -38.04414749145508, + "logps/rejected": -26.8185977935791, + "loss": 0.4575, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7509639263153076, + "rewards/margins": 1.0627778768539429, + "rewards/rejected": 1.6881860494613647, + "step": 10126 + }, + { + "epoch": 1.64, + "learning_rate": 8.09369268827801e-07, + "logits/chosen": -0.8443304896354675, + "logits/rejected": -0.891009509563446, + "logps/chosen": -69.20262145996094, + "logps/rejected": -56.48792266845703, + "loss": 0.172, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8635146617889404, + "rewards/margins": 0.8907432556152344, + "rewards/rejected": 2.972771406173706, + "step": 10127 + }, + { + "epoch": 1.64, + "learning_rate": 8.086525190142325e-07, + "logits/chosen": -0.9119378328323364, + "logits/rejected": -0.9999175667762756, + "logps/chosen": -64.10469055175781, + "logps/rejected": -127.82018280029297, + "loss": 0.4389, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3693368434906006, + "rewards/margins": -0.3186018466949463, + "rewards/rejected": 2.687938690185547, + "step": 10128 + }, + { + "epoch": 1.64, + "learning_rate": 8.07936058783117e-07, + "logits/chosen": -1.1783909797668457, + "logits/rejected": -1.2450776100158691, + "logps/chosen": -45.43000030517578, + "logps/rejected": -107.63356018066406, + "loss": 1.8641, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0466866493225098, + "rewards/margins": 0.25230491161346436, + "rewards/rejected": 1.7943817377090454, + "step": 10129 + }, + { + "epoch": 1.64, + "learning_rate": 8.072198881839527e-07, + "logits/chosen": -1.4842562675476074, + "logits/rejected": -1.3283156156539917, + "logps/chosen": -104.01432037353516, + "logps/rejected": -76.84822082519531, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 11.597771644592285, + "rewards/margins": 8.207143783569336, + "rewards/rejected": 3.3906280994415283, + "step": 10130 + }, + { + "epoch": 1.64, + "learning_rate": 8.065040072662228e-07, + "logits/chosen": -1.280642032623291, + "logits/rejected": -1.3334741592407227, + "logps/chosen": -90.43705749511719, + "logps/rejected": -91.15678405761719, + "loss": 0.6647, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.816519260406494, + "rewards/margins": -1.0077180862426758, + "rewards/rejected": 4.82423734664917, + "step": 10131 + }, + { + "epoch": 1.64, + "learning_rate": 8.05788416079386e-07, + "logits/chosen": -1.358981966972351, + "logits/rejected": -1.4071165323257446, + "logps/chosen": -260.94256591796875, + "logps/rejected": -95.78246307373047, + "loss": 0.805, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.186642646789551, + "rewards/margins": -1.2521581649780273, + "rewards/rejected": 5.438800811767578, + "step": 10132 + }, + { + "epoch": 1.64, + "learning_rate": 8.050731146728846e-07, + "logits/chosen": -1.080557942390442, + "logits/rejected": -1.1440321207046509, + "logps/chosen": -91.54335021972656, + "logps/rejected": -83.81773376464844, + "loss": 0.281, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8838798999786377, + "rewards/margins": 0.6955018043518066, + "rewards/rejected": 2.188378095626831, + "step": 10133 + }, + { + "epoch": 1.64, + "learning_rate": 8.043581030961372e-07, + "logits/chosen": -1.4754717350006104, + "logits/rejected": -1.4618892669677734, + "logps/chosen": -50.65327072143555, + "logps/rejected": -67.6803970336914, + "loss": 0.1261, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6381008625030518, + "rewards/margins": 1.5050640106201172, + "rewards/rejected": 0.133036807179451, + "step": 10134 + }, + { + "epoch": 1.65, + "learning_rate": 8.036433813985478e-07, + "logits/chosen": -1.0695449113845825, + "logits/rejected": -0.8924832344055176, + "logps/chosen": -51.848182678222656, + "logps/rejected": -58.44212341308594, + "loss": 0.0752, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.89138126373291, + "rewards/margins": 2.6296563148498535, + "rewards/rejected": 3.2617249488830566, + "step": 10135 + }, + { + "epoch": 1.65, + "learning_rate": 8.029289496294918e-07, + "logits/chosen": -1.3116788864135742, + "logits/rejected": -1.306043267250061, + "logps/chosen": -169.14474487304688, + "logps/rejected": -80.06678771972656, + "loss": 0.1185, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.933972358703613, + "rewards/margins": 1.5344195365905762, + "rewards/rejected": 5.399552822113037, + "step": 10136 + }, + { + "epoch": 1.65, + "learning_rate": 8.022148078383324e-07, + "logits/chosen": -1.322162389755249, + "logits/rejected": -0.9581689238548279, + "logps/chosen": -147.70993041992188, + "logps/rejected": -49.26284408569336, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.169851779937744, + "rewards/margins": 4.4114179611206055, + "rewards/rejected": 1.7584339380264282, + "step": 10137 + }, + { + "epoch": 1.65, + "learning_rate": 8.015009560744114e-07, + "logits/chosen": -1.1922094821929932, + "logits/rejected": -1.1879802942276, + "logps/chosen": -3.3229804039001465, + "logps/rejected": -3.017582416534424, + "loss": 0.8233, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2593083381652832, + "rewards/margins": -0.23953089118003845, + "rewards/rejected": 0.49883922934532166, + "step": 10138 + }, + { + "epoch": 1.65, + "learning_rate": 8.007873943870464e-07, + "logits/chosen": -1.5321892499923706, + "logits/rejected": -1.5321892499923706, + "logps/chosen": -51.9010124206543, + "logps/rejected": -51.9010124206543, + "loss": 0.3544, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.282641410827637, + "rewards/margins": 0.0, + "rewards/rejected": 5.282641410827637, + "step": 10139 + }, + { + "epoch": 1.65, + "learning_rate": 8.000741228255398e-07, + "logits/chosen": -1.058686375617981, + "logits/rejected": -1.0632965564727783, + "logps/chosen": -14.096445083618164, + "logps/rejected": -17.815256118774414, + "loss": 0.6012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5645508170127869, + "rewards/margins": 0.05294513702392578, + "rewards/rejected": 0.5116056799888611, + "step": 10140 + }, + { + "epoch": 1.65, + "learning_rate": 7.993611414391706e-07, + "logits/chosen": -1.3714464902877808, + "logits/rejected": -1.3666046857833862, + "logps/chosen": -39.130882263183594, + "logps/rejected": -29.58786392211914, + "loss": 0.4491, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5160068869590759, + "rewards/margins": -0.11959606409072876, + "rewards/rejected": 0.6356029510498047, + "step": 10141 + }, + { + "epoch": 1.65, + "learning_rate": 7.986484502772012e-07, + "logits/chosen": -1.482559323310852, + "logits/rejected": -1.4569480419158936, + "logps/chosen": -91.8294677734375, + "logps/rejected": -84.49114990234375, + "loss": 1.0063, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2418465614318848, + "rewards/margins": -0.11420822143554688, + "rewards/rejected": 2.3560547828674316, + "step": 10142 + }, + { + "epoch": 1.65, + "learning_rate": 7.979360493888688e-07, + "logits/chosen": -1.5390561819076538, + "logits/rejected": -1.5566611289978027, + "logps/chosen": -49.04900360107422, + "logps/rejected": -67.80422973632812, + "loss": 1.8397, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7255340814590454, + "rewards/margins": -3.6407570838928223, + "rewards/rejected": 5.366291046142578, + "step": 10143 + }, + { + "epoch": 1.65, + "learning_rate": 7.972239388233959e-07, + "logits/chosen": -1.0255587100982666, + "logits/rejected": -1.1116573810577393, + "logps/chosen": -24.55689239501953, + "logps/rejected": -69.71751403808594, + "loss": 0.9446, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.49152299761772156, + "rewards/margins": -1.7200584411621094, + "rewards/rejected": 2.2115814685821533, + "step": 10144 + }, + { + "epoch": 1.65, + "learning_rate": 7.965121186299812e-07, + "logits/chosen": -1.1882882118225098, + "logits/rejected": -1.053928256034851, + "logps/chosen": -87.62027740478516, + "logps/rejected": -103.01631927490234, + "loss": 0.3079, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.932366371154785, + "rewards/margins": 2.8743059635162354, + "rewards/rejected": 3.05806040763855, + "step": 10145 + }, + { + "epoch": 1.65, + "learning_rate": 7.958005888578063e-07, + "logits/chosen": -1.1986935138702393, + "logits/rejected": -1.1396244764328003, + "logps/chosen": -67.32608795166016, + "logps/rejected": -53.43738555908203, + "loss": 0.3116, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5433220863342285, + "rewards/margins": 0.16780924797058105, + "rewards/rejected": 2.3755128383636475, + "step": 10146 + }, + { + "epoch": 1.65, + "learning_rate": 7.950893495560291e-07, + "logits/chosen": -1.6479694843292236, + "logits/rejected": -1.6787042617797852, + "logps/chosen": -67.19901275634766, + "logps/rejected": -62.698463439941406, + "loss": 1.6431, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.384148597717285, + "rewards/margins": -2.5483765602111816, + "rewards/rejected": 7.932525157928467, + "step": 10147 + }, + { + "epoch": 1.65, + "learning_rate": 7.943784007737915e-07, + "logits/chosen": -1.2483470439910889, + "logits/rejected": -0.8588372468948364, + "logps/chosen": -75.12054443359375, + "logps/rejected": -26.728702545166016, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.872109889984131, + "rewards/margins": 5.6935858726501465, + "rewards/rejected": 0.17852401733398438, + "step": 10148 + }, + { + "epoch": 1.65, + "learning_rate": 7.936677425602113e-07, + "logits/chosen": -1.2353129386901855, + "logits/rejected": -1.2359530925750732, + "logps/chosen": -17.411272048950195, + "logps/rejected": -20.366924285888672, + "loss": 0.4972, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.472455233335495, + "rewards/margins": 0.2699875235557556, + "rewards/rejected": 0.20246772468090057, + "step": 10149 + }, + { + "epoch": 1.65, + "learning_rate": 7.92957374964391e-07, + "logits/chosen": -1.2723931074142456, + "logits/rejected": -1.3766714334487915, + "logps/chosen": -30.11971092224121, + "logps/rejected": -89.4490737915039, + "loss": 1.2894, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.033571481704712, + "rewards/margins": -2.4424898624420166, + "rewards/rejected": 4.4760613441467285, + "step": 10150 + }, + { + "epoch": 1.65, + "learning_rate": 7.922472980354073e-07, + "logits/chosen": -0.8250362277030945, + "logits/rejected": -0.8250362277030945, + "logps/chosen": -52.432952880859375, + "logps/rejected": -52.432952880859375, + "loss": 0.4102, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.646039605140686, + "rewards/margins": 0.0, + "rewards/rejected": 1.646039605140686, + "step": 10151 + }, + { + "epoch": 1.65, + "learning_rate": 7.915375118223223e-07, + "logits/chosen": -1.393015742301941, + "logits/rejected": -1.3552225828170776, + "logps/chosen": -97.32745361328125, + "logps/rejected": -100.76181030273438, + "loss": 0.0885, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.869189739227295, + "rewards/margins": 1.6834750175476074, + "rewards/rejected": 4.1857147216796875, + "step": 10152 + }, + { + "epoch": 1.65, + "learning_rate": 7.908280163741733e-07, + "logits/chosen": -1.0489543676376343, + "logits/rejected": -1.21977698802948, + "logps/chosen": -42.10577392578125, + "logps/rejected": -99.3251953125, + "loss": 1.8515, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8429641723632812, + "rewards/margins": -3.6545825004577637, + "rewards/rejected": 6.497546672821045, + "step": 10153 + }, + { + "epoch": 1.65, + "learning_rate": 7.901188117399817e-07, + "logits/chosen": -0.9154565334320068, + "logits/rejected": -0.9154565334320068, + "logps/chosen": -2.9593911170959473, + "logps/rejected": -2.9593911170959473, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.17067952454090118, + "rewards/margins": 0.0, + "rewards/rejected": 0.17067952454090118, + "step": 10154 + }, + { + "epoch": 1.65, + "learning_rate": 7.894098979687448e-07, + "logits/chosen": -1.527759075164795, + "logits/rejected": -1.6451411247253418, + "logps/chosen": -272.8999938964844, + "logps/rejected": -97.53811645507812, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.417001724243164, + "rewards/margins": 5.061679363250732, + "rewards/rejected": 4.355322360992432, + "step": 10155 + }, + { + "epoch": 1.65, + "learning_rate": 7.887012751094447e-07, + "logits/chosen": -1.4311367273330688, + "logits/rejected": -1.3856841325759888, + "logps/chosen": -38.852272033691406, + "logps/rejected": -62.69014358520508, + "loss": 1.7413, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.615825653076172, + "rewards/margins": -1.2834110260009766, + "rewards/rejected": 3.8992366790771484, + "step": 10156 + }, + { + "epoch": 1.65, + "learning_rate": 7.879929432110372e-07, + "logits/chosen": -1.3171961307525635, + "logits/rejected": -1.3002934455871582, + "logps/chosen": -60.836708068847656, + "logps/rejected": -78.10487365722656, + "loss": 0.3242, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2197022438049316, + "rewards/margins": 0.14220738410949707, + "rewards/rejected": 2.0774948596954346, + "step": 10157 + }, + { + "epoch": 1.65, + "learning_rate": 7.872849023224638e-07, + "logits/chosen": -1.3909872770309448, + "logits/rejected": -1.328683853149414, + "logps/chosen": -55.68488693237305, + "logps/rejected": -8.79955768585205, + "loss": 0.1578, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8390445709228516, + "rewards/margins": 1.145033597946167, + "rewards/rejected": 1.6940109729766846, + "step": 10158 + }, + { + "epoch": 1.65, + "learning_rate": 7.865771524926419e-07, + "logits/chosen": -1.3513355255126953, + "logits/rejected": -1.3217893838882446, + "logps/chosen": -68.3115234375, + "logps/rejected": -72.56319427490234, + "loss": 0.3414, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.553847789764404, + "rewards/margins": 2.2805063724517822, + "rewards/rejected": 2.273341417312622, + "step": 10159 + }, + { + "epoch": 1.65, + "learning_rate": 7.858696937704724e-07, + "logits/chosen": -1.338716983795166, + "logits/rejected": -1.2308605909347534, + "logps/chosen": -48.076744079589844, + "logps/rejected": -28.533246994018555, + "loss": 0.3392, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.10910177230835, + "rewards/margins": 2.2219290733337402, + "rewards/rejected": 1.8871725797653198, + "step": 10160 + }, + { + "epoch": 1.65, + "learning_rate": 7.851625262048307e-07, + "logits/chosen": -1.3434876203536987, + "logits/rejected": -1.1565797328948975, + "logps/chosen": -106.10433959960938, + "logps/rejected": -37.46841049194336, + "loss": 0.0568, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.898199558258057, + "rewards/margins": 4.479946136474609, + "rewards/rejected": 2.4182536602020264, + "step": 10161 + }, + { + "epoch": 1.65, + "learning_rate": 7.844556498445788e-07, + "logits/chosen": -1.5921752452850342, + "logits/rejected": -1.4091479778289795, + "logps/chosen": -90.12577056884766, + "logps/rejected": -23.589265823364258, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.6352763175964355, + "rewards/margins": 5.3325605392456055, + "rewards/rejected": 0.30271586775779724, + "step": 10162 + }, + { + "epoch": 1.65, + "learning_rate": 7.837490647385526e-07, + "logits/chosen": -1.0076245069503784, + "logits/rejected": -0.9702088832855225, + "logps/chosen": -143.8619842529297, + "logps/rejected": -56.0779914855957, + "loss": 0.149, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.422163486480713, + "rewards/margins": 1.0885807275772095, + "rewards/rejected": 1.3335827589035034, + "step": 10163 + }, + { + "epoch": 1.65, + "learning_rate": 7.830427709355726e-07, + "logits/chosen": -1.109017014503479, + "logits/rejected": -1.090320110321045, + "logps/chosen": -102.69752502441406, + "logps/rejected": -120.71028137207031, + "loss": 0.7428, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3683884143829346, + "rewards/margins": -0.41225576400756836, + "rewards/rejected": 2.780644178390503, + "step": 10164 + }, + { + "epoch": 1.65, + "learning_rate": 7.823367684844346e-07, + "logits/chosen": -0.9855738878250122, + "logits/rejected": -0.9855738878250122, + "logps/chosen": -47.50988006591797, + "logps/rejected": -47.50988006591797, + "loss": 1.0044, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.116875410079956, + "rewards/margins": 0.0, + "rewards/rejected": 2.116875410079956, + "step": 10165 + }, + { + "epoch": 1.65, + "learning_rate": 7.816310574339192e-07, + "logits/chosen": -1.2654480934143066, + "logits/rejected": -1.0551221370697021, + "logps/chosen": -66.80236053466797, + "logps/rejected": -28.66078758239746, + "loss": 0.0303, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.262493848800659, + "rewards/margins": 3.2482030391693115, + "rewards/rejected": -0.9857091903686523, + "step": 10166 + }, + { + "epoch": 1.65, + "learning_rate": 7.809256378327818e-07, + "logits/chosen": -0.7598455548286438, + "logits/rejected": -0.7346033453941345, + "logps/chosen": -17.771228790283203, + "logps/rejected": -1.6296985149383545, + "loss": 0.5536, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8510137796401978, + "rewards/margins": 0.5728694796562195, + "rewards/rejected": 0.27814429998397827, + "step": 10167 + }, + { + "epoch": 1.65, + "learning_rate": 7.802205097297633e-07, + "logits/chosen": -1.366338849067688, + "logits/rejected": -1.256935954093933, + "logps/chosen": -57.73442459106445, + "logps/rejected": -35.73939514160156, + "loss": 0.2037, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.944016695022583, + "rewards/margins": 0.9497174024581909, + "rewards/rejected": 1.994299292564392, + "step": 10168 + }, + { + "epoch": 1.65, + "learning_rate": 7.795156731735781e-07, + "logits/chosen": -1.4722399711608887, + "logits/rejected": -1.3011348247528076, + "logps/chosen": -168.9957733154297, + "logps/rejected": -68.26303100585938, + "loss": 0.0555, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.81069803237915, + "rewards/margins": 2.482445478439331, + "rewards/rejected": 3.3282525539398193, + "step": 10169 + }, + { + "epoch": 1.65, + "learning_rate": 7.788111282129263e-07, + "logits/chosen": -1.2106484174728394, + "logits/rejected": -1.3065987825393677, + "logps/chosen": -67.50067901611328, + "logps/rejected": -118.2325439453125, + "loss": 0.9178, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.4431328773498535, + "rewards/margins": -1.5249762535095215, + "rewards/rejected": 7.968109130859375, + "step": 10170 + }, + { + "epoch": 1.65, + "learning_rate": 7.781068748964832e-07, + "logits/chosen": -1.5721861124038696, + "logits/rejected": -1.604072093963623, + "logps/chosen": -65.86264038085938, + "logps/rejected": -186.438720703125, + "loss": 2.9645, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4980697631835938, + "rewards/margins": -5.8696489334106445, + "rewards/rejected": 9.367718696594238, + "step": 10171 + }, + { + "epoch": 1.65, + "learning_rate": 7.774029132729089e-07, + "logits/chosen": -1.432607650756836, + "logits/rejected": -1.4238839149475098, + "logps/chosen": -52.635093688964844, + "logps/rejected": -49.00922775268555, + "loss": 0.7897, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4324729442596436, + "rewards/margins": -1.2963268756866455, + "rewards/rejected": 3.728799819946289, + "step": 10172 + }, + { + "epoch": 1.65, + "learning_rate": 7.766992433908366e-07, + "logits/chosen": -1.5938076972961426, + "logits/rejected": -1.57262122631073, + "logps/chosen": -123.30125427246094, + "logps/rejected": -126.52253723144531, + "loss": 0.2331, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.576136589050293, + "rewards/margins": 2.627614974975586, + "rewards/rejected": 6.948521614074707, + "step": 10173 + }, + { + "epoch": 1.65, + "learning_rate": 7.759958652988858e-07, + "logits/chosen": -1.4969134330749512, + "logits/rejected": -1.3909493684768677, + "logps/chosen": -170.35494995117188, + "logps/rejected": -72.37187194824219, + "loss": 0.1362, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.1916704177856445, + "rewards/margins": 1.1696815490722656, + "rewards/rejected": 4.021988868713379, + "step": 10174 + }, + { + "epoch": 1.65, + "learning_rate": 7.752927790456544e-07, + "logits/chosen": -0.971773624420166, + "logits/rejected": -1.0135105848312378, + "logps/chosen": -34.341552734375, + "logps/rejected": -100.9033203125, + "loss": 0.5891, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2072842121124268, + "rewards/margins": 0.30230069160461426, + "rewards/rejected": 1.9049835205078125, + "step": 10175 + }, + { + "epoch": 1.65, + "learning_rate": 7.745899846797156e-07, + "logits/chosen": -1.1497023105621338, + "logits/rejected": -1.1497023105621338, + "logps/chosen": -77.94895935058594, + "logps/rejected": -77.94895935058594, + "loss": 0.3774, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6587975025177, + "rewards/margins": 0.0, + "rewards/rejected": 3.6587975025177, + "step": 10176 + }, + { + "epoch": 1.65, + "learning_rate": 7.73887482249629e-07, + "logits/chosen": -0.8866740465164185, + "logits/rejected": -0.8855874538421631, + "logps/chosen": -2.7638771533966064, + "logps/rejected": -1.2990357875823975, + "loss": 0.72, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3120885193347931, + "rewards/margins": -0.10475805401802063, + "rewards/rejected": 0.4168465733528137, + "step": 10177 + }, + { + "epoch": 1.65, + "learning_rate": 7.731852718039284e-07, + "logits/chosen": -0.887269139289856, + "logits/rejected": -0.9106311202049255, + "logps/chosen": -15.246804237365723, + "logps/rejected": -29.61603546142578, + "loss": 0.4711, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6810328364372253, + "rewards/margins": -0.4422028660774231, + "rewards/rejected": 1.1232357025146484, + "step": 10178 + }, + { + "epoch": 1.65, + "learning_rate": 7.72483353391132e-07, + "logits/chosen": -1.0818842649459839, + "logits/rejected": -0.9247598648071289, + "logps/chosen": -22.074796676635742, + "logps/rejected": -36.49726486206055, + "loss": 0.9315, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.449535608291626, + "rewards/margins": 1.0884653329849243, + "rewards/rejected": 0.3610702455043793, + "step": 10179 + }, + { + "epoch": 1.65, + "learning_rate": 7.717817270597339e-07, + "logits/chosen": -1.165860652923584, + "logits/rejected": -1.0881376266479492, + "logps/chosen": -41.66696548461914, + "logps/rejected": -63.99004364013672, + "loss": 0.3406, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4920413494110107, + "rewards/margins": 0.08563804626464844, + "rewards/rejected": 2.4064033031463623, + "step": 10180 + }, + { + "epoch": 1.65, + "learning_rate": 7.710803928582117e-07, + "logits/chosen": -1.4247663021087646, + "logits/rejected": -1.5323147773742676, + "logps/chosen": -164.50509643554688, + "logps/rejected": -36.109230041503906, + "loss": 0.0192, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.508291721343994, + "rewards/margins": 3.3418500423431396, + "rewards/rejected": 0.16644172370433807, + "step": 10181 + }, + { + "epoch": 1.65, + "learning_rate": 7.703793508350188e-07, + "logits/chosen": -1.0076587200164795, + "logits/rejected": -1.032367467880249, + "logps/chosen": -73.58140563964844, + "logps/rejected": -50.422889709472656, + "loss": 1.8931, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.477184295654297, + "rewards/margins": 1.2335773706436157, + "rewards/rejected": 1.2436069250106812, + "step": 10182 + }, + { + "epoch": 1.65, + "learning_rate": 7.696786010385932e-07, + "logits/chosen": -1.2807875871658325, + "logits/rejected": -1.1200215816497803, + "logps/chosen": -119.92891693115234, + "logps/rejected": -72.03360748291016, + "loss": 0.3039, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.533022403717041, + "rewards/margins": 0.6427421569824219, + "rewards/rejected": 4.890280246734619, + "step": 10183 + }, + { + "epoch": 1.65, + "learning_rate": 7.689781435173465e-07, + "logits/chosen": -1.4973024129867554, + "logits/rejected": -1.6373838186264038, + "logps/chosen": -108.76657104492188, + "logps/rejected": -101.16545104980469, + "loss": 0.7522, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.540360927581787, + "rewards/margins": -1.2520966529846191, + "rewards/rejected": 8.792457580566406, + "step": 10184 + }, + { + "epoch": 1.65, + "learning_rate": 7.682779783196781e-07, + "logits/chosen": -0.8896443247795105, + "logits/rejected": -0.8874419927597046, + "logps/chosen": -5.463980197906494, + "logps/rejected": -1.1208637952804565, + "loss": 0.3894, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.039263248443603516, + "rewards/margins": -0.14978839457035065, + "rewards/rejected": 0.11052514612674713, + "step": 10185 + }, + { + "epoch": 1.65, + "learning_rate": 7.675781054939585e-07, + "logits/chosen": -0.8901335597038269, + "logits/rejected": -0.958728551864624, + "logps/chosen": -55.02869415283203, + "logps/rejected": -72.10795593261719, + "loss": 0.777, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6142547130584717, + "rewards/margins": -1.310861349105835, + "rewards/rejected": 3.9251160621643066, + "step": 10186 + }, + { + "epoch": 1.65, + "learning_rate": 7.668785250885458e-07, + "logits/chosen": -1.3639838695526123, + "logits/rejected": -1.3317009210586548, + "logps/chosen": -192.24411010742188, + "logps/rejected": -266.15631103515625, + "loss": 0.447, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1481568813323975, + "rewards/margins": 0.16380620002746582, + "rewards/rejected": 2.9843506813049316, + "step": 10187 + }, + { + "epoch": 1.65, + "learning_rate": 7.661792371517718e-07, + "logits/chosen": -1.3365752696990967, + "logits/rejected": -1.4332501888275146, + "logps/chosen": -67.60678100585938, + "logps/rejected": -127.01544189453125, + "loss": 2.8339, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5545265674591064, + "rewards/margins": -4.490374565124512, + "rewards/rejected": 8.044900894165039, + "step": 10188 + }, + { + "epoch": 1.65, + "learning_rate": 7.654802417319523e-07, + "logits/chosen": -1.359929084777832, + "logits/rejected": -1.3764792680740356, + "logps/chosen": -66.77000427246094, + "logps/rejected": -102.69618225097656, + "loss": 0.5857, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0708420276641846, + "rewards/margins": 2.3808236122131348, + "rewards/rejected": 0.6900184750556946, + "step": 10189 + }, + { + "epoch": 1.65, + "learning_rate": 7.647815388773799e-07, + "logits/chosen": -1.250786304473877, + "logits/rejected": -1.2639507055282593, + "logps/chosen": -3.7175891399383545, + "logps/rejected": -13.077189445495605, + "loss": 0.4999, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3878437876701355, + "rewards/margins": 0.1683860570192337, + "rewards/rejected": 0.2194577306509018, + "step": 10190 + }, + { + "epoch": 1.65, + "learning_rate": 7.640831286363303e-07, + "logits/chosen": -1.1603686809539795, + "logits/rejected": -1.1610063314437866, + "logps/chosen": -4.840976715087891, + "logps/rejected": -2.476163864135742, + "loss": 0.8093, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6520994305610657, + "rewards/margins": 0.048546433448791504, + "rewards/rejected": 0.6035529971122742, + "step": 10191 + }, + { + "epoch": 1.65, + "learning_rate": 7.633850110570551e-07, + "logits/chosen": -1.353485107421875, + "logits/rejected": -1.3153961896896362, + "logps/chosen": -79.43229675292969, + "logps/rejected": -42.52582931518555, + "loss": 0.1924, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.445513963699341, + "rewards/margins": 0.857878565788269, + "rewards/rejected": 1.5876353979110718, + "step": 10192 + }, + { + "epoch": 1.65, + "learning_rate": 7.626871861877888e-07, + "logits/chosen": -1.509867548942566, + "logits/rejected": -1.486870527267456, + "logps/chosen": -88.04016876220703, + "logps/rejected": -87.12419128417969, + "loss": 1.0298, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.544104814529419, + "rewards/margins": -1.920111894607544, + "rewards/rejected": 4.464216709136963, + "step": 10193 + }, + { + "epoch": 1.65, + "learning_rate": 7.619896540767435e-07, + "logits/chosen": -1.0864218473434448, + "logits/rejected": -1.0927510261535645, + "logps/chosen": -48.228126525878906, + "logps/rejected": -65.47587585449219, + "loss": 0.5337, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.747399926185608, + "rewards/margins": -0.2470306158065796, + "rewards/rejected": 1.9944305419921875, + "step": 10194 + }, + { + "epoch": 1.65, + "learning_rate": 7.61292414772114e-07, + "logits/chosen": -0.826149046421051, + "logits/rejected": -0.8283572793006897, + "logps/chosen": -5.387248992919922, + "logps/rejected": -4.896026611328125, + "loss": 0.6743, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41502293944358826, + "rewards/margins": 0.11542114615440369, + "rewards/rejected": 0.29960179328918457, + "step": 10195 + }, + { + "epoch": 1.65, + "learning_rate": 7.605954683220701e-07, + "logits/chosen": -1.291378378868103, + "logits/rejected": -1.1937426328659058, + "logps/chosen": -58.567054748535156, + "logps/rejected": -72.51802062988281, + "loss": 0.7172, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6908302307128906, + "rewards/margins": 0.4394416809082031, + "rewards/rejected": 2.2513885498046875, + "step": 10196 + }, + { + "epoch": 1.66, + "learning_rate": 7.598988147747666e-07, + "logits/chosen": -1.1315842866897583, + "logits/rejected": -1.1152061223983765, + "logps/chosen": -76.7674789428711, + "logps/rejected": -74.31364440917969, + "loss": 0.1103, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.929863691329956, + "rewards/margins": 1.424252986907959, + "rewards/rejected": 1.505610704421997, + "step": 10197 + }, + { + "epoch": 1.66, + "learning_rate": 7.592024541783344e-07, + "logits/chosen": -1.0396933555603027, + "logits/rejected": -1.0548852682113647, + "logps/chosen": -113.40238189697266, + "logps/rejected": -202.40084838867188, + "loss": 0.1969, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0789437294006348, + "rewards/margins": 0.7887260913848877, + "rewards/rejected": 1.290217638015747, + "step": 10198 + }, + { + "epoch": 1.66, + "learning_rate": 7.585063865808862e-07, + "logits/chosen": -1.2462632656097412, + "logits/rejected": -1.286997675895691, + "logps/chosen": -93.68818664550781, + "logps/rejected": -62.620574951171875, + "loss": 0.0716, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.711508274078369, + "rewards/margins": 1.9214928150177002, + "rewards/rejected": 1.790015459060669, + "step": 10199 + }, + { + "epoch": 1.66, + "learning_rate": 7.578106120305129e-07, + "logits/chosen": -0.9829152822494507, + "logits/rejected": -0.971757173538208, + "logps/chosen": -26.16021728515625, + "logps/rejected": -18.622615814208984, + "loss": 1.058, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.18398533761501312, + "rewards/margins": -0.5398405194282532, + "rewards/rejected": 0.35585519671440125, + "step": 10200 + }, + { + "epoch": 1.66, + "learning_rate": 7.571151305752866e-07, + "logits/chosen": -1.3805029392242432, + "logits/rejected": -1.3782988786697388, + "logps/chosen": -49.95765686035156, + "logps/rejected": -64.11555480957031, + "loss": 0.5527, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4889053106307983, + "rewards/margins": -0.5710114240646362, + "rewards/rejected": 2.0599167346954346, + "step": 10201 + }, + { + "epoch": 1.66, + "learning_rate": 7.564199422632578e-07, + "logits/chosen": -1.3658093214035034, + "logits/rejected": -1.3182929754257202, + "logps/chosen": -46.822471618652344, + "logps/rejected": -49.63056182861328, + "loss": 0.1291, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.758958578109741, + "rewards/margins": 1.4776115417480469, + "rewards/rejected": 2.2813470363616943, + "step": 10202 + }, + { + "epoch": 1.66, + "learning_rate": 7.557250471424588e-07, + "logits/chosen": -1.5926671028137207, + "logits/rejected": -1.6610829830169678, + "logps/chosen": -94.28126525878906, + "logps/rejected": -90.17227172851562, + "loss": 0.3309, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5061066150665283, + "rewards/margins": 0.13906478881835938, + "rewards/rejected": 2.367041826248169, + "step": 10203 + }, + { + "epoch": 1.66, + "learning_rate": 7.550304452608981e-07, + "logits/chosen": -1.2464888095855713, + "logits/rejected": -1.2391374111175537, + "logps/chosen": -8.465028762817383, + "logps/rejected": -2.2809457778930664, + "loss": 0.4426, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3476770520210266, + "rewards/margins": -0.2162872552871704, + "rewards/rejected": 0.563964307308197, + "step": 10204 + }, + { + "epoch": 1.66, + "learning_rate": 7.543361366665686e-07, + "logits/chosen": -1.0117601156234741, + "logits/rejected": -1.012844443321228, + "logps/chosen": -4.678280830383301, + "logps/rejected": -1.297863245010376, + "loss": 3.0225, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5616610646247864, + "rewards/margins": 0.2585236728191376, + "rewards/rejected": 0.3031373918056488, + "step": 10205 + }, + { + "epoch": 1.66, + "learning_rate": 7.53642121407438e-07, + "logits/chosen": -1.353105068206787, + "logits/rejected": -1.3591980934143066, + "logps/chosen": -64.7101058959961, + "logps/rejected": -179.3465576171875, + "loss": 0.1626, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9641151428222656, + "rewards/margins": 1.0273659229278564, + "rewards/rejected": 0.936749279499054, + "step": 10206 + }, + { + "epoch": 1.66, + "learning_rate": 7.529483995314585e-07, + "logits/chosen": -1.6289856433868408, + "logits/rejected": -1.5742733478546143, + "logps/chosen": -249.90267944335938, + "logps/rejected": -84.59197998046875, + "loss": 0.302, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7156312465667725, + "rewards/margins": 0.6294288635253906, + "rewards/rejected": 2.086202383041382, + "step": 10207 + }, + { + "epoch": 1.66, + "learning_rate": 7.522549710865579e-07, + "logits/chosen": -1.2952430248260498, + "logits/rejected": -1.3167572021484375, + "logps/chosen": -136.72998046875, + "logps/rejected": -113.18718719482422, + "loss": 0.0345, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.764090061187744, + "rewards/margins": 3.2095541954040527, + "rewards/rejected": 1.5545357465744019, + "step": 10208 + }, + { + "epoch": 1.66, + "learning_rate": 7.515618361206456e-07, + "logits/chosen": -1.2072278261184692, + "logits/rejected": -1.0886880159378052, + "logps/chosen": -305.4765319824219, + "logps/rejected": -30.692066192626953, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.976981163024902, + "rewards/margins": 5.6872711181640625, + "rewards/rejected": 3.289710283279419, + "step": 10209 + }, + { + "epoch": 1.66, + "learning_rate": 7.508689946816128e-07, + "logits/chosen": -1.3629157543182373, + "logits/rejected": -1.3346813917160034, + "logps/chosen": -99.43000793457031, + "logps/rejected": -151.19776916503906, + "loss": 0.8845, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1416077613830566, + "rewards/margins": 0.7087570428848267, + "rewards/rejected": 1.43285071849823, + "step": 10210 + }, + { + "epoch": 1.66, + "learning_rate": 7.501764468173256e-07, + "logits/chosen": -1.3531113862991333, + "logits/rejected": -1.3531113862991333, + "logps/chosen": -52.98173904418945, + "logps/rejected": -52.98173904418945, + "loss": 0.4186, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.9624457359313965, + "rewards/margins": 0.0, + "rewards/rejected": 4.9624457359313965, + "step": 10211 + }, + { + "epoch": 1.66, + "learning_rate": 7.494841925756353e-07, + "logits/chosen": -1.359309196472168, + "logits/rejected": -1.3149441480636597, + "logps/chosen": -64.53144836425781, + "logps/rejected": -79.1624755859375, + "loss": 0.1711, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.029715061187744, + "rewards/margins": 0.8983299732208252, + "rewards/rejected": 1.131385087966919, + "step": 10212 + }, + { + "epoch": 1.66, + "learning_rate": 7.487922320043667e-07, + "logits/chosen": -1.3157286643981934, + "logits/rejected": -1.2502120733261108, + "logps/chosen": -49.426788330078125, + "logps/rejected": -57.950740814208984, + "loss": 2.4884, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2373123168945312, + "rewards/margins": -1.5284137725830078, + "rewards/rejected": 3.765726089477539, + "step": 10213 + }, + { + "epoch": 1.66, + "learning_rate": 7.481005651513312e-07, + "logits/chosen": -1.5956169366836548, + "logits/rejected": -1.6833096742630005, + "logps/chosen": -145.60577392578125, + "logps/rejected": -63.892578125, + "loss": 0.0615, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.722222805023193, + "rewards/margins": 2.116744041442871, + "rewards/rejected": 5.605478763580322, + "step": 10214 + }, + { + "epoch": 1.66, + "learning_rate": 7.474091920643134e-07, + "logits/chosen": -1.3871581554412842, + "logits/rejected": -1.3649322986602783, + "logps/chosen": -56.19696044921875, + "logps/rejected": -57.43000793457031, + "loss": 0.7178, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.028280735015869, + "rewards/margins": 1.2754700183868408, + "rewards/rejected": 1.7528107166290283, + "step": 10215 + }, + { + "epoch": 1.66, + "learning_rate": 7.467181127910833e-07, + "logits/chosen": -0.9524757862091064, + "logits/rejected": -0.9524757862091064, + "logps/chosen": -21.759693145751953, + "logps/rejected": -21.759693145751953, + "loss": 0.5997, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4648834466934204, + "rewards/margins": 0.0, + "rewards/rejected": 1.4648834466934204, + "step": 10216 + }, + { + "epoch": 1.66, + "learning_rate": 7.460273273793856e-07, + "logits/chosen": -1.0881016254425049, + "logits/rejected": -1.0970838069915771, + "logps/chosen": -62.8073616027832, + "logps/rejected": -62.256553649902344, + "loss": 0.6556, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.559011459350586, + "rewards/margins": 0.06642651557922363, + "rewards/rejected": 2.4925849437713623, + "step": 10217 + }, + { + "epoch": 1.66, + "learning_rate": 7.453368358769492e-07, + "logits/chosen": -1.2780330181121826, + "logits/rejected": -1.291010856628418, + "logps/chosen": -75.24491882324219, + "logps/rejected": -73.0051498413086, + "loss": 0.2597, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4463417530059814, + "rewards/margins": 1.5678681135177612, + "rewards/rejected": 1.8784736394882202, + "step": 10218 + }, + { + "epoch": 1.66, + "learning_rate": 7.446466383314782e-07, + "logits/chosen": -1.4531476497650146, + "logits/rejected": -1.4401586055755615, + "logps/chosen": -126.5687026977539, + "logps/rejected": -92.15436553955078, + "loss": 0.2141, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1843955516815186, + "rewards/margins": 0.819488525390625, + "rewards/rejected": 2.3649070262908936, + "step": 10219 + }, + { + "epoch": 1.66, + "learning_rate": 7.439567347906612e-07, + "logits/chosen": -0.9997139573097229, + "logits/rejected": -1.0372276306152344, + "logps/chosen": -45.427146911621094, + "logps/rejected": -66.17022705078125, + "loss": 1.121, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3165230751037598, + "rewards/margins": 0.7303017377853394, + "rewards/rejected": 1.5862213373184204, + "step": 10220 + }, + { + "epoch": 1.66, + "learning_rate": 7.432671253021618e-07, + "logits/chosen": -1.1295630931854248, + "logits/rejected": -0.9221720695495605, + "logps/chosen": -32.29454040527344, + "logps/rejected": -47.939910888671875, + "loss": 2.4903, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8889825344085693, + "rewards/margins": 1.7192720174789429, + "rewards/rejected": 1.1697105169296265, + "step": 10221 + }, + { + "epoch": 1.66, + "learning_rate": 7.425778099136272e-07, + "logits/chosen": -1.5507087707519531, + "logits/rejected": -1.4624706506729126, + "logps/chosen": -76.56714630126953, + "logps/rejected": -35.11648178100586, + "loss": 0.2977, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.047418117523193, + "rewards/margins": 1.328895092010498, + "rewards/rejected": 3.7185230255126953, + "step": 10222 + }, + { + "epoch": 1.66, + "learning_rate": 7.418887886726811e-07, + "logits/chosen": -0.996540367603302, + "logits/rejected": -0.9968361854553223, + "logps/chosen": -46.239105224609375, + "logps/rejected": -38.124778747558594, + "loss": 0.7571, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.115818738937378, + "rewards/margins": -1.1467888355255127, + "rewards/rejected": 3.2626075744628906, + "step": 10223 + }, + { + "epoch": 1.66, + "learning_rate": 7.4120006162693e-07, + "logits/chosen": -1.2250819206237793, + "logits/rejected": -1.2153115272521973, + "logps/chosen": -86.17730712890625, + "logps/rejected": -49.32522964477539, + "loss": 0.4262, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1764824390411377, + "rewards/margins": -0.2949641942977905, + "rewards/rejected": 1.4714466333389282, + "step": 10224 + }, + { + "epoch": 1.66, + "learning_rate": 7.405116288239562e-07, + "logits/chosen": -1.5216515064239502, + "logits/rejected": -1.464868426322937, + "logps/chosen": -62.395938873291016, + "logps/rejected": -76.33258056640625, + "loss": 0.3066, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.495548725128174, + "rewards/margins": 0.17424654960632324, + "rewards/rejected": 3.3213021755218506, + "step": 10225 + }, + { + "epoch": 1.66, + "learning_rate": 7.398234903113266e-07, + "logits/chosen": -1.4563544988632202, + "logits/rejected": -1.400172233581543, + "logps/chosen": -64.03530883789062, + "logps/rejected": -74.50492095947266, + "loss": 0.6264, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0795273780822754, + "rewards/margins": -0.3893454074859619, + "rewards/rejected": 3.4688727855682373, + "step": 10226 + }, + { + "epoch": 1.66, + "learning_rate": 7.391356461365823e-07, + "logits/chosen": -1.1808336973190308, + "logits/rejected": -1.1832447052001953, + "logps/chosen": -3.33831787109375, + "logps/rejected": -7.136765956878662, + "loss": 0.3899, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35785403847694397, + "rewards/margins": 0.03490915894508362, + "rewards/rejected": 0.32294487953186035, + "step": 10227 + }, + { + "epoch": 1.66, + "learning_rate": 7.384480963472496e-07, + "logits/chosen": -1.4581098556518555, + "logits/rejected": -1.5804157257080078, + "logps/chosen": -67.00196075439453, + "logps/rejected": -107.06837463378906, + "loss": 3.2072, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1630303859710693, + "rewards/margins": -6.22268009185791, + "rewards/rejected": 8.385710716247559, + "step": 10228 + }, + { + "epoch": 1.66, + "learning_rate": 7.377608409908287e-07, + "logits/chosen": -1.3489569425582886, + "logits/rejected": -1.2588146924972534, + "logps/chosen": -40.799774169921875, + "logps/rejected": -36.23276901245117, + "loss": 0.2001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0293405055999756, + "rewards/margins": 0.9402016401290894, + "rewards/rejected": 1.0891388654708862, + "step": 10229 + }, + { + "epoch": 1.66, + "learning_rate": 7.370738801148053e-07, + "logits/chosen": -1.3540241718292236, + "logits/rejected": -1.2659809589385986, + "logps/chosen": -70.7542495727539, + "logps/rejected": -61.7393684387207, + "loss": 0.1324, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6518173217773438, + "rewards/margins": 1.310935616493225, + "rewards/rejected": 1.3408817052841187, + "step": 10230 + }, + { + "epoch": 1.66, + "learning_rate": 7.3638721376664e-07, + "logits/chosen": -1.3934298753738403, + "logits/rejected": -1.1734728813171387, + "logps/chosen": -99.04449462890625, + "logps/rejected": -45.77714538574219, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.343252658843994, + "rewards/margins": 4.070370674133301, + "rewards/rejected": 2.2728822231292725, + "step": 10231 + }, + { + "epoch": 1.66, + "learning_rate": 7.357008419937761e-07, + "logits/chosen": -1.4641377925872803, + "logits/rejected": -1.4641377925872803, + "logps/chosen": -117.14344024658203, + "logps/rejected": -117.14344024658203, + "loss": 0.6378, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.886208534240723, + "rewards/margins": 0.0, + "rewards/rejected": 6.886208534240723, + "step": 10232 + }, + { + "epoch": 1.66, + "learning_rate": 7.35014764843634e-07, + "logits/chosen": -1.560839056968689, + "logits/rejected": -1.5611920356750488, + "logps/chosen": -60.157169342041016, + "logps/rejected": -91.58045959472656, + "loss": 0.5495, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6447200775146484, + "rewards/margins": -0.6509342193603516, + "rewards/rejected": 3.295654296875, + "step": 10233 + }, + { + "epoch": 1.66, + "learning_rate": 7.343289823636168e-07, + "logits/chosen": -1.107688546180725, + "logits/rejected": -1.107688546180725, + "logps/chosen": -83.2898941040039, + "logps/rejected": -83.2898941040039, + "loss": 0.4821, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.292112112045288, + "rewards/margins": 0.0, + "rewards/rejected": 3.292112112045288, + "step": 10234 + }, + { + "epoch": 1.66, + "learning_rate": 7.33643494601104e-07, + "logits/chosen": -1.2160006761550903, + "logits/rejected": -1.230117678642273, + "logps/chosen": -26.17084312438965, + "logps/rejected": -30.53377914428711, + "loss": 0.3816, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.063965082168579, + "rewards/margins": 0.02957630157470703, + "rewards/rejected": 3.034388780593872, + "step": 10235 + }, + { + "epoch": 1.66, + "learning_rate": 7.329583016034581e-07, + "logits/chosen": -1.4514671564102173, + "logits/rejected": -1.4863872528076172, + "logps/chosen": -170.0972442626953, + "logps/rejected": -67.6336669921875, + "loss": 0.0385, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.643006801605225, + "rewards/margins": 4.139229774475098, + "rewards/rejected": 2.503777265548706, + "step": 10236 + }, + { + "epoch": 1.66, + "learning_rate": 7.322734034180174e-07, + "logits/chosen": -1.1214354038238525, + "logits/rejected": -1.1239945888519287, + "logps/chosen": -52.922943115234375, + "logps/rejected": -85.43426513671875, + "loss": 1.4156, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1258697509765625, + "rewards/margins": -1.534250020980835, + "rewards/rejected": 3.6601197719573975, + "step": 10237 + }, + { + "epoch": 1.66, + "learning_rate": 7.315888000921034e-07, + "logits/chosen": -1.081383466720581, + "logits/rejected": -1.0105161666870117, + "logps/chosen": -48.0478515625, + "logps/rejected": -72.85944366455078, + "loss": 0.3256, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3729782104492188, + "rewards/margins": 0.19378876686096191, + "rewards/rejected": 2.179189443588257, + "step": 10238 + }, + { + "epoch": 1.66, + "learning_rate": 7.309044916730146e-07, + "logits/chosen": -1.1621568202972412, + "logits/rejected": -1.1621568202972412, + "logps/chosen": -27.165790557861328, + "logps/rejected": -27.165790557861328, + "loss": 0.3476, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.186489462852478, + "rewards/margins": 0.0, + "rewards/rejected": 1.186489462852478, + "step": 10239 + }, + { + "epoch": 1.66, + "learning_rate": 7.302204782080324e-07, + "logits/chosen": -1.1171718835830688, + "logits/rejected": -1.1800490617752075, + "logps/chosen": -57.697837829589844, + "logps/rejected": -44.57710647583008, + "loss": 0.5145, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.908623456954956, + "rewards/margins": -0.05918693542480469, + "rewards/rejected": 2.9678103923797607, + "step": 10240 + }, + { + "epoch": 1.66, + "learning_rate": 7.295367597444125e-07, + "logits/chosen": -1.0444213151931763, + "logits/rejected": -0.9513816833496094, + "logps/chosen": -84.09283447265625, + "logps/rejected": -13.495261192321777, + "loss": 0.6062, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9700798392295837, + "rewards/margins": 0.12717336416244507, + "rewards/rejected": 0.8429064750671387, + "step": 10241 + }, + { + "epoch": 1.66, + "learning_rate": 7.288533363293959e-07, + "logits/chosen": -1.350441336631775, + "logits/rejected": -0.8336607813835144, + "logps/chosen": -73.50981140136719, + "logps/rejected": -213.77252197265625, + "loss": 0.2589, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9599976539611816, + "rewards/margins": 1.3303452730178833, + "rewards/rejected": 1.6296523809432983, + "step": 10242 + }, + { + "epoch": 1.66, + "learning_rate": 7.281702080102004e-07, + "logits/chosen": -1.4260634183883667, + "logits/rejected": -1.456525444984436, + "logps/chosen": -125.20916748046875, + "logps/rejected": -100.97737121582031, + "loss": 1.019, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.906256198883057, + "rewards/margins": -1.8909316062927246, + "rewards/rejected": 8.797187805175781, + "step": 10243 + }, + { + "epoch": 1.66, + "learning_rate": 7.274873748340216e-07, + "logits/chosen": -1.16089928150177, + "logits/rejected": -1.1502050161361694, + "logps/chosen": -17.298887252807617, + "logps/rejected": -26.309837341308594, + "loss": 0.8315, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9084417223930359, + "rewards/margins": -0.21097201108932495, + "rewards/rejected": 1.1194137334823608, + "step": 10244 + }, + { + "epoch": 1.66, + "learning_rate": 7.268048368480396e-07, + "logits/chosen": -1.235373616218567, + "logits/rejected": -1.084904670715332, + "logps/chosen": -107.79058837890625, + "logps/rejected": -32.50638198852539, + "loss": 0.2305, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.238302707672119, + "rewards/margins": 0.8834972381591797, + "rewards/rejected": 2.3548054695129395, + "step": 10245 + }, + { + "epoch": 1.66, + "learning_rate": 7.261225940994088e-07, + "logits/chosen": -1.0914019346237183, + "logits/rejected": -1.1743665933609009, + "logps/chosen": -120.92847442626953, + "logps/rejected": -106.8668212890625, + "loss": 1.0961, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.887973785400391, + "rewards/margins": -1.3846931457519531, + "rewards/rejected": 8.272666931152344, + "step": 10246 + }, + { + "epoch": 1.66, + "learning_rate": 7.254406466352682e-07, + "logits/chosen": -1.3097217082977295, + "logits/rejected": -1.3061891794204712, + "logps/chosen": -67.9401626586914, + "logps/rejected": -76.82779693603516, + "loss": 0.7367, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9065895080566406, + "rewards/margins": -0.5577774047851562, + "rewards/rejected": 2.464366912841797, + "step": 10247 + }, + { + "epoch": 1.66, + "learning_rate": 7.247589945027311e-07, + "logits/chosen": -1.3988487720489502, + "logits/rejected": -1.404784083366394, + "logps/chosen": -69.1124038696289, + "logps/rejected": -78.96479797363281, + "loss": 0.3153, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.025189161300659, + "rewards/margins": 0.4492156505584717, + "rewards/rejected": 1.5759735107421875, + "step": 10248 + }, + { + "epoch": 1.66, + "learning_rate": 7.240776377488962e-07, + "logits/chosen": -1.4394599199295044, + "logits/rejected": -1.4550663232803345, + "logps/chosen": -37.0582389831543, + "logps/rejected": -54.74320983886719, + "loss": 0.4963, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.736638307571411, + "rewards/margins": 0.0908207893371582, + "rewards/rejected": 3.645817518234253, + "step": 10249 + }, + { + "epoch": 1.66, + "learning_rate": 7.233965764208367e-07, + "logits/chosen": -1.3379687070846558, + "logits/rejected": -1.2481826543807983, + "logps/chosen": -58.48919677734375, + "logps/rejected": -13.488136291503906, + "loss": 0.1436, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8607711791992188, + "rewards/margins": 1.103894591331482, + "rewards/rejected": 0.7568765878677368, + "step": 10250 + }, + { + "epoch": 1.66, + "learning_rate": 7.227158105656084e-07, + "logits/chosen": -0.9144759774208069, + "logits/rejected": -0.9112324118614197, + "logps/chosen": -3.8990204334259033, + "logps/rejected": -13.747014999389648, + "loss": 0.6513, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5741273164749146, + "rewards/margins": -0.3390030860900879, + "rewards/rejected": 0.9131304025650024, + "step": 10251 + }, + { + "epoch": 1.66, + "learning_rate": 7.22035340230245e-07, + "logits/chosen": -1.1328117847442627, + "logits/rejected": -1.0865492820739746, + "logps/chosen": -60.734962463378906, + "logps/rejected": -60.282997131347656, + "loss": 0.2, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8871331214904785, + "rewards/margins": 0.7164504528045654, + "rewards/rejected": 2.170682668685913, + "step": 10252 + }, + { + "epoch": 1.66, + "learning_rate": 7.213551654617623e-07, + "logits/chosen": -1.6102266311645508, + "logits/rejected": -1.0877165794372559, + "logps/chosen": -142.4071807861328, + "logps/rejected": -127.41445922851562, + "loss": 0.2014, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.990708827972412, + "rewards/margins": 6.276780605316162, + "rewards/rejected": 1.71392822265625, + "step": 10253 + }, + { + "epoch": 1.66, + "learning_rate": 7.206752863071515e-07, + "logits/chosen": -1.191962480545044, + "logits/rejected": -1.191962480545044, + "logps/chosen": -60.87136459350586, + "logps/rejected": -60.87136459350586, + "loss": 0.7064, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5083850622177124, + "rewards/margins": 0.0, + "rewards/rejected": 1.5083850622177124, + "step": 10254 + }, + { + "epoch": 1.66, + "learning_rate": 7.19995702813388e-07, + "logits/chosen": -1.4275745153427124, + "logits/rejected": -1.3154792785644531, + "logps/chosen": -98.49357604980469, + "logps/rejected": -28.63524627685547, + "loss": 0.1184, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.46142578125, + "rewards/margins": 1.5634613037109375, + "rewards/rejected": -0.1020355224609375, + "step": 10255 + }, + { + "epoch": 1.66, + "learning_rate": 7.193164150274229e-07, + "logits/chosen": -0.8802509307861328, + "logits/rejected": -0.8802509307861328, + "logps/chosen": -63.674400329589844, + "logps/rejected": -63.674400329589844, + "loss": 0.4643, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7839561700820923, + "rewards/margins": 0.0, + "rewards/rejected": 1.7839561700820923, + "step": 10256 + }, + { + "epoch": 1.66, + "learning_rate": 7.186374229961902e-07, + "logits/chosen": -1.6906346082687378, + "logits/rejected": -1.5601654052734375, + "logps/chosen": -32.485286712646484, + "logps/rejected": -94.48225402832031, + "loss": 0.3272, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.202827215194702, + "rewards/margins": 0.3922710418701172, + "rewards/rejected": 2.810556173324585, + "step": 10257 + }, + { + "epoch": 1.66, + "learning_rate": 7.179587267665999e-07, + "logits/chosen": -0.9774608016014099, + "logits/rejected": -0.9774608016014099, + "logps/chosen": -12.87971019744873, + "logps/rejected": -12.87971019744873, + "loss": 0.3776, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1049714088439941, + "rewards/margins": 0.0, + "rewards/rejected": 1.1049714088439941, + "step": 10258 + }, + { + "epoch": 1.67, + "learning_rate": 7.172803263855455e-07, + "logits/chosen": -1.1578136682510376, + "logits/rejected": -1.1239954233169556, + "logps/chosen": -53.542606353759766, + "logps/rejected": -69.38554382324219, + "loss": 0.74, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.17870831489563, + "rewards/margins": -0.5369687080383301, + "rewards/rejected": 2.71567702293396, + "step": 10259 + }, + { + "epoch": 1.67, + "learning_rate": 7.166022218998963e-07, + "logits/chosen": -1.4537856578826904, + "logits/rejected": -1.5306479930877686, + "logps/chosen": -89.1949462890625, + "logps/rejected": -91.14202880859375, + "loss": 0.9918, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.004054307937622, + "rewards/margins": -1.798147439956665, + "rewards/rejected": 4.802201747894287, + "step": 10260 + }, + { + "epoch": 1.67, + "learning_rate": 7.159244133565047e-07, + "logits/chosen": -1.3250834941864014, + "logits/rejected": -1.3795225620269775, + "logps/chosen": -52.62376403808594, + "logps/rejected": -76.54252624511719, + "loss": 0.6538, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.144505262374878, + "rewards/margins": -0.45153284072875977, + "rewards/rejected": 3.5960381031036377, + "step": 10261 + }, + { + "epoch": 1.67, + "learning_rate": 7.152469008021984e-07, + "logits/chosen": -0.9171630144119263, + "logits/rejected": -0.9898419976234436, + "logps/chosen": -25.632732391357422, + "logps/rejected": -43.26920700073242, + "loss": 1.3803, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.063983917236328, + "rewards/margins": -2.175654411315918, + "rewards/rejected": 4.239638328552246, + "step": 10262 + }, + { + "epoch": 1.67, + "learning_rate": 7.145696842837895e-07, + "logits/chosen": -1.3637769222259521, + "logits/rejected": -1.1350650787353516, + "logps/chosen": -89.52937316894531, + "logps/rejected": -9.317516326904297, + "loss": 0.0326, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.062257289886475, + "rewards/margins": 6.255478382110596, + "rewards/rejected": 0.8067787289619446, + "step": 10263 + }, + { + "epoch": 1.67, + "learning_rate": 7.138927638480659e-07, + "logits/chosen": -0.9523524045944214, + "logits/rejected": -0.9799956679344177, + "logps/chosen": -40.197532653808594, + "logps/rejected": -81.05990600585938, + "loss": 0.6624, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7751197814941406, + "rewards/margins": 1.7293243408203125, + "rewards/rejected": 1.0457954406738281, + "step": 10264 + }, + { + "epoch": 1.67, + "learning_rate": 7.132161395417964e-07, + "logits/chosen": -1.3833624124526978, + "logits/rejected": -1.3514537811279297, + "logps/chosen": -71.10916137695312, + "logps/rejected": -89.15208435058594, + "loss": 2.5348, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.711225986480713, + "rewards/margins": -4.451791286468506, + "rewards/rejected": 7.163017272949219, + "step": 10265 + }, + { + "epoch": 1.67, + "learning_rate": 7.125398114117305e-07, + "logits/chosen": -1.4639391899108887, + "logits/rejected": -1.5378150939941406, + "logps/chosen": -262.0589904785156, + "logps/rejected": -86.15653228759766, + "loss": 0.0261, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.343570232391357, + "rewards/margins": 3.5548593997955322, + "rewards/rejected": 3.788710832595825, + "step": 10266 + }, + { + "epoch": 1.67, + "learning_rate": 7.118637795045946e-07, + "logits/chosen": -1.3350757360458374, + "logits/rejected": -1.385625958442688, + "logps/chosen": -91.81815338134766, + "logps/rejected": -251.24859619140625, + "loss": 0.9031, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.783080577850342, + "rewards/margins": -1.5655155181884766, + "rewards/rejected": 6.348596096038818, + "step": 10267 + }, + { + "epoch": 1.67, + "learning_rate": 7.111880438670982e-07, + "logits/chosen": -1.4048123359680176, + "logits/rejected": -1.3045679330825806, + "logps/chosen": -90.90692138671875, + "logps/rejected": -113.93582916259766, + "loss": 0.2181, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.40797758102417, + "rewards/margins": 0.7071452140808105, + "rewards/rejected": 4.700832366943359, + "step": 10268 + }, + { + "epoch": 1.67, + "learning_rate": 7.105126045459254e-07, + "logits/chosen": -1.2879481315612793, + "logits/rejected": -1.2273682355880737, + "logps/chosen": -53.188568115234375, + "logps/rejected": -44.987159729003906, + "loss": 0.5747, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.372116804122925, + "rewards/margins": -0.4973106384277344, + "rewards/rejected": 3.869427442550659, + "step": 10269 + }, + { + "epoch": 1.67, + "learning_rate": 7.098374615877452e-07, + "logits/chosen": -1.274781346321106, + "logits/rejected": -1.293319582939148, + "logps/chosen": -78.20384216308594, + "logps/rejected": -58.41777801513672, + "loss": 0.2415, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6563827991485596, + "rewards/margins": 0.8552917242050171, + "rewards/rejected": 1.8010910749435425, + "step": 10270 + }, + { + "epoch": 1.67, + "learning_rate": 7.091626150392011e-07, + "logits/chosen": -1.2935619354248047, + "logits/rejected": -1.2935619354248047, + "logps/chosen": -36.243263244628906, + "logps/rejected": -36.243263244628906, + "loss": 0.363, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.139136791229248, + "rewards/margins": 0.0, + "rewards/rejected": 4.139136791229248, + "step": 10271 + }, + { + "epoch": 1.67, + "learning_rate": 7.084880649469217e-07, + "logits/chosen": -1.0008503198623657, + "logits/rejected": -1.0480692386627197, + "logps/chosen": -29.538589477539062, + "logps/rejected": -71.54653930664062, + "loss": 0.4432, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9057605266571045, + "rewards/margins": 1.5454639196395874, + "rewards/rejected": 1.360296607017517, + "step": 10272 + }, + { + "epoch": 1.67, + "learning_rate": 7.078138113575089e-07, + "logits/chosen": -1.3157739639282227, + "logits/rejected": -1.1452374458312988, + "logps/chosen": -84.61532592773438, + "logps/rejected": -55.4265251159668, + "loss": 0.3229, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.868997097015381, + "rewards/margins": 3.501716136932373, + "rewards/rejected": 4.367280960083008, + "step": 10273 + }, + { + "epoch": 1.67, + "learning_rate": 7.0713985431755e-07, + "logits/chosen": -1.2813266515731812, + "logits/rejected": -1.3142627477645874, + "logps/chosen": -56.10960388183594, + "logps/rejected": -146.6986083984375, + "loss": 0.6295, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8586822748184204, + "rewards/margins": -0.24240267276763916, + "rewards/rejected": 2.1010849475860596, + "step": 10274 + }, + { + "epoch": 1.67, + "learning_rate": 7.064661938736067e-07, + "logits/chosen": -1.236404538154602, + "logits/rejected": -1.1897510290145874, + "logps/chosen": -54.669837951660156, + "logps/rejected": -76.42329406738281, + "loss": 3.6155, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8873443603515625, + "rewards/margins": -0.13225412368774414, + "rewards/rejected": 3.0195984840393066, + "step": 10275 + }, + { + "epoch": 1.67, + "learning_rate": 7.057928300722244e-07, + "logits/chosen": -1.5505377054214478, + "logits/rejected": -1.439751386642456, + "logps/chosen": -47.50721740722656, + "logps/rejected": -15.680907249450684, + "loss": 0.0605, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.722611904144287, + "rewards/margins": 2.704392433166504, + "rewards/rejected": 2.018219470977783, + "step": 10276 + }, + { + "epoch": 1.67, + "learning_rate": 7.051197629599249e-07, + "logits/chosen": -1.4616031646728516, + "logits/rejected": -1.455775499343872, + "logps/chosen": -100.17066955566406, + "logps/rejected": -75.28884887695312, + "loss": 1.3901, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2265350818634033, + "rewards/margins": -2.5524370670318604, + "rewards/rejected": 5.778972148895264, + "step": 10277 + }, + { + "epoch": 1.67, + "learning_rate": 7.044469925832115e-07, + "logits/chosen": -1.2772940397262573, + "logits/rejected": -1.2588400840759277, + "logps/chosen": -29.34899139404297, + "logps/rejected": -56.93714904785156, + "loss": 1.2132, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.694395065307617, + "rewards/margins": -0.5577206611633301, + "rewards/rejected": 3.2521157264709473, + "step": 10278 + }, + { + "epoch": 1.67, + "learning_rate": 7.037745189885653e-07, + "logits/chosen": -1.1279610395431519, + "logits/rejected": -1.219922423362732, + "logps/chosen": -78.64723205566406, + "logps/rejected": -137.8560791015625, + "loss": 1.2905, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.800234317779541, + "rewards/margins": -2.2988791465759277, + "rewards/rejected": 5.099113464355469, + "step": 10279 + }, + { + "epoch": 1.67, + "learning_rate": 7.031023422224498e-07, + "logits/chosen": -1.1460537910461426, + "logits/rejected": -1.1110526323318481, + "logps/chosen": -59.44459533691406, + "logps/rejected": -65.34867095947266, + "loss": 0.434, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.453009009361267, + "rewards/margins": -0.3040642738342285, + "rewards/rejected": 1.7570732831954956, + "step": 10280 + }, + { + "epoch": 1.67, + "learning_rate": 7.024304623313038e-07, + "logits/chosen": -1.4994122982025146, + "logits/rejected": -1.4685642719268799, + "logps/chosen": -87.53790283203125, + "logps/rejected": -71.82813262939453, + "loss": 0.6347, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.018113851547241, + "rewards/margins": -0.10561370849609375, + "rewards/rejected": 3.123727560043335, + "step": 10281 + }, + { + "epoch": 1.67, + "learning_rate": 7.017588793615499e-07, + "logits/chosen": -1.2511414289474487, + "logits/rejected": -1.2511414289474487, + "logps/chosen": -108.38668823242188, + "logps/rejected": -108.38668823242188, + "loss": 0.3942, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.7885100841522217, + "rewards/margins": 0.0, + "rewards/rejected": 3.7885100841522217, + "step": 10282 + }, + { + "epoch": 1.67, + "learning_rate": 7.010875933595856e-07, + "logits/chosen": -1.3268349170684814, + "logits/rejected": -1.3491895198822021, + "logps/chosen": -65.67671966552734, + "logps/rejected": -49.112979888916016, + "loss": 0.3528, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5912842750549316, + "rewards/margins": 0.5106220245361328, + "rewards/rejected": 2.080662250518799, + "step": 10283 + }, + { + "epoch": 1.67, + "learning_rate": 7.004166043717936e-07, + "logits/chosen": -1.3681751489639282, + "logits/rejected": -1.3602412939071655, + "logps/chosen": -65.65446472167969, + "logps/rejected": -130.61550903320312, + "loss": 0.8326, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.047137498855591, + "rewards/margins": 1.2483367919921875, + "rewards/rejected": 0.7988006472587585, + "step": 10284 + }, + { + "epoch": 1.67, + "learning_rate": 6.997459124445294e-07, + "logits/chosen": -0.8716973066329956, + "logits/rejected": -0.7341541647911072, + "logps/chosen": -27.366344451904297, + "logps/rejected": -4.098586559295654, + "loss": 0.3123, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.948463797569275, + "rewards/margins": 1.2833073139190674, + "rewards/rejected": 0.6651565432548523, + "step": 10285 + }, + { + "epoch": 1.67, + "learning_rate": 6.990755176241349e-07, + "logits/chosen": -1.4306285381317139, + "logits/rejected": -1.191731333732605, + "logps/chosen": -162.00140380859375, + "logps/rejected": -96.4224853515625, + "loss": 0.047, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.314802646636963, + "rewards/margins": 2.7809159755706787, + "rewards/rejected": 3.533886671066284, + "step": 10286 + }, + { + "epoch": 1.67, + "learning_rate": 6.984054199569251e-07, + "logits/chosen": -1.2277742624282837, + "logits/rejected": -1.0851813554763794, + "logps/chosen": -72.24217987060547, + "logps/rejected": -56.6650390625, + "loss": 0.0851, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6645781993865967, + "rewards/margins": 1.696304202079773, + "rewards/rejected": 1.9682739973068237, + "step": 10287 + }, + { + "epoch": 1.67, + "learning_rate": 6.977356194891999e-07, + "logits/chosen": -1.1937915086746216, + "logits/rejected": -1.3428361415863037, + "logps/chosen": -80.86465454101562, + "logps/rejected": -97.09964752197266, + "loss": 1.4891, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.287837266921997, + "rewards/margins": -1.919339895248413, + "rewards/rejected": 5.20717716217041, + "step": 10288 + }, + { + "epoch": 1.67, + "learning_rate": 6.970661162672338e-07, + "logits/chosen": -1.535408616065979, + "logits/rejected": -1.2666003704071045, + "logps/chosen": -107.34870147705078, + "logps/rejected": -155.84835815429688, + "loss": 0.1319, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.753148555755615, + "rewards/margins": 6.4187397956848145, + "rewards/rejected": -0.6655914187431335, + "step": 10289 + }, + { + "epoch": 1.67, + "learning_rate": 6.963969103372858e-07, + "logits/chosen": -1.2486705780029297, + "logits/rejected": -1.0909513235092163, + "logps/chosen": -99.48341369628906, + "logps/rejected": -71.29110717773438, + "loss": 0.0324, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.932455539703369, + "rewards/margins": 4.035896301269531, + "rewards/rejected": 2.896559238433838, + "step": 10290 + }, + { + "epoch": 1.67, + "learning_rate": 6.957280017455887e-07, + "logits/chosen": -0.8712236285209656, + "logits/rejected": -0.8711668848991394, + "logps/chosen": -1.2790229320526123, + "logps/rejected": -2.7570080757141113, + "loss": 0.9126, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3042931854724884, + "rewards/margins": -0.10761606693267822, + "rewards/rejected": 0.4119092524051666, + "step": 10291 + }, + { + "epoch": 1.67, + "learning_rate": 6.950593905383607e-07, + "logits/chosen": -1.76251220703125, + "logits/rejected": -1.5491188764572144, + "logps/chosen": -84.47064208984375, + "logps/rejected": -85.33505249023438, + "loss": 0.2098, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.437071800231934, + "rewards/margins": 1.7918248176574707, + "rewards/rejected": 6.645246982574463, + "step": 10292 + }, + { + "epoch": 1.67, + "learning_rate": 6.943910767617934e-07, + "logits/chosen": -1.244425892829895, + "logits/rejected": -1.1315491199493408, + "logps/chosen": -75.81695556640625, + "logps/rejected": -56.96075439453125, + "loss": 0.1827, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.338284969329834, + "rewards/margins": 1.9463064670562744, + "rewards/rejected": 2.3919785022735596, + "step": 10293 + }, + { + "epoch": 1.67, + "learning_rate": 6.937230604620642e-07, + "logits/chosen": -1.3600956201553345, + "logits/rejected": -1.2495042085647583, + "logps/chosen": -156.1796875, + "logps/rejected": -179.31568908691406, + "loss": 1.179, + "rewards/accuracies": 0.0, + "rewards/chosen": 9.0128812789917, + "rewards/margins": -2.2578630447387695, + "rewards/rejected": 11.270744323730469, + "step": 10294 + }, + { + "epoch": 1.67, + "learning_rate": 6.930553416853242e-07, + "logits/chosen": -1.3825786113739014, + "logits/rejected": -1.5197426080703735, + "logps/chosen": -100.07632446289062, + "logps/rejected": -36.75894546508789, + "loss": 0.159, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.081998586654663, + "rewards/margins": 1.931104063987732, + "rewards/rejected": 0.15089455246925354, + "step": 10295 + }, + { + "epoch": 1.67, + "learning_rate": 6.923879204777084e-07, + "logits/chosen": -1.3342695236206055, + "logits/rejected": -1.3742220401763916, + "logps/chosen": -44.41845703125, + "logps/rejected": -97.24504089355469, + "loss": 0.3304, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6027755737304688, + "rewards/margins": 0.693016767501831, + "rewards/rejected": 2.9097588062286377, + "step": 10296 + }, + { + "epoch": 1.67, + "learning_rate": 6.917207968853268e-07, + "logits/chosen": -1.645885944366455, + "logits/rejected": -1.635424017906189, + "logps/chosen": -109.06001281738281, + "logps/rejected": -106.67280578613281, + "loss": 0.5319, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.5242600440979, + "rewards/margins": -0.5996828079223633, + "rewards/rejected": 6.123942852020264, + "step": 10297 + }, + { + "epoch": 1.67, + "learning_rate": 6.910539709542747e-07, + "logits/chosen": -1.1694573163986206, + "logits/rejected": -1.114466905593872, + "logps/chosen": -36.916282653808594, + "logps/rejected": -21.83165740966797, + "loss": 0.2535, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6341254711151123, + "rewards/margins": 1.9361414909362793, + "rewards/rejected": 1.697983980178833, + "step": 10298 + }, + { + "epoch": 1.67, + "learning_rate": 6.903874427306196e-07, + "logits/chosen": -0.9560680389404297, + "logits/rejected": -0.9406812787055969, + "logps/chosen": -74.10572052001953, + "logps/rejected": -44.958274841308594, + "loss": 0.4508, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2068703174591064, + "rewards/margins": 0.21597522497177124, + "rewards/rejected": 0.9908950924873352, + "step": 10299 + }, + { + "epoch": 1.67, + "learning_rate": 6.897212122604147e-07, + "logits/chosen": -1.3159289360046387, + "logits/rejected": -1.2611172199249268, + "logps/chosen": -251.45025634765625, + "logps/rejected": -140.25894165039062, + "loss": 1.2067, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.313641548156738, + "rewards/margins": 4.802220344543457, + "rewards/rejected": 1.5114212036132812, + "step": 10300 + }, + { + "epoch": 1.67, + "learning_rate": 6.89055279589691e-07, + "logits/chosen": -0.9102250337600708, + "logits/rejected": -0.9168919920921326, + "logps/chosen": -11.361956596374512, + "logps/rejected": -1.8794090747833252, + "loss": 0.4582, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.08978643268346786, + "rewards/margins": -0.09849847108125687, + "rewards/rejected": 0.18828490376472473, + "step": 10301 + }, + { + "epoch": 1.67, + "learning_rate": 6.883896447644556e-07, + "logits/chosen": -1.511372685432434, + "logits/rejected": -1.5350185632705688, + "logps/chosen": -73.14653015136719, + "logps/rejected": -68.19630432128906, + "loss": 0.4102, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.373403310775757, + "rewards/margins": -0.23804378509521484, + "rewards/rejected": 2.6114470958709717, + "step": 10302 + }, + { + "epoch": 1.67, + "learning_rate": 6.877243078307e-07, + "logits/chosen": -1.5423182249069214, + "logits/rejected": -1.4505268335342407, + "logps/chosen": -79.39388275146484, + "logps/rejected": -24.408140182495117, + "loss": 0.0658, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9222984313964844, + "rewards/margins": 2.5031228065490723, + "rewards/rejected": 0.4191757142543793, + "step": 10303 + }, + { + "epoch": 1.67, + "learning_rate": 6.870592688343908e-07, + "logits/chosen": -1.2455565929412842, + "logits/rejected": -1.254966378211975, + "logps/chosen": -51.025596618652344, + "logps/rejected": -92.43283081054688, + "loss": 0.29, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.049593448638916, + "rewards/margins": 1.3363258838653564, + "rewards/rejected": 0.7132675051689148, + "step": 10304 + }, + { + "epoch": 1.67, + "learning_rate": 6.863945278214773e-07, + "logits/chosen": -1.0082322359085083, + "logits/rejected": -0.9551526308059692, + "logps/chosen": -45.544921875, + "logps/rejected": -70.01397705078125, + "loss": 0.3088, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.491626024246216, + "rewards/margins": 0.18077921867370605, + "rewards/rejected": 2.3108468055725098, + "step": 10305 + }, + { + "epoch": 1.67, + "learning_rate": 6.857300848378857e-07, + "logits/chosen": -1.7951761484146118, + "logits/rejected": -1.7014409303665161, + "logps/chosen": -134.2267303466797, + "logps/rejected": -14.518311500549316, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.72958517074585, + "rewards/margins": 5.803871154785156, + "rewards/rejected": 0.9257140159606934, + "step": 10306 + }, + { + "epoch": 1.67, + "learning_rate": 6.85065939929524e-07, + "logits/chosen": -1.4011832475662231, + "logits/rejected": -1.2686519622802734, + "logps/chosen": -100.97308349609375, + "logps/rejected": -107.74249267578125, + "loss": 0.415, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.037376403808594, + "rewards/margins": 2.8476362228393555, + "rewards/rejected": 1.1897400617599487, + "step": 10307 + }, + { + "epoch": 1.67, + "learning_rate": 6.844020931422763e-07, + "logits/chosen": -0.9128146171569824, + "logits/rejected": -0.8700813055038452, + "logps/chosen": -93.75798034667969, + "logps/rejected": -72.98873901367188, + "loss": 0.0898, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.840935468673706, + "rewards/margins": 1.6539024114608765, + "rewards/rejected": 1.1870330572128296, + "step": 10308 + }, + { + "epoch": 1.67, + "learning_rate": 6.837385445220107e-07, + "logits/chosen": -1.0882498025894165, + "logits/rejected": -1.06120765209198, + "logps/chosen": -90.30465698242188, + "logps/rejected": -50.92706298828125, + "loss": 0.4063, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.97098708152771, + "rewards/margins": -0.07976365089416504, + "rewards/rejected": 3.050750732421875, + "step": 10309 + }, + { + "epoch": 1.67, + "learning_rate": 6.830752941145702e-07, + "logits/chosen": -1.386728286743164, + "logits/rejected": -1.4525483846664429, + "logps/chosen": -96.59902954101562, + "logps/rejected": -152.9864501953125, + "loss": 0.9778, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.397540330886841, + "rewards/margins": -1.4859192371368408, + "rewards/rejected": 4.883459568023682, + "step": 10310 + }, + { + "epoch": 1.67, + "learning_rate": 6.824123419657802e-07, + "logits/chosen": -1.3075968027114868, + "logits/rejected": -1.3136348724365234, + "logps/chosen": -56.763328552246094, + "logps/rejected": -70.12220764160156, + "loss": 0.6679, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.459033250808716, + "rewards/margins": -1.0299973487854004, + "rewards/rejected": 3.489030599594116, + "step": 10311 + }, + { + "epoch": 1.67, + "learning_rate": 6.81749688121443e-07, + "logits/chosen": -0.9835054278373718, + "logits/rejected": -1.0146828889846802, + "logps/chosen": -21.178613662719727, + "logps/rejected": -20.687198638916016, + "loss": 1.068, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2609308362007141, + "rewards/margins": -1.3113272190093994, + "rewards/rejected": 1.5722579956054688, + "step": 10312 + }, + { + "epoch": 1.67, + "learning_rate": 6.810873326273438e-07, + "logits/chosen": -0.6912917494773865, + "logits/rejected": -0.6912917494773865, + "logps/chosen": -9.032638549804688, + "logps/rejected": -9.032638549804688, + "loss": 1.123, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6329441070556641, + "rewards/margins": 0.0, + "rewards/rejected": 0.6329441070556641, + "step": 10313 + }, + { + "epoch": 1.67, + "learning_rate": 6.804252755292429e-07, + "logits/chosen": -1.3441227674484253, + "logits/rejected": -1.3073668479919434, + "logps/chosen": -74.14497375488281, + "logps/rejected": -173.06028747558594, + "loss": 0.1583, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3990790843963623, + "rewards/margins": 1.7201132774353027, + "rewards/rejected": 1.6789658069610596, + "step": 10314 + }, + { + "epoch": 1.67, + "learning_rate": 6.797635168728844e-07, + "logits/chosen": -1.1485905647277832, + "logits/rejected": -1.1904634237289429, + "logps/chosen": -84.60322570800781, + "logps/rejected": -117.30125427246094, + "loss": 0.6342, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.754350185394287, + "rewards/margins": -0.49939727783203125, + "rewards/rejected": 5.253747463226318, + "step": 10315 + }, + { + "epoch": 1.67, + "learning_rate": 6.791020567039874e-07, + "logits/chosen": -1.3729709386825562, + "logits/rejected": -1.373143196105957, + "logps/chosen": -2.4442172050476074, + "logps/rejected": -1.6910933256149292, + "loss": 0.797, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.18220864236354828, + "rewards/margins": -0.09762386977672577, + "rewards/rejected": 0.27983251214027405, + "step": 10316 + }, + { + "epoch": 1.67, + "learning_rate": 6.784408950682547e-07, + "logits/chosen": -1.0290714502334595, + "logits/rejected": -1.0290714502334595, + "logps/chosen": -89.20005798339844, + "logps/rejected": -89.20005798339844, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7331284284591675, + "rewards/margins": 0.0, + "rewards/rejected": 1.7331284284591675, + "step": 10317 + }, + { + "epoch": 1.67, + "learning_rate": 6.777800320113642e-07, + "logits/chosen": -1.0085901021957397, + "logits/rejected": -0.9834259152412415, + "logps/chosen": -60.149803161621094, + "logps/rejected": -90.22407531738281, + "loss": 0.5227, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3088555335998535, + "rewards/margins": -0.1207726001739502, + "rewards/rejected": 3.4296281337738037, + "step": 10318 + }, + { + "epoch": 1.67, + "learning_rate": 6.771194675789771e-07, + "logits/chosen": -1.1839500665664673, + "logits/rejected": -1.1839500665664673, + "logps/chosen": -70.2717056274414, + "logps/rejected": -70.2717056274414, + "loss": 0.6145, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.435701847076416, + "rewards/margins": 0.0, + "rewards/rejected": 2.435701847076416, + "step": 10319 + }, + { + "epoch": 1.68, + "learning_rate": 6.764592018167298e-07, + "logits/chosen": -1.0974780321121216, + "logits/rejected": -1.0113475322723389, + "logps/chosen": -50.13651657104492, + "logps/rejected": -39.219078063964844, + "loss": 0.2639, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.527332067489624, + "rewards/margins": 1.0827503204345703, + "rewards/rejected": 2.4445817470550537, + "step": 10320 + }, + { + "epoch": 1.68, + "learning_rate": 6.757992347702435e-07, + "logits/chosen": -1.5198301076889038, + "logits/rejected": -1.497578501701355, + "logps/chosen": -93.420166015625, + "logps/rejected": -99.52935028076172, + "loss": 0.2229, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.869342088699341, + "rewards/margins": 0.8070931434631348, + "rewards/rejected": 2.062248945236206, + "step": 10321 + }, + { + "epoch": 1.68, + "learning_rate": 6.751395664851135e-07, + "logits/chosen": -1.1043342351913452, + "logits/rejected": -1.1053820848464966, + "logps/chosen": -40.67487335205078, + "logps/rejected": -77.15676879882812, + "loss": 1.1251, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3379395008087158, + "rewards/margins": -1.6265244483947754, + "rewards/rejected": 2.964463949203491, + "step": 10322 + }, + { + "epoch": 1.68, + "learning_rate": 6.744801970069176e-07, + "logits/chosen": -1.3045231103897095, + "logits/rejected": -1.3157609701156616, + "logps/chosen": -93.68920135498047, + "logps/rejected": -120.0892562866211, + "loss": 1.2786, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9249428510665894, + "rewards/margins": 0.38829660415649414, + "rewards/rejected": 1.5366462469100952, + "step": 10323 + }, + { + "epoch": 1.68, + "learning_rate": 6.738211263812111e-07, + "logits/chosen": -1.0477445125579834, + "logits/rejected": -1.0403969287872314, + "logps/chosen": -62.89132308959961, + "logps/rejected": -13.89819049835205, + "loss": 0.3127, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0716999769210815, + "rewards/margins": 0.2157248854637146, + "rewards/rejected": 0.8559750914573669, + "step": 10324 + }, + { + "epoch": 1.68, + "learning_rate": 6.731623546535304e-07, + "logits/chosen": -1.09196937084198, + "logits/rejected": -1.139100193977356, + "logps/chosen": -81.5030517578125, + "logps/rejected": -114.3323974609375, + "loss": 0.6987, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.598239898681641, + "rewards/margins": -1.1102838516235352, + "rewards/rejected": 5.708523750305176, + "step": 10325 + }, + { + "epoch": 1.68, + "learning_rate": 6.725038818693897e-07, + "logits/chosen": -0.9451826810836792, + "logits/rejected": -0.9367022514343262, + "logps/chosen": -1.270987868309021, + "logps/rejected": -5.399672508239746, + "loss": 0.4651, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2916216552257538, + "rewards/margins": 0.47579193115234375, + "rewards/rejected": -0.18417029082775116, + "step": 10326 + }, + { + "epoch": 1.68, + "learning_rate": 6.718457080742846e-07, + "logits/chosen": -1.44647216796875, + "logits/rejected": -1.526110053062439, + "logps/chosen": -76.02455139160156, + "logps/rejected": -103.01177978515625, + "loss": 3.6725, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.913522481918335, + "rewards/margins": -7.268035888671875, + "rewards/rejected": 10.181558609008789, + "step": 10327 + }, + { + "epoch": 1.68, + "learning_rate": 6.711878333136862e-07, + "logits/chosen": -1.0646319389343262, + "logits/rejected": -1.1087721586227417, + "logps/chosen": -52.93921661376953, + "logps/rejected": -55.24232864379883, + "loss": 0.8672, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1038825511932373, + "rewards/margins": -0.6009693145751953, + "rewards/rejected": 2.7048518657684326, + "step": 10328 + }, + { + "epoch": 1.68, + "learning_rate": 6.705302576330502e-07, + "logits/chosen": -1.3608357906341553, + "logits/rejected": -1.3036627769470215, + "logps/chosen": -46.55457305908203, + "logps/rejected": -36.77340316772461, + "loss": 1.8871, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.091733694076538, + "rewards/margins": -2.3344876766204834, + "rewards/rejected": 4.4262213706970215, + "step": 10329 + }, + { + "epoch": 1.68, + "learning_rate": 6.698729810778065e-07, + "logits/chosen": -1.0691018104553223, + "logits/rejected": -0.9649333357810974, + "logps/chosen": -75.90402221679688, + "logps/rejected": -29.47683334350586, + "loss": 0.2853, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6863784790039062, + "rewards/margins": 0.5719358921051025, + "rewards/rejected": 2.1144425868988037, + "step": 10330 + }, + { + "epoch": 1.68, + "learning_rate": 6.692160036933692e-07, + "logits/chosen": -1.330034613609314, + "logits/rejected": -1.1099932193756104, + "logps/chosen": -120.48019409179688, + "logps/rejected": -25.632612228393555, + "loss": 0.0274, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.4304046630859375, + "rewards/margins": 6.160305976867676, + "rewards/rejected": 0.27009889483451843, + "step": 10331 + }, + { + "epoch": 1.68, + "learning_rate": 6.685593255251266e-07, + "logits/chosen": -1.2856290340423584, + "logits/rejected": -1.254986047744751, + "logps/chosen": -114.07244110107422, + "logps/rejected": -57.255393981933594, + "loss": 0.2358, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.346340179443359, + "rewards/margins": 0.5096173286437988, + "rewards/rejected": 5.8367228507995605, + "step": 10332 + }, + { + "epoch": 1.68, + "learning_rate": 6.679029466184506e-07, + "logits/chosen": -1.1793363094329834, + "logits/rejected": -1.0813850164413452, + "logps/chosen": -58.00399398803711, + "logps/rejected": -21.526426315307617, + "loss": 0.3054, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7627629041671753, + "rewards/margins": 0.19923460483551025, + "rewards/rejected": 1.563528299331665, + "step": 10333 + }, + { + "epoch": 1.68, + "learning_rate": 6.672468670186899e-07, + "logits/chosen": -0.9557759165763855, + "logits/rejected": -0.9411736130714417, + "logps/chosen": -72.26944732666016, + "logps/rejected": -60.550926208496094, + "loss": 0.5456, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.737067461013794, + "rewards/margins": 0.08062899112701416, + "rewards/rejected": 1.6564384698867798, + "step": 10334 + }, + { + "epoch": 1.68, + "learning_rate": 6.665910867711744e-07, + "logits/chosen": -1.2791545391082764, + "logits/rejected": -1.2354230880737305, + "logps/chosen": -68.94957733154297, + "logps/rejected": -32.634605407714844, + "loss": 0.1344, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3840720653533936, + "rewards/margins": 1.377140760421753, + "rewards/rejected": 1.0069313049316406, + "step": 10335 + }, + { + "epoch": 1.68, + "learning_rate": 6.659356059212107e-07, + "logits/chosen": -1.083541750907898, + "logits/rejected": -0.9498478770256042, + "logps/chosen": -123.62460327148438, + "logps/rejected": -53.366180419921875, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.2349090576171875, + "rewards/margins": 3.5717086791992188, + "rewards/rejected": 3.6632003784179688, + "step": 10336 + }, + { + "epoch": 1.68, + "learning_rate": 6.652804245140876e-07, + "logits/chosen": -1.3114584684371948, + "logits/rejected": -1.2820712327957153, + "logps/chosen": -24.41246223449707, + "logps/rejected": -61.32950973510742, + "loss": 0.5009, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.045396327972412, + "rewards/margins": -0.49065303802490234, + "rewards/rejected": 2.5360493659973145, + "step": 10337 + }, + { + "epoch": 1.68, + "learning_rate": 6.646255425950726e-07, + "logits/chosen": -1.0258488655090332, + "logits/rejected": -0.9817736744880676, + "logps/chosen": -114.44644165039062, + "logps/rejected": -58.13091278076172, + "loss": 0.0822, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8186631202697754, + "rewards/margins": 1.8784188032150269, + "rewards/rejected": 1.9402443170547485, + "step": 10338 + }, + { + "epoch": 1.68, + "learning_rate": 6.639709602094102e-07, + "logits/chosen": -1.4840686321258545, + "logits/rejected": -1.3726762533187866, + "logps/chosen": -79.92948913574219, + "logps/rejected": -72.95686340332031, + "loss": 1.3695, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1475937366485596, + "rewards/margins": 0.44997715950012207, + "rewards/rejected": 2.6976165771484375, + "step": 10339 + }, + { + "epoch": 1.68, + "learning_rate": 6.63316677402327e-07, + "logits/chosen": -1.4764854907989502, + "logits/rejected": -1.4764854907989502, + "logps/chosen": -55.77458953857422, + "logps/rejected": -55.77458953857422, + "loss": 0.37, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.502455949783325, + "rewards/margins": 0.0, + "rewards/rejected": 2.502455949783325, + "step": 10340 + }, + { + "epoch": 1.68, + "learning_rate": 6.626626942190267e-07, + "logits/chosen": -1.3813228607177734, + "logits/rejected": -1.3997511863708496, + "logps/chosen": -159.54925537109375, + "logps/rejected": -176.45223999023438, + "loss": 0.9245, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.3132476806640625, + "rewards/margins": -1.6744756698608398, + "rewards/rejected": 8.987723350524902, + "step": 10341 + }, + { + "epoch": 1.68, + "learning_rate": 6.62009010704695e-07, + "logits/chosen": -1.3122695684432983, + "logits/rejected": -1.0794482231140137, + "logps/chosen": -103.17108154296875, + "logps/rejected": -55.010986328125, + "loss": 0.1094, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.715643405914307, + "rewards/margins": 1.4316177368164062, + "rewards/rejected": 3.2840256690979004, + "step": 10342 + }, + { + "epoch": 1.68, + "learning_rate": 6.613556269044929e-07, + "logits/chosen": -1.165406584739685, + "logits/rejected": -1.039172887802124, + "logps/chosen": -106.85218811035156, + "logps/rejected": -13.593982696533203, + "loss": 3.1918, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.748792290687561, + "rewards/margins": 0.9504528045654297, + "rewards/rejected": 0.7983394861221313, + "step": 10343 + }, + { + "epoch": 1.68, + "learning_rate": 6.607025428635655e-07, + "logits/chosen": -1.4801652431488037, + "logits/rejected": -1.3527436256408691, + "logps/chosen": -88.93456268310547, + "logps/rejected": -101.72770690917969, + "loss": 0.2536, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.652089595794678, + "rewards/margins": 3.491809844970703, + "rewards/rejected": 4.160279750823975, + "step": 10344 + }, + { + "epoch": 1.68, + "learning_rate": 6.600497586270327e-07, + "logits/chosen": -1.4291355609893799, + "logits/rejected": -1.4123430252075195, + "logps/chosen": -47.65253829956055, + "logps/rejected": -139.42269897460938, + "loss": 0.7371, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9212673902511597, + "rewards/margins": -0.993132472038269, + "rewards/rejected": 2.9143998622894287, + "step": 10345 + }, + { + "epoch": 1.68, + "learning_rate": 6.593972742399973e-07, + "logits/chosen": -1.320055365562439, + "logits/rejected": -1.2729679346084595, + "logps/chosen": -70.19474792480469, + "logps/rejected": -55.40534210205078, + "loss": 1.0874, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.392746686935425, + "rewards/margins": -2.049453020095825, + "rewards/rejected": 4.44219970703125, + "step": 10346 + }, + { + "epoch": 1.68, + "learning_rate": 6.587450897475384e-07, + "logits/chosen": -1.2307096719741821, + "logits/rejected": -1.1998119354248047, + "logps/chosen": -78.87922668457031, + "logps/rejected": -48.438934326171875, + "loss": 0.4647, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.692317247390747, + "rewards/margins": 0.6062767505645752, + "rewards/rejected": 2.086040496826172, + "step": 10347 + }, + { + "epoch": 1.68, + "learning_rate": 6.580932051947181e-07, + "logits/chosen": -1.0132559537887573, + "logits/rejected": -1.0327417850494385, + "logps/chosen": -75.87373352050781, + "logps/rejected": -93.82308197021484, + "loss": 0.7103, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5352768898010254, + "rewards/margins": -0.9124579429626465, + "rewards/rejected": 4.447734832763672, + "step": 10348 + }, + { + "epoch": 1.68, + "learning_rate": 6.574416206265721e-07, + "logits/chosen": -1.3605799674987793, + "logits/rejected": -1.3457679748535156, + "logps/chosen": -59.32830047607422, + "logps/rejected": -53.03972625732422, + "loss": 0.8137, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1710000038146973, + "rewards/margins": 0.06696867942810059, + "rewards/rejected": 2.1040313243865967, + "step": 10349 + }, + { + "epoch": 1.68, + "learning_rate": 6.567903360881217e-07, + "logits/chosen": -0.9970598220825195, + "logits/rejected": -1.03862726688385, + "logps/chosen": -61.98468780517578, + "logps/rejected": -64.86430358886719, + "loss": 1.3367, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7034873962402344, + "rewards/margins": -0.7824370861053467, + "rewards/rejected": 3.485924482345581, + "step": 10350 + }, + { + "epoch": 1.68, + "learning_rate": 6.561393516243619e-07, + "logits/chosen": -1.2290987968444824, + "logits/rejected": -1.048490285873413, + "logps/chosen": -146.15072631835938, + "logps/rejected": -58.45035171508789, + "loss": 0.016, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.674023628234863, + "rewards/margins": 3.9344921112060547, + "rewards/rejected": 1.739531397819519, + "step": 10351 + }, + { + "epoch": 1.68, + "learning_rate": 6.554886672802719e-07, + "logits/chosen": -0.9490722417831421, + "logits/rejected": -0.9490722417831421, + "logps/chosen": -16.41025733947754, + "logps/rejected": -16.41025733947754, + "loss": 0.7711, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1094706058502197, + "rewards/margins": 0.0, + "rewards/rejected": 3.1094706058502197, + "step": 10352 + }, + { + "epoch": 1.68, + "learning_rate": 6.548382831008054e-07, + "logits/chosen": -1.1850024461746216, + "logits/rejected": -1.1306201219558716, + "logps/chosen": -62.993377685546875, + "logps/rejected": -51.62061309814453, + "loss": 0.1503, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.78055739402771, + "rewards/margins": 1.1597100496292114, + "rewards/rejected": 1.6208473443984985, + "step": 10353 + }, + { + "epoch": 1.68, + "learning_rate": 6.541881991309013e-07, + "logits/chosen": -1.1431465148925781, + "logits/rejected": -1.1745096445083618, + "logps/chosen": -98.38556671142578, + "logps/rejected": -116.73919677734375, + "loss": 0.3242, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6940765380859375, + "rewards/margins": 0.43549346923828125, + "rewards/rejected": 2.2585830688476562, + "step": 10354 + }, + { + "epoch": 1.68, + "learning_rate": 6.535384154154701e-07, + "logits/chosen": -1.331563115119934, + "logits/rejected": -1.331563115119934, + "logps/chosen": -37.72945785522461, + "logps/rejected": -37.72945785522461, + "loss": 0.5261, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.262808799743652, + "rewards/margins": 0.0, + "rewards/rejected": 5.262808799743652, + "step": 10355 + }, + { + "epoch": 1.68, + "learning_rate": 6.528889319994086e-07, + "logits/chosen": -0.9945945143699646, + "logits/rejected": -1.043826699256897, + "logps/chosen": -30.995777130126953, + "logps/rejected": -42.58299255371094, + "loss": 0.7457, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9190692901611328, + "rewards/margins": -1.139730453491211, + "rewards/rejected": 3.0587997436523438, + "step": 10356 + }, + { + "epoch": 1.68, + "learning_rate": 6.522397489275894e-07, + "logits/chosen": -1.2368552684783936, + "logits/rejected": -1.2327009439468384, + "logps/chosen": -20.703962326049805, + "logps/rejected": -71.19851684570312, + "loss": 0.3367, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.723461389541626, + "rewards/margins": 0.4749796390533447, + "rewards/rejected": 1.2484817504882812, + "step": 10357 + }, + { + "epoch": 1.68, + "learning_rate": 6.515908662448645e-07, + "logits/chosen": -1.1188483238220215, + "logits/rejected": -1.0786417722702026, + "logps/chosen": -85.15288543701172, + "logps/rejected": -60.77616500854492, + "loss": 1.2459, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.456154704093933, + "rewards/margins": -1.4885128736495972, + "rewards/rejected": 2.9446675777435303, + "step": 10358 + }, + { + "epoch": 1.68, + "learning_rate": 6.509422839960661e-07, + "logits/chosen": -1.9315931797027588, + "logits/rejected": -1.944666862487793, + "logps/chosen": -199.95416259765625, + "logps/rejected": -99.9209213256836, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.915027141571045, + "rewards/margins": 4.128228187561035, + "rewards/rejected": 3.7867989540100098, + "step": 10359 + }, + { + "epoch": 1.68, + "learning_rate": 6.502940022260041e-07, + "logits/chosen": -1.0627071857452393, + "logits/rejected": -1.1356912851333618, + "logps/chosen": -44.257225036621094, + "logps/rejected": -107.77203369140625, + "loss": 1.9982, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.374758243560791, + "rewards/margins": -0.5184593200683594, + "rewards/rejected": 4.89321756362915, + "step": 10360 + }, + { + "epoch": 1.68, + "learning_rate": 6.496460209794697e-07, + "logits/chosen": -1.4839404821395874, + "logits/rejected": -1.3755069971084595, + "logps/chosen": -56.58534240722656, + "logps/rejected": -10.987451553344727, + "loss": 0.0667, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.031707763671875, + "rewards/margins": 3.041618585586548, + "rewards/rejected": 0.9900892376899719, + "step": 10361 + }, + { + "epoch": 1.68, + "learning_rate": 6.489983403012312e-07, + "logits/chosen": -1.0371792316436768, + "logits/rejected": -1.0656664371490479, + "logps/chosen": -79.92491912841797, + "logps/rejected": -60.587982177734375, + "loss": 0.2863, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.040813446044922, + "rewards/margins": 1.2192528247833252, + "rewards/rejected": 0.8215606808662415, + "step": 10362 + }, + { + "epoch": 1.68, + "learning_rate": 6.483509602360389e-07, + "logits/chosen": -1.3234647512435913, + "logits/rejected": -1.2138116359710693, + "logps/chosen": -62.52052688598633, + "logps/rejected": -51.02896499633789, + "loss": 0.4055, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4886226654052734, + "rewards/margins": 0.07014989852905273, + "rewards/rejected": 2.4184727668762207, + "step": 10363 + }, + { + "epoch": 1.68, + "learning_rate": 6.477038808286185e-07, + "logits/chosen": -1.5239002704620361, + "logits/rejected": -1.4783693552017212, + "logps/chosen": -104.85237121582031, + "logps/rejected": -79.71197509765625, + "loss": 0.2688, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.777461528778076, + "rewards/margins": 0.361968994140625, + "rewards/rejected": 4.415492534637451, + "step": 10364 + }, + { + "epoch": 1.68, + "learning_rate": 6.470571021236794e-07, + "logits/chosen": -1.221613883972168, + "logits/rejected": -1.254694938659668, + "logps/chosen": -78.2781753540039, + "logps/rejected": -72.71636962890625, + "loss": 1.335, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.650325059890747, + "rewards/margins": 0.09484326839447021, + "rewards/rejected": 1.5554817914962769, + "step": 10365 + }, + { + "epoch": 1.68, + "learning_rate": 6.464106241659051e-07, + "logits/chosen": -1.422762155532837, + "logits/rejected": -1.4277435541152954, + "logps/chosen": -48.990196228027344, + "logps/rejected": -61.25828552246094, + "loss": 0.2347, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3896987438201904, + "rewards/margins": 0.5975112915039062, + "rewards/rejected": 2.792187452316284, + "step": 10366 + }, + { + "epoch": 1.68, + "learning_rate": 6.457644469999641e-07, + "logits/chosen": -1.6881622076034546, + "logits/rejected": -1.693400502204895, + "logps/chosen": -126.13892364501953, + "logps/rejected": -169.89297485351562, + "loss": 0.3838, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.45983362197876, + "rewards/margins": -0.07896947860717773, + "rewards/rejected": 7.5388031005859375, + "step": 10367 + }, + { + "epoch": 1.68, + "learning_rate": 6.451185706704983e-07, + "logits/chosen": -1.4305810928344727, + "logits/rejected": -1.4410619735717773, + "logps/chosen": -65.18428039550781, + "logps/rejected": -86.38209533691406, + "loss": 1.4486, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6164894104003906, + "rewards/margins": -2.8352441787719727, + "rewards/rejected": 4.451733589172363, + "step": 10368 + }, + { + "epoch": 1.68, + "learning_rate": 6.44472995222134e-07, + "logits/chosen": -1.4508970975875854, + "logits/rejected": -1.2991448640823364, + "logps/chosen": -60.82524108886719, + "logps/rejected": -20.648529052734375, + "loss": 0.0613, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0836212635040283, + "rewards/margins": 2.0983896255493164, + "rewards/rejected": 0.9852315783500671, + "step": 10369 + }, + { + "epoch": 1.68, + "learning_rate": 6.438277206994726e-07, + "logits/chosen": -1.047790765762329, + "logits/rejected": -1.0666230916976929, + "logps/chosen": -67.35418701171875, + "logps/rejected": -47.993770599365234, + "loss": 0.6683, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.691259741783142, + "rewards/margins": -0.5436452627182007, + "rewards/rejected": 2.2349050045013428, + "step": 10370 + }, + { + "epoch": 1.68, + "learning_rate": 6.431827471470981e-07, + "logits/chosen": -1.3625011444091797, + "logits/rejected": -1.2719361782073975, + "logps/chosen": -40.48004150390625, + "logps/rejected": -38.142642974853516, + "loss": 0.4157, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9334484338760376, + "rewards/margins": -0.16937267780303955, + "rewards/rejected": 2.102821111679077, + "step": 10371 + }, + { + "epoch": 1.68, + "learning_rate": 6.425380746095699e-07, + "logits/chosen": -1.4196308851242065, + "logits/rejected": -1.288515567779541, + "logps/chosen": -56.0296745300293, + "logps/rejected": -49.67338562011719, + "loss": 0.0873, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.853566646575928, + "rewards/margins": 1.7254140377044678, + "rewards/rejected": 3.12815260887146, + "step": 10372 + }, + { + "epoch": 1.68, + "learning_rate": 6.418937031314315e-07, + "logits/chosen": -1.5356963872909546, + "logits/rejected": -1.476742148399353, + "logps/chosen": -130.32095336914062, + "logps/rejected": -72.35906982421875, + "loss": 0.308, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.236572265625, + "rewards/margins": 0.8822097778320312, + "rewards/rejected": 3.3543624877929688, + "step": 10373 + }, + { + "epoch": 1.68, + "learning_rate": 6.412496327571999e-07, + "logits/chosen": -1.4327588081359863, + "logits/rejected": -1.4491158723831177, + "logps/chosen": -49.785396575927734, + "logps/rejected": -72.82662200927734, + "loss": 0.7644, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1411384344100952, + "rewards/margins": -0.19305038452148438, + "rewards/rejected": 1.3341888189315796, + "step": 10374 + }, + { + "epoch": 1.68, + "learning_rate": 6.406058635313772e-07, + "logits/chosen": -1.2224498987197876, + "logits/rejected": -1.2237759828567505, + "logps/chosen": -27.128753662109375, + "logps/rejected": -41.578956604003906, + "loss": 0.3135, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.097929835319519, + "rewards/margins": 0.1404736042022705, + "rewards/rejected": 0.9574562311172485, + "step": 10375 + }, + { + "epoch": 1.68, + "learning_rate": 6.39962395498439e-07, + "logits/chosen": -1.2359721660614014, + "logits/rejected": -1.2686989307403564, + "logps/chosen": -18.194841384887695, + "logps/rejected": -38.265933990478516, + "loss": 0.9207, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0589170455932617, + "rewards/margins": -1.282606840133667, + "rewards/rejected": 2.3415238857269287, + "step": 10376 + }, + { + "epoch": 1.68, + "learning_rate": 6.393192287028461e-07, + "logits/chosen": -1.3492194414138794, + "logits/rejected": -1.349548578262329, + "logps/chosen": -145.72084045410156, + "logps/rejected": -104.0769271850586, + "loss": 0.0972, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.659007549285889, + "rewards/margins": 1.5857858657836914, + "rewards/rejected": 4.073221683502197, + "step": 10377 + }, + { + "epoch": 1.68, + "learning_rate": 6.386763631890313e-07, + "logits/chosen": -1.0147091150283813, + "logits/rejected": -0.9713487029075623, + "logps/chosen": -73.81704711914062, + "logps/rejected": -80.71096801757812, + "loss": 0.527, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2586395740509033, + "rewards/margins": -0.05880892276763916, + "rewards/rejected": 1.3174484968185425, + "step": 10378 + }, + { + "epoch": 1.68, + "learning_rate": 6.380337990014141e-07, + "logits/chosen": -1.3506801128387451, + "logits/rejected": -1.3427082300186157, + "logps/chosen": -36.56295394897461, + "logps/rejected": -44.110355377197266, + "loss": 0.2522, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7903530597686768, + "rewards/margins": 0.6346657276153564, + "rewards/rejected": 2.1556873321533203, + "step": 10379 + }, + { + "epoch": 1.68, + "learning_rate": 6.373915361843868e-07, + "logits/chosen": -0.5908098816871643, + "logits/rejected": -0.5908098816871643, + "logps/chosen": -33.163246154785156, + "logps/rejected": -33.163246154785156, + "loss": 2.5994, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6045178174972534, + "rewards/margins": 0.0, + "rewards/rejected": 1.6045178174972534, + "step": 10380 + }, + { + "epoch": 1.68, + "learning_rate": 6.367495747823265e-07, + "logits/chosen": -1.3983465433120728, + "logits/rejected": -1.3660125732421875, + "logps/chosen": -43.292564392089844, + "logps/rejected": -21.98898696899414, + "loss": 1.7968, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.018148899078369, + "rewards/margins": -0.40871191024780273, + "rewards/rejected": 2.426860809326172, + "step": 10381 + }, + { + "epoch": 1.69, + "learning_rate": 6.361079148395838e-07, + "logits/chosen": -0.9661719799041748, + "logits/rejected": -0.9478527903556824, + "logps/chosen": -62.59090805053711, + "logps/rejected": -57.43020248413086, + "loss": 0.3934, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4125843048095703, + "rewards/margins": 0.8143169283866882, + "rewards/rejected": 0.5982673764228821, + "step": 10382 + }, + { + "epoch": 1.69, + "learning_rate": 6.354665564004936e-07, + "logits/chosen": -0.8147918581962585, + "logits/rejected": -0.8112576007843018, + "logps/chosen": -3.1908304691314697, + "logps/rejected": -27.364871978759766, + "loss": 0.8163, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.35574230551719666, + "rewards/margins": -0.09297004342079163, + "rewards/rejected": 0.4487123489379883, + "step": 10383 + }, + { + "epoch": 1.69, + "learning_rate": 6.348254995093656e-07, + "logits/chosen": -1.1855579614639282, + "logits/rejected": -1.176903486251831, + "logps/chosen": -160.84429931640625, + "logps/rejected": -55.602638244628906, + "loss": 0.1432, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.725142002105713, + "rewards/margins": 1.401827335357666, + "rewards/rejected": 1.3233146667480469, + "step": 10384 + }, + { + "epoch": 1.69, + "learning_rate": 6.341847442104931e-07, + "logits/chosen": -1.0611461400985718, + "logits/rejected": -0.9489825963973999, + "logps/chosen": -65.67716217041016, + "logps/rejected": -74.34527587890625, + "loss": 0.537, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.968264102935791, + "rewards/margins": -0.48914098739624023, + "rewards/rejected": 3.4574050903320312, + "step": 10385 + }, + { + "epoch": 1.69, + "learning_rate": 6.335442905481442e-07, + "logits/chosen": -1.2187845706939697, + "logits/rejected": -1.2855958938598633, + "logps/chosen": -108.45850372314453, + "logps/rejected": -181.59039306640625, + "loss": 1.1834, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.849391937255859, + "rewards/margins": -2.1070122718811035, + "rewards/rejected": 6.956404209136963, + "step": 10386 + }, + { + "epoch": 1.69, + "learning_rate": 6.329041385665696e-07, + "logits/chosen": -1.448111891746521, + "logits/rejected": -1.4769355058670044, + "logps/chosen": -47.64904022216797, + "logps/rejected": -102.62333679199219, + "loss": 0.8806, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.097046852111816, + "rewards/margins": 0.3394126892089844, + "rewards/rejected": 4.757634162902832, + "step": 10387 + }, + { + "epoch": 1.69, + "learning_rate": 6.322642883099966e-07, + "logits/chosen": -1.2151825428009033, + "logits/rejected": -1.1983755826950073, + "logps/chosen": -50.23485565185547, + "logps/rejected": -59.40718078613281, + "loss": 0.7849, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.178940534591675, + "rewards/margins": -0.6912102699279785, + "rewards/rejected": 3.8701508045196533, + "step": 10388 + }, + { + "epoch": 1.69, + "learning_rate": 6.316247398226343e-07, + "logits/chosen": -1.622976541519165, + "logits/rejected": -1.6302964687347412, + "logps/chosen": -112.80317687988281, + "logps/rejected": -83.26200866699219, + "loss": 0.3039, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.046858310699463, + "rewards/margins": 0.27533721923828125, + "rewards/rejected": 2.7715210914611816, + "step": 10389 + }, + { + "epoch": 1.69, + "learning_rate": 6.309854931486675e-07, + "logits/chosen": -1.6037781238555908, + "logits/rejected": -1.4438965320587158, + "logps/chosen": -98.17682647705078, + "logps/rejected": -14.823625564575195, + "loss": 0.1138, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.0436577796936035, + "rewards/margins": 4.129786014556885, + "rewards/rejected": 0.9138715863227844, + "step": 10390 + }, + { + "epoch": 1.69, + "learning_rate": 6.30346548332263e-07, + "logits/chosen": -1.6871519088745117, + "logits/rejected": -1.5960400104522705, + "logps/chosen": -133.3140411376953, + "logps/rejected": -63.80518341064453, + "loss": 0.1182, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.95608377456665, + "rewards/margins": 2.0488739013671875, + "rewards/rejected": 4.907209873199463, + "step": 10391 + }, + { + "epoch": 1.69, + "learning_rate": 6.29707905417567e-07, + "logits/chosen": -1.4617910385131836, + "logits/rejected": -1.2610808610916138, + "logps/chosen": -124.00483703613281, + "logps/rejected": -40.47039031982422, + "loss": 0.2436, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.695379734039307, + "rewards/margins": 4.033679962158203, + "rewards/rejected": 2.6616997718811035, + "step": 10392 + }, + { + "epoch": 1.69, + "learning_rate": 6.290695644487016e-07, + "logits/chosen": -1.3280763626098633, + "logits/rejected": -1.257470965385437, + "logps/chosen": -48.311920166015625, + "logps/rejected": -47.018287658691406, + "loss": 0.35, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.137790679931641, + "rewards/margins": 1.2484664916992188, + "rewards/rejected": 2.889324188232422, + "step": 10393 + }, + { + "epoch": 1.69, + "learning_rate": 6.284315254697726e-07, + "logits/chosen": -1.3783522844314575, + "logits/rejected": -1.222345232963562, + "logps/chosen": -66.16990661621094, + "logps/rejected": -20.381967544555664, + "loss": 2.9389, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9604880809783936, + "rewards/margins": 2.8813414573669434, + "rewards/rejected": 1.0791467428207397, + "step": 10394 + }, + { + "epoch": 1.69, + "learning_rate": 6.277937885248597e-07, + "logits/chosen": -1.310341715812683, + "logits/rejected": -1.3885829448699951, + "logps/chosen": -60.35166931152344, + "logps/rejected": -88.06690979003906, + "loss": 0.7853, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0789780616760254, + "rewards/margins": -1.3304717540740967, + "rewards/rejected": 3.409449815750122, + "step": 10395 + }, + { + "epoch": 1.69, + "learning_rate": 6.271563536580266e-07, + "logits/chosen": -1.2066456079483032, + "logits/rejected": -1.2159655094146729, + "logps/chosen": -49.2790641784668, + "logps/rejected": -135.03890991210938, + "loss": 0.9536, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0034961700439453, + "rewards/margins": -1.0228252410888672, + "rewards/rejected": 3.0263214111328125, + "step": 10396 + }, + { + "epoch": 1.69, + "learning_rate": 6.265192209133125e-07, + "logits/chosen": -1.0469554662704468, + "logits/rejected": -0.9182712435722351, + "logps/chosen": -78.70355224609375, + "logps/rejected": -47.150142669677734, + "loss": 0.1252, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.302929878234863, + "rewards/margins": 1.3526852130889893, + "rewards/rejected": 3.950244665145874, + "step": 10397 + }, + { + "epoch": 1.69, + "learning_rate": 6.258823903347388e-07, + "logits/chosen": -1.176558017730713, + "logits/rejected": -1.1297812461853027, + "logps/chosen": -76.21194458007812, + "logps/rejected": -91.67923736572266, + "loss": 0.6275, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.155717611312866, + "rewards/margins": 0.7281327247619629, + "rewards/rejected": 2.4275848865509033, + "step": 10398 + }, + { + "epoch": 1.69, + "learning_rate": 6.252458619663032e-07, + "logits/chosen": -0.839055061340332, + "logits/rejected": -0.8388476371765137, + "logps/chosen": -1.2109291553497314, + "logps/rejected": -2.7252213954925537, + "loss": 1.5946, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4923039376735687, + "rewards/margins": -0.1400914490222931, + "rewards/rejected": 0.6323953866958618, + "step": 10399 + }, + { + "epoch": 1.69, + "learning_rate": 6.246096358519848e-07, + "logits/chosen": -1.219377040863037, + "logits/rejected": -1.1656285524368286, + "logps/chosen": -74.15719604492188, + "logps/rejected": -13.571439743041992, + "loss": 0.4657, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.46432265639305115, + "rewards/margins": -0.4179193675518036, + "rewards/rejected": 0.8822420239448547, + "step": 10400 + }, + { + "epoch": 1.69, + "learning_rate": 6.239737120357392e-07, + "logits/chosen": -0.8460121154785156, + "logits/rejected": -0.8469939231872559, + "logps/chosen": -18.563854217529297, + "logps/rejected": -15.485954284667969, + "loss": 1.5508, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.04206428676843643, + "rewards/margins": -0.6336379647254944, + "rewards/rejected": 0.6757022738456726, + "step": 10401 + }, + { + "epoch": 1.69, + "learning_rate": 6.233380905615049e-07, + "logits/chosen": -1.0703903436660767, + "logits/rejected": -0.912916362285614, + "logps/chosen": -57.129066467285156, + "logps/rejected": -14.511757850646973, + "loss": 0.3776, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8198859691619873, + "rewards/margins": 2.7496731281280518, + "rewards/rejected": 1.0702128410339355, + "step": 10402 + }, + { + "epoch": 1.69, + "learning_rate": 6.227027714731948e-07, + "logits/chosen": -1.2346165180206299, + "logits/rejected": -1.1856319904327393, + "logps/chosen": -61.33573913574219, + "logps/rejected": -37.76659393310547, + "loss": 1.4011, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.894024610519409, + "rewards/margins": 0.6067733764648438, + "rewards/rejected": 2.2872512340545654, + "step": 10403 + }, + { + "epoch": 1.69, + "learning_rate": 6.220677548147064e-07, + "logits/chosen": -1.6699912548065186, + "logits/rejected": -1.663428544998169, + "logps/chosen": -47.73305892944336, + "logps/rejected": -55.888336181640625, + "loss": 0.4005, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.363515377044678, + "rewards/margins": 0.7141990661621094, + "rewards/rejected": 4.649316310882568, + "step": 10404 + }, + { + "epoch": 1.69, + "learning_rate": 6.214330406299101e-07, + "logits/chosen": -1.401322841644287, + "logits/rejected": -1.0364691019058228, + "logps/chosen": -71.92085266113281, + "logps/rejected": -46.14164733886719, + "loss": 0.5821, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.780517578125, + "rewards/margins": -0.7803757190704346, + "rewards/rejected": 3.5608932971954346, + "step": 10405 + }, + { + "epoch": 1.69, + "learning_rate": 6.207986289626617e-07, + "logits/chosen": -1.4074366092681885, + "logits/rejected": -1.3534021377563477, + "logps/chosen": -41.46440124511719, + "logps/rejected": -10.771669387817383, + "loss": 0.2599, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5638482570648193, + "rewards/margins": 1.165488600730896, + "rewards/rejected": 1.3983596563339233, + "step": 10406 + }, + { + "epoch": 1.69, + "learning_rate": 6.201645198567907e-07, + "logits/chosen": -0.8702577948570251, + "logits/rejected": -0.8654116988182068, + "logps/chosen": -1.011426329612732, + "logps/rejected": -2.405848264694214, + "loss": 0.9358, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3481921851634979, + "rewards/margins": 0.030053645372390747, + "rewards/rejected": 0.3181385397911072, + "step": 10407 + }, + { + "epoch": 1.69, + "learning_rate": 6.195307133561101e-07, + "logits/chosen": -1.1360135078430176, + "logits/rejected": -1.1476267576217651, + "logps/chosen": -81.90531921386719, + "logps/rejected": -75.31553649902344, + "loss": 1.3895, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1291863918304443, + "rewards/margins": -0.8878364562988281, + "rewards/rejected": 3.0170228481292725, + "step": 10408 + }, + { + "epoch": 1.69, + "learning_rate": 6.188972095044071e-07, + "logits/chosen": -1.3523764610290527, + "logits/rejected": -1.2667409181594849, + "logps/chosen": -214.13140869140625, + "logps/rejected": -87.76260375976562, + "loss": 0.0786, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.518109083175659, + "rewards/margins": 2.380201816558838, + "rewards/rejected": 1.1379073858261108, + "step": 10409 + }, + { + "epoch": 1.69, + "learning_rate": 6.182640083454544e-07, + "logits/chosen": -1.1922920942306519, + "logits/rejected": -1.206737995147705, + "logps/chosen": -74.36809539794922, + "logps/rejected": -110.5286636352539, + "loss": 0.936, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5718865394592285, + "rewards/margins": -1.08864426612854, + "rewards/rejected": 3.6605308055877686, + "step": 10410 + }, + { + "epoch": 1.69, + "learning_rate": 6.17631109922997e-07, + "logits/chosen": -1.3023521900177002, + "logits/rejected": -1.3812649250030518, + "logps/chosen": -98.34529113769531, + "logps/rejected": -148.07455444335938, + "loss": 0.2239, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9600586891174316, + "rewards/margins": 1.0528075695037842, + "rewards/rejected": 2.9072511196136475, + "step": 10411 + }, + { + "epoch": 1.69, + "learning_rate": 6.169985142807644e-07, + "logits/chosen": -1.228023648262024, + "logits/rejected": -1.2298996448516846, + "logps/chosen": -4.828918933868408, + "logps/rejected": -16.33287239074707, + "loss": 0.524, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.32660698890686035, + "rewards/margins": -0.6147563457489014, + "rewards/rejected": 0.9413633346557617, + "step": 10412 + }, + { + "epoch": 1.69, + "learning_rate": 6.163662214624616e-07, + "logits/chosen": -1.1800204515457153, + "logits/rejected": -1.2550572156906128, + "logps/chosen": -181.2896270751953, + "logps/rejected": -138.72879028320312, + "loss": 0.2139, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.214430809020996, + "rewards/margins": 0.6305809020996094, + "rewards/rejected": 8.583849906921387, + "step": 10413 + }, + { + "epoch": 1.69, + "learning_rate": 6.157342315117754e-07, + "logits/chosen": -1.4679458141326904, + "logits/rejected": -1.476304531097412, + "logps/chosen": -71.47815704345703, + "logps/rejected": -78.58909606933594, + "loss": 0.2114, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.249898672103882, + "rewards/margins": 0.9937272071838379, + "rewards/rejected": 2.256171464920044, + "step": 10414 + }, + { + "epoch": 1.69, + "learning_rate": 6.151025444723679e-07, + "logits/chosen": -1.1196675300598145, + "logits/rejected": -1.1281661987304688, + "logps/chosen": -41.48629379272461, + "logps/rejected": -78.1260986328125, + "loss": 1.1754, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.324791431427002, + "rewards/margins": -1.1636722087860107, + "rewards/rejected": 3.4884636402130127, + "step": 10415 + }, + { + "epoch": 1.69, + "learning_rate": 6.144711603878861e-07, + "logits/chosen": -1.2767115831375122, + "logits/rejected": -1.171333909034729, + "logps/chosen": -84.52763366699219, + "logps/rejected": -24.176742553710938, + "loss": 0.0998, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9905281066894531, + "rewards/margins": 1.662173867225647, + "rewards/rejected": 0.32835426926612854, + "step": 10416 + }, + { + "epoch": 1.69, + "learning_rate": 6.138400793019494e-07, + "logits/chosen": -1.3122856616973877, + "logits/rejected": -1.2999184131622314, + "logps/chosen": -53.01593017578125, + "logps/rejected": -47.48634338378906, + "loss": 0.6911, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.234422445297241, + "rewards/margins": -0.9532811641693115, + "rewards/rejected": 4.187703609466553, + "step": 10417 + }, + { + "epoch": 1.69, + "learning_rate": 6.13209301258162e-07, + "logits/chosen": -1.2939561605453491, + "logits/rejected": -1.3664038181304932, + "logps/chosen": -64.64407348632812, + "logps/rejected": -74.49168395996094, + "loss": 1.4409, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.235095262527466, + "rewards/margins": -1.9754221439361572, + "rewards/rejected": 5.210517406463623, + "step": 10418 + }, + { + "epoch": 1.69, + "learning_rate": 6.125788263001025e-07, + "logits/chosen": -1.0259699821472168, + "logits/rejected": -1.018988013267517, + "logps/chosen": -29.592844009399414, + "logps/rejected": -20.930177688598633, + "loss": 0.3631, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8413950204849243, + "rewards/margins": -0.04848480224609375, + "rewards/rejected": 0.8898798227310181, + "step": 10419 + }, + { + "epoch": 1.69, + "learning_rate": 6.119486544713332e-07, + "logits/chosen": -1.6034491062164307, + "logits/rejected": -1.6533328294754028, + "logps/chosen": -115.41522216796875, + "logps/rejected": -107.82322692871094, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.609761238098145, + "rewards/margins": 4.5290117263793945, + "rewards/rejected": 5.08074951171875, + "step": 10420 + }, + { + "epoch": 1.69, + "learning_rate": 6.113187858153907e-07, + "logits/chosen": -1.4313108921051025, + "logits/rejected": -1.3067212104797363, + "logps/chosen": -161.0774688720703, + "logps/rejected": -67.79521179199219, + "loss": 0.119, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.865535259246826, + "rewards/margins": 4.5837931632995605, + "rewards/rejected": 3.2817420959472656, + "step": 10421 + }, + { + "epoch": 1.69, + "learning_rate": 6.106892203757953e-07, + "logits/chosen": -1.2897709608078003, + "logits/rejected": -1.303137183189392, + "logps/chosen": -79.75508880615234, + "logps/rejected": -44.176692962646484, + "loss": 0.1232, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.182929992675781, + "rewards/margins": 2.009437084197998, + "rewards/rejected": 2.173492908477783, + "step": 10422 + }, + { + "epoch": 1.69, + "learning_rate": 6.100599581960414e-07, + "logits/chosen": -1.2252916097640991, + "logits/rejected": -1.2430429458618164, + "logps/chosen": -54.331581115722656, + "logps/rejected": -72.66961669921875, + "loss": 0.346, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.750196099281311, + "rewards/margins": 0.08657145500183105, + "rewards/rejected": 1.66362464427948, + "step": 10423 + }, + { + "epoch": 1.69, + "learning_rate": 6.094309993196074e-07, + "logits/chosen": -1.2173054218292236, + "logits/rejected": -1.2301934957504272, + "logps/chosen": -44.88652038574219, + "logps/rejected": -91.88346099853516, + "loss": 0.6146, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2166775465011597, + "rewards/margins": 0.4721149802207947, + "rewards/rejected": 0.744562566280365, + "step": 10424 + }, + { + "epoch": 1.69, + "learning_rate": 6.088023437899466e-07, + "logits/chosen": -1.1216546297073364, + "logits/rejected": -1.1207129955291748, + "logps/chosen": -116.16927337646484, + "logps/rejected": -53.89259338378906, + "loss": 0.3374, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.536101818084717, + "rewards/margins": 3.1639292240142822, + "rewards/rejected": 2.3721725940704346, + "step": 10425 + }, + { + "epoch": 1.69, + "learning_rate": 6.081739916504948e-07, + "logits/chosen": -1.5368292331695557, + "logits/rejected": -1.5954031944274902, + "logps/chosen": -122.49493408203125, + "logps/rejected": -80.63473510742188, + "loss": 0.2002, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.604666233062744, + "rewards/margins": 0.7190427780151367, + "rewards/rejected": 6.885623455047607, + "step": 10426 + }, + { + "epoch": 1.69, + "learning_rate": 6.075459429446635e-07, + "logits/chosen": -1.382574439048767, + "logits/rejected": -1.2583394050598145, + "logps/chosen": -162.63746643066406, + "logps/rejected": -41.958805084228516, + "loss": 0.1479, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.796473979949951, + "rewards/margins": 4.826346397399902, + "rewards/rejected": 1.9701274633407593, + "step": 10427 + }, + { + "epoch": 1.69, + "learning_rate": 6.069181977158456e-07, + "logits/chosen": -1.145782470703125, + "logits/rejected": -0.7960208058357239, + "logps/chosen": -114.74496459960938, + "logps/rejected": -44.78812789916992, + "loss": 0.2798, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.0763702392578125, + "rewards/margins": 4.6018147468566895, + "rewards/rejected": 0.4745555818080902, + "step": 10428 + }, + { + "epoch": 1.69, + "learning_rate": 6.062907560074133e-07, + "logits/chosen": -1.1834874153137207, + "logits/rejected": -1.055842399597168, + "logps/chosen": -43.01374435424805, + "logps/rejected": -40.994598388671875, + "loss": 0.4319, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9222614765167236, + "rewards/margins": -0.27693605422973633, + "rewards/rejected": 3.19919753074646, + "step": 10429 + }, + { + "epoch": 1.69, + "learning_rate": 6.056636178627157e-07, + "logits/chosen": -1.2052817344665527, + "logits/rejected": -1.1655070781707764, + "logps/chosen": -41.87407302856445, + "logps/rejected": -53.177581787109375, + "loss": 0.7166, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0575242042541504, + "rewards/margins": -1.1028404235839844, + "rewards/rejected": 3.1603646278381348, + "step": 10430 + }, + { + "epoch": 1.69, + "learning_rate": 6.05036783325083e-07, + "logits/chosen": -0.9496351480484009, + "logits/rejected": -0.9496351480484009, + "logps/chosen": -57.7725715637207, + "logps/rejected": -57.7725715637207, + "loss": 0.5118, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5012614727020264, + "rewards/margins": 0.0, + "rewards/rejected": 2.5012614727020264, + "step": 10431 + }, + { + "epoch": 1.69, + "learning_rate": 6.04410252437822e-07, + "logits/chosen": -1.2197574377059937, + "logits/rejected": -1.2656055688858032, + "logps/chosen": -86.08393096923828, + "logps/rejected": -86.34532165527344, + "loss": 1.0901, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.079586029052734, + "rewards/margins": 2.302145481109619, + "rewards/rejected": 1.7774406671524048, + "step": 10432 + }, + { + "epoch": 1.69, + "learning_rate": 6.037840252442223e-07, + "logits/chosen": -1.2802797555923462, + "logits/rejected": -1.4651755094528198, + "logps/chosen": -74.86006927490234, + "logps/rejected": -38.42262268066406, + "loss": 0.9264, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3524529933929443, + "rewards/margins": -1.235743761062622, + "rewards/rejected": 4.588196754455566, + "step": 10433 + }, + { + "epoch": 1.69, + "learning_rate": 6.031581017875482e-07, + "logits/chosen": -1.3889974355697632, + "logits/rejected": -1.3523210287094116, + "logps/chosen": -126.30709838867188, + "logps/rejected": -104.26347351074219, + "loss": 0.4834, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.849465847015381, + "rewards/margins": 2.6603217124938965, + "rewards/rejected": 2.1891441345214844, + "step": 10434 + }, + { + "epoch": 1.69, + "learning_rate": 6.025324821110468e-07, + "logits/chosen": -1.288831353187561, + "logits/rejected": -1.079942226409912, + "logps/chosen": -113.37664031982422, + "logps/rejected": -17.004196166992188, + "loss": 0.265, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.513751983642578, + "rewards/margins": 2.9551680088043213, + "rewards/rejected": 0.5585840344429016, + "step": 10435 + }, + { + "epoch": 1.69, + "learning_rate": 6.019071662579407e-07, + "logits/chosen": -1.3095990419387817, + "logits/rejected": -1.239210844039917, + "logps/chosen": -33.49295425415039, + "logps/rejected": -58.09038543701172, + "loss": 0.6091, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7324390411376953, + "rewards/margins": 0.8055683374404907, + "rewards/rejected": 1.9268707036972046, + "step": 10436 + }, + { + "epoch": 1.69, + "learning_rate": 6.012821542714353e-07, + "logits/chosen": -1.3841418027877808, + "logits/rejected": -1.3739265203475952, + "logps/chosen": -92.42494201660156, + "logps/rejected": -84.81430053710938, + "loss": 0.4865, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2152695655822754, + "rewards/margins": -0.18192124366760254, + "rewards/rejected": 2.397190809249878, + "step": 10437 + }, + { + "epoch": 1.69, + "learning_rate": 6.006574461947107e-07, + "logits/chosen": -1.2908979654312134, + "logits/rejected": -1.3372886180877686, + "logps/chosen": -74.6268310546875, + "logps/rejected": -72.39540100097656, + "loss": 0.6749, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2449951171875, + "rewards/margins": -1.038665771484375, + "rewards/rejected": 3.283660888671875, + "step": 10438 + }, + { + "epoch": 1.69, + "learning_rate": 6.00033042070931e-07, + "logits/chosen": -1.304510235786438, + "logits/rejected": -1.304510235786438, + "logps/chosen": -67.7996826171875, + "logps/rejected": -67.7996826171875, + "loss": 0.4007, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.8699722290039062, + "rewards/margins": 0.0, + "rewards/rejected": 3.8699722290039062, + "step": 10439 + }, + { + "epoch": 1.69, + "learning_rate": 5.994089419432341e-07, + "logits/chosen": -1.5363950729370117, + "logits/rejected": -1.4785887002944946, + "logps/chosen": -89.92745971679688, + "logps/rejected": -31.999753952026367, + "loss": 0.1447, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.011010646820068, + "rewards/margins": 4.470732688903809, + "rewards/rejected": 1.5402777194976807, + "step": 10440 + }, + { + "epoch": 1.69, + "learning_rate": 5.987851458547412e-07, + "logits/chosen": -1.2731012105941772, + "logits/rejected": -1.2619343996047974, + "logps/chosen": -38.997222900390625, + "logps/rejected": -70.76448822021484, + "loss": 1.3927, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.820124864578247, + "rewards/margins": -2.3044822216033936, + "rewards/rejected": 5.124607086181641, + "step": 10441 + }, + { + "epoch": 1.69, + "learning_rate": 5.981616538485496e-07, + "logits/chosen": -0.7608840465545654, + "logits/rejected": -0.7634169459342957, + "logps/chosen": -3.906620979309082, + "logps/rejected": -1.429152011871338, + "loss": 0.834, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.06065402179956436, + "rewards/margins": -0.1499621570110321, + "rewards/rejected": 0.21061618626117706, + "step": 10442 + }, + { + "epoch": 1.7, + "learning_rate": 5.97538465967738e-07, + "logits/chosen": -1.1784058809280396, + "logits/rejected": -1.3066333532333374, + "logps/chosen": -105.11526489257812, + "logps/rejected": -168.4206085205078, + "loss": 1.4961, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.0927581787109375, + "rewards/margins": -2.9403676986694336, + "rewards/rejected": 9.033125877380371, + "step": 10443 + }, + { + "epoch": 1.7, + "learning_rate": 5.969155822553607e-07, + "logits/chosen": -0.8545485734939575, + "logits/rejected": -0.776965320110321, + "logps/chosen": -37.57421875, + "logps/rejected": -16.19474983215332, + "loss": 0.2526, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7082618474960327, + "rewards/margins": 1.0616166591644287, + "rewards/rejected": 0.646645188331604, + "step": 10444 + }, + { + "epoch": 1.7, + "learning_rate": 5.96293002754455e-07, + "logits/chosen": -1.2699787616729736, + "logits/rejected": -1.2221989631652832, + "logps/chosen": -59.26997375488281, + "logps/rejected": -44.922523498535156, + "loss": 0.3651, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.580159902572632, + "rewards/margins": 0.42452239990234375, + "rewards/rejected": 2.155637502670288, + "step": 10445 + }, + { + "epoch": 1.7, + "learning_rate": 5.956707275080342e-07, + "logits/chosen": -1.0950125455856323, + "logits/rejected": -1.092018485069275, + "logps/chosen": -3.697335958480835, + "logps/rejected": -2.7030627727508545, + "loss": 0.6834, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.393291175365448, + "rewards/margins": 0.039420753717422485, + "rewards/rejected": 0.3538704216480255, + "step": 10446 + }, + { + "epoch": 1.7, + "learning_rate": 5.950487565590928e-07, + "logits/chosen": -1.2562283277511597, + "logits/rejected": -1.2585906982421875, + "logps/chosen": -24.92577362060547, + "logps/rejected": -38.29747772216797, + "loss": 0.9212, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7059669494628906, + "rewards/margins": 0.6336028575897217, + "rewards/rejected": 2.072364091873169, + "step": 10447 + }, + { + "epoch": 1.7, + "learning_rate": 5.94427089950601e-07, + "logits/chosen": -1.4003121852874756, + "logits/rejected": -1.2068136930465698, + "logps/chosen": -59.0244140625, + "logps/rejected": -17.672889709472656, + "loss": 0.2167, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.369830369949341, + "rewards/margins": 1.196906328201294, + "rewards/rejected": 1.1729240417480469, + "step": 10448 + }, + { + "epoch": 1.7, + "learning_rate": 5.938057277255127e-07, + "logits/chosen": -1.3503503799438477, + "logits/rejected": -1.2466691732406616, + "logps/chosen": -128.797607421875, + "logps/rejected": -56.98329162597656, + "loss": 0.0428, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.963409423828125, + "rewards/margins": 2.4483025074005127, + "rewards/rejected": 2.5151069164276123, + "step": 10449 + }, + { + "epoch": 1.7, + "learning_rate": 5.931846699267558e-07, + "logits/chosen": -1.383404016494751, + "logits/rejected": -1.389953374862671, + "logps/chosen": -131.33273315429688, + "logps/rejected": -95.18550109863281, + "loss": 0.0847, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.234132289886475, + "rewards/margins": 1.8175857067108154, + "rewards/rejected": 2.416546583175659, + "step": 10450 + }, + { + "epoch": 1.7, + "learning_rate": 5.925639165972414e-07, + "logits/chosen": -1.031307339668274, + "logits/rejected": -0.8718400001525879, + "logps/chosen": -42.96372604370117, + "logps/rejected": -19.553470611572266, + "loss": 0.2138, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9433979988098145, + "rewards/margins": 3.2663605213165283, + "rewards/rejected": 0.6770374178886414, + "step": 10451 + }, + { + "epoch": 1.7, + "learning_rate": 5.919434677798557e-07, + "logits/chosen": -1.1380281448364258, + "logits/rejected": -1.1643576622009277, + "logps/chosen": -59.04323196411133, + "logps/rejected": -58.995357513427734, + "loss": 0.646, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5351712703704834, + "rewards/margins": -0.08709859848022461, + "rewards/rejected": 2.622269868850708, + "step": 10452 + }, + { + "epoch": 1.7, + "learning_rate": 5.913233235174682e-07, + "logits/chosen": -1.2587265968322754, + "logits/rejected": -1.2470120191574097, + "logps/chosen": -108.96003723144531, + "logps/rejected": -50.35218048095703, + "loss": 0.1865, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.618321180343628, + "rewards/margins": 1.1706198453903198, + "rewards/rejected": 1.447701334953308, + "step": 10453 + }, + { + "epoch": 1.7, + "learning_rate": 5.907034838529224e-07, + "logits/chosen": -1.4927228689193726, + "logits/rejected": -1.5008031129837036, + "logps/chosen": -110.31153106689453, + "logps/rejected": -83.2230224609375, + "loss": 0.2085, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.815523624420166, + "rewards/margins": 0.6592049598693848, + "rewards/rejected": 3.1563186645507812, + "step": 10454 + }, + { + "epoch": 1.7, + "learning_rate": 5.900839488290461e-07, + "logits/chosen": -0.9322944283485413, + "logits/rejected": -0.9146580100059509, + "logps/chosen": -19.56774139404297, + "logps/rejected": -13.811590194702148, + "loss": 0.6329, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.33492833375930786, + "rewards/margins": -0.03489035367965698, + "rewards/rejected": 0.36981868743896484, + "step": 10455 + }, + { + "epoch": 1.7, + "learning_rate": 5.894647184886415e-07, + "logits/chosen": -1.0979353189468384, + "logits/rejected": -1.0299949645996094, + "logps/chosen": -26.07265281677246, + "logps/rejected": -10.009208679199219, + "loss": 0.2797, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.174631357192993, + "rewards/margins": 0.8001295328140259, + "rewards/rejected": 1.3745018243789673, + "step": 10456 + }, + { + "epoch": 1.7, + "learning_rate": 5.888457928744912e-07, + "logits/chosen": -1.3432157039642334, + "logits/rejected": -1.2633405923843384, + "logps/chosen": -119.17927551269531, + "logps/rejected": -98.18234252929688, + "loss": 0.1972, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.08203125, + "rewards/margins": 2.506204128265381, + "rewards/rejected": 3.575827121734619, + "step": 10457 + }, + { + "epoch": 1.7, + "learning_rate": 5.882271720293591e-07, + "logits/chosen": -1.2550493478775024, + "logits/rejected": -1.1362203359603882, + "logps/chosen": -146.85169982910156, + "logps/rejected": -74.50407409667969, + "loss": 0.131, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.5899338722229, + "rewards/margins": 2.232067823410034, + "rewards/rejected": 3.357866048812866, + "step": 10458 + }, + { + "epoch": 1.7, + "learning_rate": 5.876088559959836e-07, + "logits/chosen": -1.2270851135253906, + "logits/rejected": -1.2302428483963013, + "logps/chosen": -52.10749053955078, + "logps/rejected": -46.91942596435547, + "loss": 0.4169, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.861309051513672, + "rewards/margins": 0.21222519874572754, + "rewards/rejected": 2.6490838527679443, + "step": 10459 + }, + { + "epoch": 1.7, + "learning_rate": 5.869908448170869e-07, + "logits/chosen": -0.8730262517929077, + "logits/rejected": -0.8730262517929077, + "logps/chosen": -35.6534423828125, + "logps/rejected": -35.6534423828125, + "loss": 0.4602, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8379127383232117, + "rewards/margins": 0.0, + "rewards/rejected": 0.8379127383232117, + "step": 10460 + }, + { + "epoch": 1.7, + "learning_rate": 5.863731385353655e-07, + "logits/chosen": -1.3792482614517212, + "logits/rejected": -1.3027013540267944, + "logps/chosen": -86.01214599609375, + "logps/rejected": -53.852664947509766, + "loss": 0.7912, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7707855701446533, + "rewards/margins": 0.7547085285186768, + "rewards/rejected": 2.0160770416259766, + "step": 10461 + }, + { + "epoch": 1.7, + "learning_rate": 5.857557371934991e-07, + "logits/chosen": -1.0351364612579346, + "logits/rejected": -0.9625774025917053, + "logps/chosen": -127.71159362792969, + "logps/rejected": -41.76918411254883, + "loss": 0.0818, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.070094347000122, + "rewards/margins": 1.9976764917373657, + "rewards/rejected": 1.0724178552627563, + "step": 10462 + }, + { + "epoch": 1.7, + "learning_rate": 5.85138640834142e-07, + "logits/chosen": -0.8547762036323547, + "logits/rejected": -0.7886383533477783, + "logps/chosen": -39.74312973022461, + "logps/rejected": -1.4506829977035522, + "loss": 0.3937, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5924102663993835, + "rewards/margins": 0.20193007588386536, + "rewards/rejected": 0.3904801905155182, + "step": 10463 + }, + { + "epoch": 1.7, + "learning_rate": 5.845218494999322e-07, + "logits/chosen": -1.429046869277954, + "logits/rejected": -1.4545994997024536, + "logps/chosen": -75.2448959350586, + "logps/rejected": -77.96455383300781, + "loss": 0.5473, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5691001415252686, + "rewards/margins": 0.9640662670135498, + "rewards/rejected": 1.6050338745117188, + "step": 10464 + }, + { + "epoch": 1.7, + "learning_rate": 5.839053632334818e-07, + "logits/chosen": -1.086419939994812, + "logits/rejected": -0.9381452202796936, + "logps/chosen": -62.03203582763672, + "logps/rejected": -15.933245658874512, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.446274518966675, + "rewards/margins": 3.5895349979400635, + "rewards/rejected": -0.14326058328151703, + "step": 10465 + }, + { + "epoch": 1.7, + "learning_rate": 5.832891820773868e-07, + "logits/chosen": -1.5563344955444336, + "logits/rejected": -1.5170512199401855, + "logps/chosen": -93.64741516113281, + "logps/rejected": -117.10374450683594, + "loss": 1.1636, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.480937480926514, + "rewards/margins": -1.5178771018981934, + "rewards/rejected": 7.998814582824707, + "step": 10466 + }, + { + "epoch": 1.7, + "learning_rate": 5.826733060742168e-07, + "logits/chosen": -1.248368501663208, + "logits/rejected": -1.2156575918197632, + "logps/chosen": -60.90736389160156, + "logps/rejected": -58.312538146972656, + "loss": 0.4723, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8430252075195312, + "rewards/margins": 0.6260092258453369, + "rewards/rejected": 2.2170159816741943, + "step": 10467 + }, + { + "epoch": 1.7, + "learning_rate": 5.820577352665252e-07, + "logits/chosen": -1.1202423572540283, + "logits/rejected": -1.1629531383514404, + "logps/chosen": -34.21522521972656, + "logps/rejected": -34.622745513916016, + "loss": 1.2678, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5372909307479858, + "rewards/margins": -0.9234501123428345, + "rewards/rejected": 2.4607410430908203, + "step": 10468 + }, + { + "epoch": 1.7, + "learning_rate": 5.814424696968402e-07, + "logits/chosen": -1.0419921875, + "logits/rejected": -1.0419921875, + "logps/chosen": -18.428550720214844, + "logps/rejected": -18.428550720214844, + "loss": 1.0626, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6551144123077393, + "rewards/margins": 0.0, + "rewards/rejected": 2.6551144123077393, + "step": 10469 + }, + { + "epoch": 1.7, + "learning_rate": 5.808275094076727e-07, + "logits/chosen": -1.1736717224121094, + "logits/rejected": -1.2520424127578735, + "logps/chosen": -56.55657196044922, + "logps/rejected": -130.94664001464844, + "loss": 0.9951, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6749961376190186, + "rewards/margins": -1.638866662979126, + "rewards/rejected": 5.3138628005981445, + "step": 10470 + }, + { + "epoch": 1.7, + "learning_rate": 5.802128544415081e-07, + "logits/chosen": -0.9393824934959412, + "logits/rejected": -0.8998737931251526, + "logps/chosen": -72.95378112792969, + "logps/rejected": -40.07361602783203, + "loss": 0.0403, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.000860691070557, + "rewards/margins": 2.604604482650757, + "rewards/rejected": 3.3962562084198, + "step": 10471 + }, + { + "epoch": 1.7, + "learning_rate": 5.795985048408165e-07, + "logits/chosen": -1.4495887756347656, + "logits/rejected": -1.54726243019104, + "logps/chosen": -208.08767700195312, + "logps/rejected": -121.09709167480469, + "loss": 0.4097, + "rewards/accuracies": 0.0, + "rewards/chosen": 10.404044151306152, + "rewards/margins": -0.2367868423461914, + "rewards/rejected": 10.640830993652344, + "step": 10472 + }, + { + "epoch": 1.7, + "learning_rate": 5.78984460648041e-07, + "logits/chosen": -1.043230652809143, + "logits/rejected": -1.0563455820083618, + "logps/chosen": -58.27301788330078, + "logps/rejected": -60.16400146484375, + "loss": 1.1745, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.499945878982544, + "rewards/margins": 0.01233673095703125, + "rewards/rejected": 2.4876091480255127, + "step": 10473 + }, + { + "epoch": 1.7, + "learning_rate": 5.783707219056078e-07, + "logits/chosen": -1.1757335662841797, + "logits/rejected": -1.1327451467514038, + "logps/chosen": -91.52690887451172, + "logps/rejected": -50.503990173339844, + "loss": 0.2787, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.226656436920166, + "rewards/margins": 0.2981276512145996, + "rewards/rejected": 4.928528785705566, + "step": 10474 + }, + { + "epoch": 1.7, + "learning_rate": 5.777572886559196e-07, + "logits/chosen": -1.4167512655258179, + "logits/rejected": -1.2511801719665527, + "logps/chosen": -78.67343139648438, + "logps/rejected": -25.936687469482422, + "loss": 0.1511, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5946266651153564, + "rewards/margins": 1.1287349462509155, + "rewards/rejected": 1.465891718864441, + "step": 10475 + }, + { + "epoch": 1.7, + "learning_rate": 5.771441609413597e-07, + "logits/chosen": -1.4710832834243774, + "logits/rejected": -1.273045301437378, + "logps/chosen": -82.28462219238281, + "logps/rejected": -27.164657592773438, + "loss": 0.3843, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.61687171459198, + "rewards/margins": 0.6239284873008728, + "rewards/rejected": 0.9929432272911072, + "step": 10476 + }, + { + "epoch": 1.7, + "learning_rate": 5.765313388042876e-07, + "logits/chosen": -0.9466373324394226, + "logits/rejected": -0.9466373324394226, + "logps/chosen": -26.68966293334961, + "logps/rejected": -26.68966293334961, + "loss": 0.3509, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.080312728881836, + "rewards/margins": 0.0, + "rewards/rejected": 1.080312728881836, + "step": 10477 + }, + { + "epoch": 1.7, + "learning_rate": 5.75918822287046e-07, + "logits/chosen": -1.3484315872192383, + "logits/rejected": -1.1827428340911865, + "logps/chosen": -117.60165405273438, + "logps/rejected": -57.112060546875, + "loss": 0.023, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.4550018310546875, + "rewards/margins": 5.614566802978516, + "rewards/rejected": 1.8404350280761719, + "step": 10478 + }, + { + "epoch": 1.7, + "learning_rate": 5.753066114319516e-07, + "logits/chosen": -1.3201276063919067, + "logits/rejected": -1.3860974311828613, + "logps/chosen": -105.46488952636719, + "logps/rejected": -93.3029556274414, + "loss": 0.2565, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6997604370117188, + "rewards/margins": 0.4243757724761963, + "rewards/rejected": 3.2753846645355225, + "step": 10479 + }, + { + "epoch": 1.7, + "learning_rate": 5.746947062813047e-07, + "logits/chosen": -1.6864416599273682, + "logits/rejected": -1.7161678075790405, + "logps/chosen": -94.63428497314453, + "logps/rejected": -36.81585693359375, + "loss": 0.0727, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7462639808654785, + "rewards/margins": 2.5335216522216797, + "rewards/rejected": 0.21274223923683167, + "step": 10480 + }, + { + "epoch": 1.7, + "learning_rate": 5.7408310687738e-07, + "logits/chosen": -1.466981053352356, + "logits/rejected": -1.316269874572754, + "logps/chosen": -85.65162658691406, + "logps/rejected": -38.14606475830078, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.582884788513184, + "rewards/margins": 3.496211528778076, + "rewards/rejected": 5.086673259735107, + "step": 10481 + }, + { + "epoch": 1.7, + "learning_rate": 5.734718132624351e-07, + "logits/chosen": -1.409031867980957, + "logits/rejected": -1.2872658967971802, + "logps/chosen": -103.02206420898438, + "logps/rejected": -95.65460968017578, + "loss": 0.4847, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.782989501953125, + "rewards/margins": -0.4745965003967285, + "rewards/rejected": 3.2575860023498535, + "step": 10482 + }, + { + "epoch": 1.7, + "learning_rate": 5.728608254787027e-07, + "logits/chosen": -1.2524867057800293, + "logits/rejected": -1.1863107681274414, + "logps/chosen": -70.07839965820312, + "logps/rejected": -42.25053405761719, + "loss": 0.5102, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3889999389648438, + "rewards/margins": -0.5453133583068848, + "rewards/rejected": 3.9343132972717285, + "step": 10483 + }, + { + "epoch": 1.7, + "learning_rate": 5.722501435683969e-07, + "logits/chosen": -1.5532158613204956, + "logits/rejected": -1.5630086660385132, + "logps/chosen": -82.58260345458984, + "logps/rejected": -58.44856643676758, + "loss": 0.1274, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.019028425216675, + "rewards/margins": 1.2988994121551514, + "rewards/rejected": 1.7201290130615234, + "step": 10484 + }, + { + "epoch": 1.7, + "learning_rate": 5.716397675737117e-07, + "logits/chosen": -1.4682142734527588, + "logits/rejected": -1.4177250862121582, + "logps/chosen": -73.72526550292969, + "logps/rejected": -41.1125373840332, + "loss": 0.1934, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.114501953125, + "rewards/margins": 0.774639368057251, + "rewards/rejected": 2.339862585067749, + "step": 10485 + }, + { + "epoch": 1.7, + "learning_rate": 5.710296975368163e-07, + "logits/chosen": -1.158138394355774, + "logits/rejected": -1.13178288936615, + "logps/chosen": -131.7568359375, + "logps/rejected": -107.35694122314453, + "loss": 0.7268, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.967568874359131, + "rewards/margins": 1.3028051853179932, + "rewards/rejected": 3.6647636890411377, + "step": 10486 + }, + { + "epoch": 1.7, + "learning_rate": 5.704199334998623e-07, + "logits/chosen": -1.504939079284668, + "logits/rejected": -1.478253722190857, + "logps/chosen": -92.56281280517578, + "logps/rejected": -119.19300842285156, + "loss": 0.9592, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.602974891662598, + "rewards/margins": -1.7067341804504395, + "rewards/rejected": 7.309709072113037, + "step": 10487 + }, + { + "epoch": 1.7, + "learning_rate": 5.698104755049766e-07, + "logits/chosen": -1.1690647602081299, + "logits/rejected": -1.2099273204803467, + "logps/chosen": -79.33992767333984, + "logps/rejected": -118.23980712890625, + "loss": 1.8218, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.678455352783203, + "rewards/margins": 0.1105492115020752, + "rewards/rejected": 2.567906141281128, + "step": 10488 + }, + { + "epoch": 1.7, + "learning_rate": 5.692013235942694e-07, + "logits/chosen": -1.4979673624038696, + "logits/rejected": -1.4979673624038696, + "logps/chosen": -96.69947814941406, + "logps/rejected": -96.69947814941406, + "loss": 0.3851, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2428536415100098, + "rewards/margins": 0.0, + "rewards/rejected": 2.2428536415100098, + "step": 10489 + }, + { + "epoch": 1.7, + "learning_rate": 5.685924778098256e-07, + "logits/chosen": -1.0706446170806885, + "logits/rejected": -1.0991204977035522, + "logps/chosen": -4.371218681335449, + "logps/rejected": -15.843273162841797, + "loss": 1.1296, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4681411683559418, + "rewards/margins": -0.6067577600479126, + "rewards/rejected": 1.0748989582061768, + "step": 10490 + }, + { + "epoch": 1.7, + "learning_rate": 5.679839381937113e-07, + "logits/chosen": -1.3533823490142822, + "logits/rejected": -1.3615570068359375, + "logps/chosen": -43.49803924560547, + "logps/rejected": -107.61064910888672, + "loss": 0.4746, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6978119611740112, + "rewards/margins": -0.42973101139068604, + "rewards/rejected": 2.1275429725646973, + "step": 10491 + }, + { + "epoch": 1.7, + "learning_rate": 5.673757047879702e-07, + "logits/chosen": -1.6296181678771973, + "logits/rejected": -1.5817495584487915, + "logps/chosen": -41.65037155151367, + "logps/rejected": -31.792701721191406, + "loss": 0.1859, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5945274829864502, + "rewards/margins": 2.1443769931793213, + "rewards/rejected": -0.5498495101928711, + "step": 10492 + }, + { + "epoch": 1.7, + "learning_rate": 5.66767777634627e-07, + "logits/chosen": -1.3147011995315552, + "logits/rejected": -1.3300718069076538, + "logps/chosen": -88.58258819580078, + "logps/rejected": -57.3939323425293, + "loss": 0.3675, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8504281044006348, + "rewards/margins": 1.5295475721359253, + "rewards/rejected": 1.3208805322647095, + "step": 10493 + }, + { + "epoch": 1.7, + "learning_rate": 5.661601567756819e-07, + "logits/chosen": -1.4114949703216553, + "logits/rejected": -1.2062039375305176, + "logps/chosen": -76.77881622314453, + "logps/rejected": -6.951842784881592, + "loss": 0.1474, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.270256757736206, + "rewards/margins": 2.399207592010498, + "rewards/rejected": 0.8710491061210632, + "step": 10494 + }, + { + "epoch": 1.7, + "learning_rate": 5.655528422531175e-07, + "logits/chosen": -1.3324227333068848, + "logits/rejected": -1.2888989448547363, + "logps/chosen": -39.69696044921875, + "logps/rejected": -46.31014633178711, + "loss": 0.5434, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.436513662338257, + "rewards/margins": 0.06479549407958984, + "rewards/rejected": 2.371718168258667, + "step": 10495 + }, + { + "epoch": 1.7, + "learning_rate": 5.649458341088915e-07, + "logits/chosen": -1.4400173425674438, + "logits/rejected": -1.4957072734832764, + "logps/chosen": -69.02615356445312, + "logps/rejected": -110.31402587890625, + "loss": 1.5887, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.399571895599365, + "rewards/margins": -3.0268197059631348, + "rewards/rejected": 8.4263916015625, + "step": 10496 + }, + { + "epoch": 1.7, + "learning_rate": 5.643391323849445e-07, + "logits/chosen": -1.1975719928741455, + "logits/rejected": -1.2338860034942627, + "logps/chosen": -69.18479919433594, + "logps/rejected": -112.77542114257812, + "loss": 0.8548, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.300502061843872, + "rewards/margins": -1.381866693496704, + "rewards/rejected": 4.682368755340576, + "step": 10497 + }, + { + "epoch": 1.7, + "learning_rate": 5.637327371231921e-07, + "logits/chosen": -1.4734070301055908, + "logits/rejected": -1.5343271493911743, + "logps/chosen": -85.04576873779297, + "logps/rejected": -112.06265258789062, + "loss": 0.0814, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.749027252197266, + "rewards/margins": 1.9155797958374023, + "rewards/rejected": 5.833447456359863, + "step": 10498 + }, + { + "epoch": 1.7, + "learning_rate": 5.631266483655317e-07, + "logits/chosen": -1.6260782480239868, + "logits/rejected": -1.5619879961013794, + "logps/chosen": -108.31451416015625, + "logps/rejected": -53.84243392944336, + "loss": 0.1941, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.246917724609375, + "rewards/margins": 0.8334553241729736, + "rewards/rejected": 2.4134624004364014, + "step": 10499 + }, + { + "epoch": 1.7, + "learning_rate": 5.625208661538373e-07, + "logits/chosen": -0.7115759253501892, + "logits/rejected": -0.70978844165802, + "logps/chosen": -9.957590103149414, + "logps/rejected": -4.222603797912598, + "loss": 1.1542, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.1871662139892578, + "rewards/margins": -0.02492547035217285, + "rewards/rejected": 0.21209168434143066, + "step": 10500 + }, + { + "epoch": 1.7, + "learning_rate": 5.619153905299641e-07, + "logits/chosen": -1.2531157732009888, + "logits/rejected": -1.2654740810394287, + "logps/chosen": -26.436906814575195, + "logps/rejected": -43.740726470947266, + "loss": 0.1571, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5788660049438477, + "rewards/margins": 1.1837705373764038, + "rewards/rejected": 1.3950954675674438, + "step": 10501 + }, + { + "epoch": 1.7, + "learning_rate": 5.613102215357425e-07, + "logits/chosen": -1.2253202199935913, + "logits/rejected": -1.2586586475372314, + "logps/chosen": -68.27040100097656, + "logps/rejected": -76.29647064208984, + "loss": 0.6361, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9388152956962585, + "rewards/margins": -0.9239906668663025, + "rewards/rejected": 1.862805962562561, + "step": 10502 + }, + { + "epoch": 1.7, + "learning_rate": 5.607053592129869e-07, + "logits/chosen": -1.305846929550171, + "logits/rejected": -1.3307300806045532, + "logps/chosen": -26.718326568603516, + "logps/rejected": -66.13252258300781, + "loss": 0.3688, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.807737350463867, + "rewards/margins": -0.06230664253234863, + "rewards/rejected": 2.870043992996216, + "step": 10503 + }, + { + "epoch": 1.7, + "learning_rate": 5.601008036034844e-07, + "logits/chosen": -1.298902988433838, + "logits/rejected": -1.3895957469940186, + "logps/chosen": -64.13175964355469, + "logps/rejected": -131.24148559570312, + "loss": 2.656, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5677002668380737, + "rewards/margins": -5.305965900421143, + "rewards/rejected": 6.873666286468506, + "step": 10504 + }, + { + "epoch": 1.71, + "learning_rate": 5.594965547490067e-07, + "logits/chosen": -1.39070725440979, + "logits/rejected": -1.4022505283355713, + "logps/chosen": -76.25410461425781, + "logps/rejected": -121.93578338623047, + "loss": 2.332, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3984735012054443, + "rewards/margins": -2.878810167312622, + "rewards/rejected": 6.277283668518066, + "step": 10505 + }, + { + "epoch": 1.71, + "learning_rate": 5.588926126913003e-07, + "logits/chosen": -1.378848671913147, + "logits/rejected": -1.304986596107483, + "logps/chosen": -55.34698486328125, + "logps/rejected": -39.738304138183594, + "loss": 0.2673, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.2959747314453125, + "rewards/margins": 0.4223487377166748, + "rewards/rejected": 3.8736259937286377, + "step": 10506 + }, + { + "epoch": 1.71, + "learning_rate": 5.582889774720923e-07, + "logits/chosen": -1.342003345489502, + "logits/rejected": -1.3259963989257812, + "logps/chosen": -157.48460388183594, + "logps/rejected": -69.23570251464844, + "loss": 0.1706, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.90597677230835, + "rewards/margins": 4.4149298667907715, + "rewards/rejected": 1.4910469055175781, + "step": 10507 + }, + { + "epoch": 1.71, + "learning_rate": 5.576856491330878e-07, + "logits/chosen": -1.335231900215149, + "logits/rejected": -1.2769055366516113, + "logps/chosen": -67.13459014892578, + "logps/rejected": -22.846494674682617, + "loss": 0.5215, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6521774530410767, + "rewards/margins": -0.3643683195114136, + "rewards/rejected": 2.0165457725524902, + "step": 10508 + }, + { + "epoch": 1.71, + "learning_rate": 5.570826277159719e-07, + "logits/chosen": -1.352359414100647, + "logits/rejected": -1.352359414100647, + "logps/chosen": -28.345796585083008, + "logps/rejected": -28.345796585083008, + "loss": 0.6062, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9906682968139648, + "rewards/margins": 0.0, + "rewards/rejected": 1.9906682968139648, + "step": 10509 + }, + { + "epoch": 1.71, + "learning_rate": 5.564799132624065e-07, + "logits/chosen": -1.5747981071472168, + "logits/rejected": -1.6156506538391113, + "logps/chosen": -83.050048828125, + "logps/rejected": -97.86827087402344, + "loss": 0.7583, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.783033847808838, + "rewards/margins": -0.23909759521484375, + "rewards/rejected": 3.0221314430236816, + "step": 10510 + }, + { + "epoch": 1.71, + "learning_rate": 5.558775058140353e-07, + "logits/chosen": -1.3593864440917969, + "logits/rejected": -1.3593864440917969, + "logps/chosen": -79.50558471679688, + "logps/rejected": -79.50558471679688, + "loss": 0.4408, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.768345594406128, + "rewards/margins": 0.0, + "rewards/rejected": 3.768345594406128, + "step": 10511 + }, + { + "epoch": 1.71, + "learning_rate": 5.552754054124765e-07, + "logits/chosen": -1.141951322555542, + "logits/rejected": -1.3190923929214478, + "logps/chosen": -118.04879760742188, + "logps/rejected": -159.36868286132812, + "loss": 2.7972, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.096832275390625, + "rewards/margins": -5.447024345397949, + "rewards/rejected": 8.543856620788574, + "step": 10512 + }, + { + "epoch": 1.71, + "learning_rate": 5.546736120993318e-07, + "logits/chosen": -1.0171269178390503, + "logits/rejected": -1.0171269178390503, + "logps/chosen": -44.555641174316406, + "logps/rejected": -44.555641174316406, + "loss": 1.3763, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2622315883636475, + "rewards/margins": 0.0, + "rewards/rejected": 3.2622315883636475, + "step": 10513 + }, + { + "epoch": 1.71, + "learning_rate": 5.540721259161774e-07, + "logits/chosen": -1.2791962623596191, + "logits/rejected": -1.2889176607131958, + "logps/chosen": -73.19306945800781, + "logps/rejected": -68.54069519042969, + "loss": 1.5693, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.779296875, + "rewards/margins": -2.0651047229766846, + "rewards/rejected": 2.8444015979766846, + "step": 10514 + }, + { + "epoch": 1.71, + "learning_rate": 5.534709469045724e-07, + "logits/chosen": -1.07747483253479, + "logits/rejected": -1.0113434791564941, + "logps/chosen": -34.61902618408203, + "logps/rejected": -22.73873519897461, + "loss": 0.4976, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3840324878692627, + "rewards/margins": 0.6460803747177124, + "rewards/rejected": 1.7379521131515503, + "step": 10515 + }, + { + "epoch": 1.71, + "learning_rate": 5.5287007510605e-07, + "logits/chosen": -1.3536005020141602, + "logits/rejected": -1.1910221576690674, + "logps/chosen": -62.56910705566406, + "logps/rejected": -71.82905578613281, + "loss": 0.0401, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7355363368988037, + "rewards/margins": 5.204646587371826, + "rewards/rejected": -1.469110131263733, + "step": 10516 + }, + { + "epoch": 1.71, + "learning_rate": 5.522695105621278e-07, + "logits/chosen": -1.4794708490371704, + "logits/rejected": -1.4584941864013672, + "logps/chosen": -81.44947814941406, + "logps/rejected": -128.8384552001953, + "loss": 0.1207, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.558896064758301, + "rewards/margins": 1.3611559867858887, + "rewards/rejected": 5.197740077972412, + "step": 10517 + }, + { + "epoch": 1.71, + "learning_rate": 5.51669253314297e-07, + "logits/chosen": -1.3995976448059082, + "logits/rejected": -1.5732203722000122, + "logps/chosen": -52.76287841796875, + "logps/rejected": -152.78369140625, + "loss": 2.418, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.248935699462891, + "rewards/margins": -4.011431694030762, + "rewards/rejected": 8.260367393493652, + "step": 10518 + }, + { + "epoch": 1.71, + "learning_rate": 5.510693034040299e-07, + "logits/chosen": -1.557655692100525, + "logits/rejected": -1.5870532989501953, + "logps/chosen": -114.1830825805664, + "logps/rejected": -98.0029067993164, + "loss": 1.0028, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.241313934326172, + "rewards/margins": -1.853593349456787, + "rewards/rejected": 7.094907283782959, + "step": 10519 + }, + { + "epoch": 1.71, + "learning_rate": 5.504696608727783e-07, + "logits/chosen": -1.3937008380889893, + "logits/rejected": -1.350839614868164, + "logps/chosen": -42.367130279541016, + "logps/rejected": -42.405609130859375, + "loss": 0.4072, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5112438201904297, + "rewards/margins": 0.08300900459289551, + "rewards/rejected": 2.428234815597534, + "step": 10520 + }, + { + "epoch": 1.71, + "learning_rate": 5.498703257619709e-07, + "logits/chosen": -1.1908254623413086, + "logits/rejected": -1.1871798038482666, + "logps/chosen": -5.368841648101807, + "logps/rejected": -7.045937538146973, + "loss": 0.388, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3213496804237366, + "rewards/margins": -0.11110186576843262, + "rewards/rejected": 0.4324515461921692, + "step": 10521 + }, + { + "epoch": 1.71, + "learning_rate": 5.492712981130171e-07, + "logits/chosen": -1.0781234502792358, + "logits/rejected": -1.0763564109802246, + "logps/chosen": -2.6458654403686523, + "logps/rejected": -1.9749876260757446, + "loss": 0.4031, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3279649317264557, + "rewards/margins": -0.1016741693019867, + "rewards/rejected": 0.4296391010284424, + "step": 10522 + }, + { + "epoch": 1.71, + "learning_rate": 5.486725779673024e-07, + "logits/chosen": -1.0944640636444092, + "logits/rejected": -1.1510530710220337, + "logps/chosen": -46.49254608154297, + "logps/rejected": -86.92491912841797, + "loss": 0.8029, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.010411024093628, + "rewards/margins": 0.36304616928100586, + "rewards/rejected": 2.647364854812622, + "step": 10523 + }, + { + "epoch": 1.71, + "learning_rate": 5.480741653661953e-07, + "logits/chosen": -1.4790902137756348, + "logits/rejected": -1.4815481901168823, + "logps/chosen": -62.36785125732422, + "logps/rejected": -70.74927520751953, + "loss": 0.3724, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.430530548095703, + "rewards/margins": -0.02699136734008789, + "rewards/rejected": 2.457521915435791, + "step": 10524 + }, + { + "epoch": 1.71, + "learning_rate": 5.474760603510376e-07, + "logits/chosen": -0.9732973575592041, + "logits/rejected": -0.9423074126243591, + "logps/chosen": -37.50963592529297, + "logps/rejected": -36.69010925292969, + "loss": 0.3068, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.897013545036316, + "rewards/margins": 0.7224301099777222, + "rewards/rejected": 1.1745834350585938, + "step": 10525 + }, + { + "epoch": 1.71, + "learning_rate": 5.468782629631547e-07, + "logits/chosen": -1.164156198501587, + "logits/rejected": -1.0402514934539795, + "logps/chosen": -56.79621505737305, + "logps/rejected": -33.366188049316406, + "loss": 0.3794, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4420878887176514, + "rewards/margins": -0.09676480293273926, + "rewards/rejected": 2.5388526916503906, + "step": 10526 + }, + { + "epoch": 1.71, + "learning_rate": 5.462807732438469e-07, + "logits/chosen": -1.179751992225647, + "logits/rejected": -1.136650800704956, + "logps/chosen": -47.53467559814453, + "logps/rejected": -63.004669189453125, + "loss": 0.6539, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.703137159347534, + "rewards/margins": 0.00788259506225586, + "rewards/rejected": 2.6952545642852783, + "step": 10527 + }, + { + "epoch": 1.71, + "learning_rate": 5.456835912343977e-07, + "logits/chosen": -1.4526821374893188, + "logits/rejected": -1.4695982933044434, + "logps/chosen": -56.16276550292969, + "logps/rejected": -105.17010498046875, + "loss": 1.046, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.8059403896331787, + "rewards/margins": -1.91367506980896, + "rewards/rejected": 5.719615459442139, + "step": 10528 + }, + { + "epoch": 1.71, + "learning_rate": 5.450867169760638e-07, + "logits/chosen": -1.7877776622772217, + "logits/rejected": -1.7493457794189453, + "logps/chosen": -90.9808349609375, + "logps/rejected": -20.728425979614258, + "loss": 0.3767, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0124549865722656, + "rewards/margins": -0.06274199485778809, + "rewards/rejected": 2.0751969814300537, + "step": 10529 + }, + { + "epoch": 1.71, + "learning_rate": 5.444901505100858e-07, + "logits/chosen": -1.207602620124817, + "logits/rejected": -1.2813584804534912, + "logps/chosen": -59.38445281982422, + "logps/rejected": -108.55172729492188, + "loss": 1.6353, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4828057289123535, + "rewards/margins": -1.1026954650878906, + "rewards/rejected": 4.585501194000244, + "step": 10530 + }, + { + "epoch": 1.71, + "learning_rate": 5.438938918776793e-07, + "logits/chosen": -1.0214521884918213, + "logits/rejected": -1.0458810329437256, + "logps/chosen": -75.6903076171875, + "logps/rejected": -71.56859588623047, + "loss": 0.3592, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.406874179840088, + "rewards/margins": 0.023879289627075195, + "rewards/rejected": 3.3829948902130127, + "step": 10531 + }, + { + "epoch": 1.71, + "learning_rate": 5.432979411200417e-07, + "logits/chosen": -1.323285460472107, + "logits/rejected": -1.310236930847168, + "logps/chosen": -41.171791076660156, + "logps/rejected": -81.20909118652344, + "loss": 2.5743, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4133248329162598, + "rewards/margins": 1.019749402999878, + "rewards/rejected": 2.393575429916382, + "step": 10532 + }, + { + "epoch": 1.71, + "learning_rate": 5.427022982783459e-07, + "logits/chosen": -1.4163540601730347, + "logits/rejected": -1.5625520944595337, + "logps/chosen": -113.09980773925781, + "logps/rejected": -121.98130798339844, + "loss": 0.9653, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.910093784332275, + "rewards/margins": -1.6452183723449707, + "rewards/rejected": 9.555312156677246, + "step": 10533 + }, + { + "epoch": 1.71, + "learning_rate": 5.42106963393747e-07, + "logits/chosen": -1.126883625984192, + "logits/rejected": -1.0686025619506836, + "logps/chosen": -74.93943786621094, + "logps/rejected": -55.2086296081543, + "loss": 0.4551, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3594627380371094, + "rewards/margins": 1.3131023645401, + "rewards/rejected": 1.0463603734970093, + "step": 10534 + }, + { + "epoch": 1.71, + "learning_rate": 5.415119365073746e-07, + "logits/chosen": -1.296498417854309, + "logits/rejected": -1.1535508632659912, + "logps/chosen": -124.73661804199219, + "logps/rejected": -84.00645446777344, + "loss": 0.0471, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.8632493019104, + "rewards/margins": 4.861701011657715, + "rewards/rejected": 2.0015480518341064, + "step": 10535 + }, + { + "epoch": 1.71, + "learning_rate": 5.409172176603422e-07, + "logits/chosen": -1.0846420526504517, + "logits/rejected": -1.0970253944396973, + "logps/chosen": -110.44126892089844, + "logps/rejected": -52.181007385253906, + "loss": 0.6177, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9205429553985596, + "rewards/margins": -0.4380638599395752, + "rewards/rejected": 2.3586068153381348, + "step": 10536 + }, + { + "epoch": 1.71, + "learning_rate": 5.403228068937361e-07, + "logits/chosen": -1.1122353076934814, + "logits/rejected": -1.215207576751709, + "logps/chosen": -52.824554443359375, + "logps/rejected": -107.03341674804688, + "loss": 2.2911, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2246499061584473, + "rewards/margins": -3.9435462951660156, + "rewards/rejected": 6.168196201324463, + "step": 10537 + }, + { + "epoch": 1.71, + "learning_rate": 5.397287042486277e-07, + "logits/chosen": -1.4264109134674072, + "logits/rejected": -1.2398762702941895, + "logps/chosen": -62.25996017456055, + "logps/rejected": -50.79356384277344, + "loss": 0.4667, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.725492477416992, + "rewards/margins": 3.57612943649292, + "rewards/rejected": 2.1493630409240723, + "step": 10538 + }, + { + "epoch": 1.71, + "learning_rate": 5.391349097660614e-07, + "logits/chosen": -1.2943779230117798, + "logits/rejected": -1.2440677881240845, + "logps/chosen": -94.0025634765625, + "logps/rejected": -106.94860076904297, + "loss": 0.5114, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.813864231109619, + "rewards/margins": 3.7812767028808594, + "rewards/rejected": 3.0325875282287598, + "step": 10539 + }, + { + "epoch": 1.71, + "learning_rate": 5.385414234870645e-07, + "logits/chosen": -0.870449960231781, + "logits/rejected": -0.9423702955245972, + "logps/chosen": -63.069786071777344, + "logps/rejected": -57.72607421875, + "loss": 0.3999, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.320955753326416, + "rewards/margins": -0.16502594947814941, + "rewards/rejected": 2.4859817028045654, + "step": 10540 + }, + { + "epoch": 1.71, + "learning_rate": 5.379482454526396e-07, + "logits/chosen": -1.1630491018295288, + "logits/rejected": -1.1825072765350342, + "logps/chosen": -56.689659118652344, + "logps/rejected": -161.93893432617188, + "loss": 0.6346, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.539357900619507, + "rewards/margins": 0.24990320205688477, + "rewards/rejected": 2.289454698562622, + "step": 10541 + }, + { + "epoch": 1.71, + "learning_rate": 5.37355375703772e-07, + "logits/chosen": -1.672379732131958, + "logits/rejected": -1.6586277484893799, + "logps/chosen": -44.43524932861328, + "logps/rejected": -55.31641387939453, + "loss": 0.5871, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.284078359603882, + "rewards/margins": -0.625544548034668, + "rewards/rejected": 3.90962290763855, + "step": 10542 + }, + { + "epoch": 1.71, + "learning_rate": 5.367628142814208e-07, + "logits/chosen": -1.2439818382263184, + "logits/rejected": -1.2017513513565063, + "logps/chosen": -70.40567016601562, + "logps/rejected": -52.92387008666992, + "loss": 0.4024, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.695361375808716, + "rewards/margins": 1.2115932703018188, + "rewards/rejected": 1.483768105506897, + "step": 10543 + }, + { + "epoch": 1.71, + "learning_rate": 5.361705612265284e-07, + "logits/chosen": -1.3138248920440674, + "logits/rejected": -1.3815346956253052, + "logps/chosen": -133.84336853027344, + "logps/rejected": -135.65223693847656, + "loss": 1.148, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.5907440185546875, + "rewards/margins": -2.1524906158447266, + "rewards/rejected": 8.743234634399414, + "step": 10544 + }, + { + "epoch": 1.71, + "learning_rate": 5.35578616580012e-07, + "logits/chosen": -1.0706787109375, + "logits/rejected": -1.0911279916763306, + "logps/chosen": -125.70155334472656, + "logps/rejected": -75.65165710449219, + "loss": 1.1354, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.482588291168213, + "rewards/margins": 0.44516825675964355, + "rewards/rejected": 3.0374200344085693, + "step": 10545 + }, + { + "epoch": 1.71, + "learning_rate": 5.349869803827717e-07, + "logits/chosen": -1.073618769645691, + "logits/rejected": -1.0402419567108154, + "logps/chosen": -46.506961822509766, + "logps/rejected": -50.97660446166992, + "loss": 0.4222, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5470471382141113, + "rewards/margins": 1.4825714826583862, + "rewards/rejected": 1.064475655555725, + "step": 10546 + }, + { + "epoch": 1.71, + "learning_rate": 5.343956526756816e-07, + "logits/chosen": -1.1340128183364868, + "logits/rejected": -1.0538052320480347, + "logps/chosen": -73.26225280761719, + "logps/rejected": -72.93222045898438, + "loss": 0.2091, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3549301624298096, + "rewards/margins": 0.6567963361740112, + "rewards/rejected": 1.6981338262557983, + "step": 10547 + }, + { + "epoch": 1.71, + "learning_rate": 5.338046334995989e-07, + "logits/chosen": -1.713060736656189, + "logits/rejected": -1.5667715072631836, + "logps/chosen": -145.0616912841797, + "logps/rejected": -68.29684448242188, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.983672142028809, + "rewards/margins": 6.438443183898926, + "rewards/rejected": 2.5452287197113037, + "step": 10548 + }, + { + "epoch": 1.71, + "learning_rate": 5.332139228953554e-07, + "logits/chosen": -0.9791818261146545, + "logits/rejected": -0.9791818261146545, + "logps/chosen": -1.0458447933197021, + "logps/rejected": -1.0458447933197021, + "loss": 0.5448, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.16512265801429749, + "rewards/margins": 0.0, + "rewards/rejected": 0.16512265801429749, + "step": 10549 + }, + { + "epoch": 1.71, + "learning_rate": 5.326235209037656e-07, + "logits/chosen": -1.4719191789627075, + "logits/rejected": -1.0820027589797974, + "logps/chosen": -256.88983154296875, + "logps/rejected": -49.72393798828125, + "loss": 0.1178, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.707818508148193, + "rewards/margins": 3.3615894317626953, + "rewards/rejected": 4.346229076385498, + "step": 10550 + }, + { + "epoch": 1.71, + "learning_rate": 5.320334275656191e-07, + "logits/chosen": -1.1506234407424927, + "logits/rejected": -1.1506234407424927, + "logps/chosen": -38.04119873046875, + "logps/rejected": -38.04119873046875, + "loss": 0.3741, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.350106120109558, + "rewards/margins": 0.0, + "rewards/rejected": 1.350106120109558, + "step": 10551 + }, + { + "epoch": 1.71, + "learning_rate": 5.314436429216874e-07, + "logits/chosen": -1.2345595359802246, + "logits/rejected": -1.0817610025405884, + "logps/chosen": -48.223777770996094, + "logps/rejected": -28.194597244262695, + "loss": 0.1202, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2341995239257812, + "rewards/margins": 2.014127731323242, + "rewards/rejected": 0.22007179260253906, + "step": 10552 + }, + { + "epoch": 1.71, + "learning_rate": 5.308541670127171e-07, + "logits/chosen": -1.4193731546401978, + "logits/rejected": -1.4706189632415771, + "logps/chosen": -46.9079704284668, + "logps/rejected": -66.80912017822266, + "loss": 0.2419, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8814456462860107, + "rewards/margins": 2.6313624382019043, + "rewards/rejected": 1.2500832080841064, + "step": 10553 + }, + { + "epoch": 1.71, + "learning_rate": 5.302649998794368e-07, + "logits/chosen": -1.0032322406768799, + "logits/rejected": -1.0032322406768799, + "logps/chosen": -15.261445999145508, + "logps/rejected": -15.261445999145508, + "loss": 0.3685, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8532697558403015, + "rewards/margins": 0.0, + "rewards/rejected": 0.8532697558403015, + "step": 10554 + }, + { + "epoch": 1.71, + "learning_rate": 5.296761415625523e-07, + "logits/chosen": -1.2274059057235718, + "logits/rejected": -1.1757892370224, + "logps/chosen": -55.024436950683594, + "logps/rejected": -95.81500244140625, + "loss": 0.6321, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7911019325256348, + "rewards/margins": 0.24543070793151855, + "rewards/rejected": 3.545671224594116, + "step": 10555 + }, + { + "epoch": 1.71, + "learning_rate": 5.290875921027471e-07, + "logits/chosen": -1.5381449460983276, + "logits/rejected": -1.5453664064407349, + "logps/chosen": -36.93529510498047, + "logps/rejected": -84.4297866821289, + "loss": 1.4068, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6141510009765625, + "rewards/margins": 0.09194564819335938, + "rewards/rejected": 2.522205352783203, + "step": 10556 + }, + { + "epoch": 1.71, + "learning_rate": 5.284993515406861e-07, + "logits/chosen": -1.316792368888855, + "logits/rejected": -1.316792368888855, + "logps/chosen": -59.88601303100586, + "logps/rejected": -59.88601303100586, + "loss": 0.3528, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.873403549194336, + "rewards/margins": 0.0, + "rewards/rejected": 3.873403549194336, + "step": 10557 + }, + { + "epoch": 1.71, + "learning_rate": 5.279114199170094e-07, + "logits/chosen": -1.5581008195877075, + "logits/rejected": -1.4641902446746826, + "logps/chosen": -106.40863800048828, + "logps/rejected": -29.628320693969727, + "loss": 0.0708, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5572807788848877, + "rewards/margins": 2.120170831680298, + "rewards/rejected": 0.43710994720458984, + "step": 10558 + }, + { + "epoch": 1.71, + "learning_rate": 5.273237972723388e-07, + "logits/chosen": -1.6304727792739868, + "logits/rejected": -1.5706536769866943, + "logps/chosen": -105.89981079101562, + "logps/rejected": -22.284217834472656, + "loss": 0.2018, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.943307638168335, + "rewards/margins": 1.8428699970245361, + "rewards/rejected": 2.100437641143799, + "step": 10559 + }, + { + "epoch": 1.71, + "learning_rate": 5.267364836472727e-07, + "logits/chosen": -1.5253887176513672, + "logits/rejected": -1.3972034454345703, + "logps/chosen": -89.77751159667969, + "logps/rejected": -29.48522186279297, + "loss": 1.1759, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.354780673980713, + "rewards/margins": 3.7969837188720703, + "rewards/rejected": 0.5577968955039978, + "step": 10560 + }, + { + "epoch": 1.71, + "learning_rate": 5.261494790823896e-07, + "logits/chosen": -1.2415498495101929, + "logits/rejected": -1.2317243814468384, + "logps/chosen": -41.52122116088867, + "logps/rejected": -41.27195739746094, + "loss": 0.7545, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.8093106746673584, + "rewards/margins": -0.14916419982910156, + "rewards/rejected": 3.95847487449646, + "step": 10561 + }, + { + "epoch": 1.71, + "learning_rate": 5.255627836182453e-07, + "logits/chosen": -1.3357467651367188, + "logits/rejected": -1.3774505853652954, + "logps/chosen": -112.79804992675781, + "logps/rejected": -163.52716064453125, + "loss": 0.7753, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.6409010887146, + "rewards/margins": -1.308546543121338, + "rewards/rejected": 6.9494476318359375, + "step": 10562 + }, + { + "epoch": 1.71, + "learning_rate": 5.249763972953758e-07, + "logits/chosen": -1.138757586479187, + "logits/rejected": -1.2121086120605469, + "logps/chosen": -27.189796447753906, + "logps/rejected": -108.87472534179688, + "loss": 1.4879, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4425346851348877, + "rewards/margins": -1.1221764087677002, + "rewards/rejected": 3.564711093902588, + "step": 10563 + }, + { + "epoch": 1.71, + "learning_rate": 5.243903201542927e-07, + "logits/chosen": -1.4468450546264648, + "logits/rejected": -1.5162631273269653, + "logps/chosen": -191.87008666992188, + "logps/rejected": -144.84060668945312, + "loss": 0.3141, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.634756565093994, + "rewards/margins": 0.1355729103088379, + "rewards/rejected": 7.499183654785156, + "step": 10564 + }, + { + "epoch": 1.71, + "learning_rate": 5.238045522354912e-07, + "logits/chosen": -1.1503318548202515, + "logits/rejected": -1.1906028985977173, + "logps/chosen": -144.72381591796875, + "logps/rejected": -91.38453674316406, + "loss": 0.5954, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.549215793609619, + "rewards/margins": -0.8145904541015625, + "rewards/rejected": 6.363806247711182, + "step": 10565 + }, + { + "epoch": 1.71, + "learning_rate": 5.232190935794396e-07, + "logits/chosen": -0.9433849453926086, + "logits/rejected": -0.9433849453926086, + "logps/chosen": -28.895736694335938, + "logps/rejected": -28.895736694335938, + "loss": 1.1817, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0239067077636719, + "rewards/margins": 0.0, + "rewards/rejected": 1.0239067077636719, + "step": 10566 + }, + { + "epoch": 1.72, + "learning_rate": 5.226339442265904e-07, + "logits/chosen": -1.4871799945831299, + "logits/rejected": -1.506292700767517, + "logps/chosen": -56.00605010986328, + "logps/rejected": -62.79988098144531, + "loss": 1.6279, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4380104541778564, + "rewards/margins": 0.37238240242004395, + "rewards/rejected": 2.0656280517578125, + "step": 10567 + }, + { + "epoch": 1.72, + "learning_rate": 5.220491042173692e-07, + "logits/chosen": -1.3344850540161133, + "logits/rejected": -1.3898675441741943, + "logps/chosen": -64.42719268798828, + "logps/rejected": -70.81411743164062, + "loss": 0.2569, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1467278003692627, + "rewards/margins": 1.0215644836425781, + "rewards/rejected": 1.1251633167266846, + "step": 10568 + }, + { + "epoch": 1.72, + "learning_rate": 5.214645735921847e-07, + "logits/chosen": -0.8558882474899292, + "logits/rejected": -0.850665271282196, + "logps/chosen": -8.437239646911621, + "logps/rejected": -2.3846912384033203, + "loss": 0.3641, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3778873383998871, + "rewards/margins": -0.002772003412246704, + "rewards/rejected": 0.3806593418121338, + "step": 10569 + }, + { + "epoch": 1.72, + "learning_rate": 5.208803523914213e-07, + "logits/chosen": -1.1082828044891357, + "logits/rejected": -1.0561217069625854, + "logps/chosen": -31.334747314453125, + "logps/rejected": -36.974422454833984, + "loss": 1.5463, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3484100103378296, + "rewards/margins": -1.9703739881515503, + "rewards/rejected": 3.31878399848938, + "step": 10570 + }, + { + "epoch": 1.72, + "learning_rate": 5.20296440655445e-07, + "logits/chosen": -0.858575165271759, + "logits/rejected": -0.858575165271759, + "logps/chosen": -50.88150405883789, + "logps/rejected": -50.88150405883789, + "loss": 0.9353, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.823434054851532, + "rewards/margins": 0.0, + "rewards/rejected": 0.823434054851532, + "step": 10571 + }, + { + "epoch": 1.72, + "learning_rate": 5.197128384245959e-07, + "logits/chosen": -1.5176653861999512, + "logits/rejected": -1.4994885921478271, + "logps/chosen": -39.12755584716797, + "logps/rejected": -49.48054504394531, + "loss": 0.9556, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.467139482498169, + "rewards/margins": -1.5749778747558594, + "rewards/rejected": 3.0421173572540283, + "step": 10572 + }, + { + "epoch": 1.72, + "learning_rate": 5.191295457391976e-07, + "logits/chosen": -1.5284595489501953, + "logits/rejected": -1.4442001581192017, + "logps/chosen": -130.84271240234375, + "logps/rejected": -102.86078643798828, + "loss": 0.1911, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.402026653289795, + "rewards/margins": 0.8163766860961914, + "rewards/rejected": 6.5856499671936035, + "step": 10573 + }, + { + "epoch": 1.72, + "learning_rate": 5.185465626395486e-07, + "logits/chosen": -1.3350341320037842, + "logits/rejected": -1.3600091934204102, + "logps/chosen": -77.56947326660156, + "logps/rejected": -95.05110168457031, + "loss": 1.98, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.682933330535889, + "rewards/margins": -3.937610149383545, + "rewards/rejected": 9.620543479919434, + "step": 10574 + }, + { + "epoch": 1.72, + "learning_rate": 5.179638891659283e-07, + "logits/chosen": -1.330798625946045, + "logits/rejected": -1.1972603797912598, + "logps/chosen": -75.61662292480469, + "logps/rejected": -38.03537368774414, + "loss": 0.12, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.58830189704895, + "rewards/margins": 1.55256986618042, + "rewards/rejected": 2.0357320308685303, + "step": 10575 + }, + { + "epoch": 1.72, + "learning_rate": 5.173815253585951e-07, + "logits/chosen": -1.325163722038269, + "logits/rejected": -1.3526090383529663, + "logps/chosen": -56.43265151977539, + "logps/rejected": -63.11412048339844, + "loss": 0.4584, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.938609004020691, + "rewards/margins": 0.362521767616272, + "rewards/rejected": 1.576087236404419, + "step": 10576 + }, + { + "epoch": 1.72, + "learning_rate": 5.167994712577823e-07, + "logits/chosen": -1.3348884582519531, + "logits/rejected": -1.3235269784927368, + "logps/chosen": -74.68641662597656, + "logps/rejected": -104.55778503417969, + "loss": 0.1938, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0381805896759033, + "rewards/margins": 0.8169349431991577, + "rewards/rejected": 1.2212456464767456, + "step": 10577 + }, + { + "epoch": 1.72, + "learning_rate": 5.162177269037061e-07, + "logits/chosen": -1.4221892356872559, + "logits/rejected": -1.4346535205841064, + "logps/chosen": -108.7492446899414, + "logps/rejected": -123.6334228515625, + "loss": 2.8293, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.24962463974952698, + "rewards/margins": -5.647302150726318, + "rewards/rejected": 5.8969268798828125, + "step": 10578 + }, + { + "epoch": 1.72, + "learning_rate": 5.156362923365587e-07, + "logits/chosen": -0.8074179291725159, + "logits/rejected": -0.8750815391540527, + "logps/chosen": -1.9969395399093628, + "logps/rejected": -58.99747085571289, + "loss": 1.0606, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4853452742099762, + "rewards/margins": -0.9494504928588867, + "rewards/rejected": 1.4347957372665405, + "step": 10579 + }, + { + "epoch": 1.72, + "learning_rate": 5.150551675965126e-07, + "logits/chosen": -0.8267631530761719, + "logits/rejected": -0.8791599273681641, + "logps/chosen": -47.65945816040039, + "logps/rejected": -71.03010559082031, + "loss": 0.4084, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.9243505001068115, + "rewards/margins": -0.08736205101013184, + "rewards/rejected": 4.011712551116943, + "step": 10580 + }, + { + "epoch": 1.72, + "learning_rate": 5.144743527237167e-07, + "logits/chosen": -1.020480751991272, + "logits/rejected": -0.9308223128318787, + "logps/chosen": -48.79267883300781, + "logps/rejected": -36.737037658691406, + "loss": 1.1627, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6876411437988281, + "rewards/margins": -1.9146974086761475, + "rewards/rejected": 3.6023385524749756, + "step": 10581 + }, + { + "epoch": 1.72, + "learning_rate": 5.138938477583017e-07, + "logits/chosen": -1.609432339668274, + "logits/rejected": -1.391165852546692, + "logps/chosen": -152.10549926757812, + "logps/rejected": -40.42026138305664, + "loss": 0.2303, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.311051845550537, + "rewards/margins": 4.372251510620117, + "rewards/rejected": 1.9388004541397095, + "step": 10582 + }, + { + "epoch": 1.72, + "learning_rate": 5.133136527403725e-07, + "logits/chosen": -1.293798565864563, + "logits/rejected": -1.2900867462158203, + "logps/chosen": -20.290803909301758, + "logps/rejected": -110.91656494140625, + "loss": 0.8465, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.416410207748413, + "rewards/margins": -0.4203300476074219, + "rewards/rejected": 3.836740255355835, + "step": 10583 + }, + { + "epoch": 1.72, + "learning_rate": 5.12733767710018e-07, + "logits/chosen": -1.3602898120880127, + "logits/rejected": -1.3602898120880127, + "logps/chosen": -48.734004974365234, + "logps/rejected": -48.734004974365234, + "loss": 0.4683, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.391667604446411, + "rewards/margins": 0.0, + "rewards/rejected": 2.391667604446411, + "step": 10584 + }, + { + "epoch": 1.72, + "learning_rate": 5.121541927072998e-07, + "logits/chosen": -1.3269116878509521, + "logits/rejected": -1.3269116878509521, + "logps/chosen": -62.85332489013672, + "logps/rejected": -62.85332489013672, + "loss": 2.126, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5421340465545654, + "rewards/margins": 0.0, + "rewards/rejected": 2.5421340465545654, + "step": 10585 + }, + { + "epoch": 1.72, + "learning_rate": 5.115749277722637e-07, + "logits/chosen": -1.0374467372894287, + "logits/rejected": -1.0763026475906372, + "logps/chosen": -50.289573669433594, + "logps/rejected": -72.08141326904297, + "loss": 0.4286, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.989064931869507, + "rewards/margins": 1.7240723371505737, + "rewards/rejected": 1.264992594718933, + "step": 10586 + }, + { + "epoch": 1.72, + "learning_rate": 5.109959729449293e-07, + "logits/chosen": -1.4153764247894287, + "logits/rejected": -1.36381196975708, + "logps/chosen": -65.48416137695312, + "logps/rejected": -50.39338684082031, + "loss": 0.3937, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8514955043792725, + "rewards/margins": -0.13336634635925293, + "rewards/rejected": 2.9848618507385254, + "step": 10587 + }, + { + "epoch": 1.72, + "learning_rate": 5.104173282652985e-07, + "logits/chosen": -1.4733495712280273, + "logits/rejected": -1.54750394821167, + "logps/chosen": -54.670372009277344, + "logps/rejected": -84.06802368164062, + "loss": 0.4592, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.746753692626953, + "rewards/margins": -0.32648229598999023, + "rewards/rejected": 5.073235988616943, + "step": 10588 + }, + { + "epoch": 1.72, + "learning_rate": 5.098389937733489e-07, + "logits/chosen": -1.5395805835723877, + "logits/rejected": -1.5801286697387695, + "logps/chosen": -87.02081298828125, + "logps/rejected": -123.18666076660156, + "loss": 2.3553, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8829056024551392, + "rewards/margins": -4.670631408691406, + "rewards/rejected": 6.553536891937256, + "step": 10589 + }, + { + "epoch": 1.72, + "learning_rate": 5.092609695090395e-07, + "logits/chosen": -1.3955656290054321, + "logits/rejected": -1.3955656290054321, + "logps/chosen": -4.372325897216797, + "logps/rejected": -4.372325897216797, + "loss": 1.2421, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6218711137771606, + "rewards/margins": 0.0, + "rewards/rejected": 0.6218711137771606, + "step": 10590 + }, + { + "epoch": 1.72, + "learning_rate": 5.086832555123039e-07, + "logits/chosen": -0.9508644342422485, + "logits/rejected": -0.9583256840705872, + "logps/chosen": -158.04052734375, + "logps/rejected": -90.08088684082031, + "loss": 0.408, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.508435249328613, + "rewards/margins": 3.8264153003692627, + "rewards/rejected": 2.6820199489593506, + "step": 10591 + }, + { + "epoch": 1.72, + "learning_rate": 5.081058518230592e-07, + "logits/chosen": -1.3427879810333252, + "logits/rejected": -1.272239089012146, + "logps/chosen": -53.902339935302734, + "logps/rejected": -73.56716918945312, + "loss": 1.4955, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1180455684661865, + "rewards/margins": -0.9740216732025146, + "rewards/rejected": 4.092067241668701, + "step": 10592 + }, + { + "epoch": 1.72, + "learning_rate": 5.075287584811966e-07, + "logits/chosen": -1.237607479095459, + "logits/rejected": -1.252299189567566, + "logps/chosen": -42.995262145996094, + "logps/rejected": -53.12181854248047, + "loss": 0.1929, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1363625526428223, + "rewards/margins": 0.8830300569534302, + "rewards/rejected": 1.253332495689392, + "step": 10593 + }, + { + "epoch": 1.72, + "learning_rate": 5.0695197552659e-07, + "logits/chosen": -1.5348368883132935, + "logits/rejected": -1.4084357023239136, + "logps/chosen": -124.13534545898438, + "logps/rejected": -65.9747085571289, + "loss": 0.0284, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.283276557922363, + "rewards/margins": 4.032602787017822, + "rewards/rejected": 2.250673770904541, + "step": 10594 + }, + { + "epoch": 1.72, + "learning_rate": 5.063755029990869e-07, + "logits/chosen": -1.1478451490402222, + "logits/rejected": -1.1081936359405518, + "logps/chosen": -45.85828399658203, + "logps/rejected": -6.082748889923096, + "loss": 0.7368, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7358078360557556, + "rewards/margins": 0.0796552300453186, + "rewards/rejected": 0.656152606010437, + "step": 10595 + }, + { + "epoch": 1.72, + "learning_rate": 5.057993409385181e-07, + "logits/chosen": -1.1876975297927856, + "logits/rejected": -1.1853083372116089, + "logps/chosen": -183.38497924804688, + "logps/rejected": -62.9236946105957, + "loss": 0.1055, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.549357891082764, + "rewards/margins": 1.8550825119018555, + "rewards/rejected": 4.694275379180908, + "step": 10596 + }, + { + "epoch": 1.72, + "learning_rate": 5.052234893846902e-07, + "logits/chosen": -1.0686674118041992, + "logits/rejected": -0.9628720283508301, + "logps/chosen": -34.243797302246094, + "logps/rejected": -6.224656105041504, + "loss": 0.7163, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6377930641174316, + "rewards/margins": 1.7633600234985352, + "rewards/rejected": 0.8744329810142517, + "step": 10597 + }, + { + "epoch": 1.72, + "learning_rate": 5.046479483773897e-07, + "logits/chosen": -1.2053265571594238, + "logits/rejected": -1.221337080001831, + "logps/chosen": -77.08845520019531, + "logps/rejected": -97.48831176757812, + "loss": 2.9626, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.700152635574341, + "rewards/margins": 1.0216988325119019, + "rewards/rejected": 1.678453803062439, + "step": 10598 + }, + { + "epoch": 1.72, + "learning_rate": 5.040727179563793e-07, + "logits/chosen": -1.3750922679901123, + "logits/rejected": -1.2901008129119873, + "logps/chosen": -34.04911804199219, + "logps/rejected": -25.831771850585938, + "loss": 0.1734, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9797443151474, + "rewards/margins": 1.190385103225708, + "rewards/rejected": 0.7893592715263367, + "step": 10599 + }, + { + "epoch": 1.72, + "learning_rate": 5.034977981614048e-07, + "logits/chosen": -1.043467402458191, + "logits/rejected": -0.857020914554596, + "logps/chosen": -83.09516906738281, + "logps/rejected": -14.18301010131836, + "loss": 0.1517, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8930153846740723, + "rewards/margins": 2.445728063583374, + "rewards/rejected": 0.447287380695343, + "step": 10600 + }, + { + "epoch": 1.72, + "learning_rate": 5.029231890321845e-07, + "logits/chosen": -1.072512149810791, + "logits/rejected": -1.058289647102356, + "logps/chosen": -43.311195373535156, + "logps/rejected": -58.4943962097168, + "loss": 0.2715, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9676883816719055, + "rewards/margins": 0.34189069271087646, + "rewards/rejected": 0.625797688961029, + "step": 10601 + }, + { + "epoch": 1.72, + "learning_rate": 5.023488906084217e-07, + "logits/chosen": -1.0565625429153442, + "logits/rejected": -1.0624573230743408, + "logps/chosen": -2.0132505893707275, + "logps/rejected": -0.8390624523162842, + "loss": 0.4843, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2481711208820343, + "rewards/margins": -0.014954537153244019, + "rewards/rejected": 0.2631256580352783, + "step": 10602 + }, + { + "epoch": 1.72, + "learning_rate": 5.017749029297919e-07, + "logits/chosen": -1.6288716793060303, + "logits/rejected": -1.506207823753357, + "logps/chosen": -105.48666381835938, + "logps/rejected": -57.96098327636719, + "loss": 0.1726, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.819821357727051, + "rewards/margins": 4.410551071166992, + "rewards/rejected": 2.4092705249786377, + "step": 10603 + }, + { + "epoch": 1.72, + "learning_rate": 5.01201226035955e-07, + "logits/chosen": -1.1215938329696655, + "logits/rejected": -0.8526661396026611, + "logps/chosen": -52.06764221191406, + "logps/rejected": -38.15132141113281, + "loss": 0.1289, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9119231700897217, + "rewards/margins": 1.2491084337234497, + "rewards/rejected": 1.662814736366272, + "step": 10604 + }, + { + "epoch": 1.72, + "learning_rate": 5.006278599665443e-07, + "logits/chosen": -1.3128767013549805, + "logits/rejected": -1.3299351930618286, + "logps/chosen": -71.47869110107422, + "logps/rejected": -96.70082092285156, + "loss": 1.0269, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.15010404586792, + "rewards/margins": -1.583970546722412, + "rewards/rejected": 5.734074592590332, + "step": 10605 + }, + { + "epoch": 1.72, + "learning_rate": 5.000548047611764e-07, + "logits/chosen": -0.7671501636505127, + "logits/rejected": -0.7636752128601074, + "logps/chosen": -3.346961498260498, + "logps/rejected": -1.568401575088501, + "loss": 0.5551, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3596888482570648, + "rewards/margins": -0.003923892974853516, + "rewards/rejected": 0.36361274123191833, + "step": 10606 + }, + { + "epoch": 1.72, + "learning_rate": 4.994820604594408e-07, + "logits/chosen": -1.3603391647338867, + "logits/rejected": -1.387250304222107, + "logps/chosen": -132.35147094726562, + "logps/rejected": -70.27336883544922, + "loss": 0.2116, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9049134254455566, + "rewards/margins": 0.7057900428771973, + "rewards/rejected": 3.1991233825683594, + "step": 10607 + }, + { + "epoch": 1.72, + "learning_rate": 4.98909627100912e-07, + "logits/chosen": -1.7446128129959106, + "logits/rejected": -1.632920503616333, + "logps/chosen": -78.48090362548828, + "logps/rejected": -47.810089111328125, + "loss": 0.0489, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.757012367248535, + "rewards/margins": 2.317556858062744, + "rewards/rejected": 2.439455509185791, + "step": 10608 + }, + { + "epoch": 1.72, + "learning_rate": 4.983375047251377e-07, + "logits/chosen": -1.2634586095809937, + "logits/rejected": -1.2536014318466187, + "logps/chosen": -88.15170288085938, + "logps/rejected": -44.6838264465332, + "loss": 0.369, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.95597767829895, + "rewards/margins": 0.2452259063720703, + "rewards/rejected": 2.71075177192688, + "step": 10609 + }, + { + "epoch": 1.72, + "learning_rate": 4.977656933716457e-07, + "logits/chosen": -1.3041828870773315, + "logits/rejected": -1.276869535446167, + "logps/chosen": -44.26758575439453, + "logps/rejected": -54.5189323425293, + "loss": 0.7306, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6319373846054077, + "rewards/margins": -0.2494068145751953, + "rewards/rejected": 1.881344199180603, + "step": 10610 + }, + { + "epoch": 1.72, + "learning_rate": 4.971941930799456e-07, + "logits/chosen": -1.321280598640442, + "logits/rejected": -1.3753162622451782, + "logps/chosen": -71.50479125976562, + "logps/rejected": -172.04537963867188, + "loss": 0.3489, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.987689197063446, + "rewards/margins": 0.00863337516784668, + "rewards/rejected": 0.9790558218955994, + "step": 10611 + }, + { + "epoch": 1.72, + "learning_rate": 4.966230038895192e-07, + "logits/chosen": -1.4590626955032349, + "logits/rejected": -1.4590626955032349, + "logps/chosen": -49.01544952392578, + "logps/rejected": -49.01544952392578, + "loss": 0.3754, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5961556434631348, + "rewards/margins": 0.0, + "rewards/rejected": 3.5961556434631348, + "step": 10612 + }, + { + "epoch": 1.72, + "learning_rate": 4.960521258398333e-07, + "logits/chosen": -1.2740185260772705, + "logits/rejected": -1.139807105064392, + "logps/chosen": -71.55535125732422, + "logps/rejected": -36.471275329589844, + "loss": 0.1249, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.090144395828247, + "rewards/margins": 1.286939263343811, + "rewards/rejected": 1.803205132484436, + "step": 10613 + }, + { + "epoch": 1.72, + "learning_rate": 4.954815589703277e-07, + "logits/chosen": -1.418075680732727, + "logits/rejected": -1.1559242010116577, + "logps/chosen": -84.19110107421875, + "logps/rejected": -89.71121215820312, + "loss": 0.3438, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.549991130828857, + "rewards/margins": 0.35833168029785156, + "rewards/rejected": 5.191659450531006, + "step": 10614 + }, + { + "epoch": 1.72, + "learning_rate": 4.94911303320425e-07, + "logits/chosen": -1.4478703737258911, + "logits/rejected": -1.4491080045700073, + "logps/chosen": -79.51617431640625, + "logps/rejected": -87.39625549316406, + "loss": 0.8006, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.515589952468872, + "rewards/margins": -0.8002800941467285, + "rewards/rejected": 3.3158700466156006, + "step": 10615 + }, + { + "epoch": 1.72, + "learning_rate": 4.943413589295231e-07, + "logits/chosen": -1.3960685729980469, + "logits/rejected": -1.4721075296401978, + "logps/chosen": -180.72312927246094, + "logps/rejected": -136.25265502929688, + "loss": 1.2585, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.745611667633057, + "rewards/margins": -1.3821382522583008, + "rewards/rejected": 6.127749919891357, + "step": 10616 + }, + { + "epoch": 1.72, + "learning_rate": 4.937717258370012e-07, + "logits/chosen": -0.9979566931724548, + "logits/rejected": -1.0025501251220703, + "logps/chosen": -1.2336971759796143, + "logps/rejected": -2.049009084701538, + "loss": 0.8678, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.18568943440914154, + "rewards/margins": -0.33690041303634644, + "rewards/rejected": 0.5225898623466492, + "step": 10617 + }, + { + "epoch": 1.72, + "learning_rate": 4.93202404082213e-07, + "logits/chosen": -1.3744319677352905, + "logits/rejected": -1.3897770643234253, + "logps/chosen": -100.90461730957031, + "logps/rejected": -80.76341247558594, + "loss": 0.1991, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2554779052734375, + "rewards/margins": 0.8884948492050171, + "rewards/rejected": 1.3669830560684204, + "step": 10618 + }, + { + "epoch": 1.72, + "learning_rate": 4.92633393704497e-07, + "logits/chosen": -1.4504272937774658, + "logits/rejected": -1.4516701698303223, + "logps/chosen": -70.42196655273438, + "logps/rejected": -120.00308227539062, + "loss": 0.7984, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.39119029045105, + "rewards/margins": 2.4242498874664307, + "rewards/rejected": 0.9669403433799744, + "step": 10619 + }, + { + "epoch": 1.72, + "learning_rate": 4.920646947431628e-07, + "logits/chosen": -1.0519225597381592, + "logits/rejected": -1.0104857683181763, + "logps/chosen": -29.20652198791504, + "logps/rejected": -4.023317813873291, + "loss": 0.2131, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.727420687675476, + "rewards/margins": 1.034454584121704, + "rewards/rejected": 0.6929660439491272, + "step": 10620 + }, + { + "epoch": 1.72, + "learning_rate": 4.914963072375051e-07, + "logits/chosen": -1.086186408996582, + "logits/rejected": -1.1477659940719604, + "logps/chosen": -27.09042739868164, + "logps/rejected": -127.31745910644531, + "loss": 0.1559, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.720166802406311, + "rewards/margins": 1.011751651763916, + "rewards/rejected": 0.7084152102470398, + "step": 10621 + }, + { + "epoch": 1.72, + "learning_rate": 4.909282312267915e-07, + "logits/chosen": -1.5321204662322998, + "logits/rejected": -1.3721463680267334, + "logps/chosen": -173.07894897460938, + "logps/rejected": -69.94593048095703, + "loss": 0.252, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.698240756988525, + "rewards/margins": 2.0038084983825684, + "rewards/rejected": 4.694432258605957, + "step": 10622 + }, + { + "epoch": 1.72, + "learning_rate": 4.903604667502726e-07, + "logits/chosen": -0.8035420775413513, + "logits/rejected": -0.8006919026374817, + "logps/chosen": -1.6646606922149658, + "logps/rejected": -1.108452320098877, + "loss": 0.3248, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41836977005004883, + "rewards/margins": 0.09719142317771912, + "rewards/rejected": 0.3211783468723297, + "step": 10623 + }, + { + "epoch": 1.72, + "learning_rate": 4.897930138471746e-07, + "logits/chosen": -1.1878916025161743, + "logits/rejected": -1.0361005067825317, + "logps/chosen": -95.50068664550781, + "logps/rejected": -56.19978332519531, + "loss": 0.0367, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.923892498016357, + "rewards/margins": 6.275415420532227, + "rewards/rejected": 1.6484771966934204, + "step": 10624 + }, + { + "epoch": 1.72, + "learning_rate": 4.892258725567034e-07, + "logits/chosen": -0.8201619982719421, + "logits/rejected": -0.8201619982719421, + "logps/chosen": -69.78521728515625, + "logps/rejected": -69.78521728515625, + "loss": 0.3485, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8595451712608337, + "rewards/margins": 0.0, + "rewards/rejected": 0.8595451712608337, + "step": 10625 + }, + { + "epoch": 1.72, + "learning_rate": 4.886590429180426e-07, + "logits/chosen": -1.673254132270813, + "logits/rejected": -1.6535651683807373, + "logps/chosen": -110.10574340820312, + "logps/rejected": -67.02063751220703, + "loss": 0.7133, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3991286754608154, + "rewards/margins": -1.0857856273651123, + "rewards/rejected": 4.484914302825928, + "step": 10626 + }, + { + "epoch": 1.72, + "learning_rate": 4.880925249703566e-07, + "logits/chosen": -1.1072208881378174, + "logits/rejected": -1.0480401515960693, + "logps/chosen": -87.55831909179688, + "logps/rejected": -107.12745666503906, + "loss": 0.4515, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.571760654449463, + "rewards/margins": 1.6321334838867188, + "rewards/rejected": 0.9396271109580994, + "step": 10627 + }, + { + "epoch": 1.73, + "learning_rate": 4.875263187527834e-07, + "logits/chosen": -1.0879074335098267, + "logits/rejected": -1.1139023303985596, + "logps/chosen": -44.437034606933594, + "logps/rejected": -57.60784912109375, + "loss": 0.8734, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7312417030334473, + "rewards/margins": -1.4982047080993652, + "rewards/rejected": 4.2294464111328125, + "step": 10628 + }, + { + "epoch": 1.73, + "learning_rate": 4.869604243044456e-07, + "logits/chosen": -1.370892882347107, + "logits/rejected": -1.445595383644104, + "logps/chosen": -69.05057525634766, + "logps/rejected": -87.0420913696289, + "loss": 1.2263, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5152747631073, + "rewards/margins": -2.3301498889923096, + "rewards/rejected": 5.845424652099609, + "step": 10629 + }, + { + "epoch": 1.73, + "learning_rate": 4.863948416644382e-07, + "logits/chosen": -1.5757367610931396, + "logits/rejected": -1.1350003480911255, + "logps/chosen": -131.3234405517578, + "logps/rejected": -136.28707885742188, + "loss": 0.1027, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.679131984710693, + "rewards/margins": 2.094245672225952, + "rewards/rejected": 2.584886312484741, + "step": 10630 + }, + { + "epoch": 1.73, + "learning_rate": 4.858295708718408e-07, + "logits/chosen": -1.0004856586456299, + "logits/rejected": -1.0004856586456299, + "logps/chosen": -43.922340393066406, + "logps/rejected": -43.922340393066406, + "loss": 0.4935, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.817066192626953, + "rewards/margins": 0.0, + "rewards/rejected": 2.817066192626953, + "step": 10631 + }, + { + "epoch": 1.73, + "learning_rate": 4.852646119657051e-07, + "logits/chosen": -1.6048316955566406, + "logits/rejected": -1.5015875101089478, + "logps/chosen": -52.440711975097656, + "logps/rejected": -38.48812484741211, + "loss": 0.3234, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3161041736602783, + "rewards/margins": 0.10075187683105469, + "rewards/rejected": 3.2153522968292236, + "step": 10632 + }, + { + "epoch": 1.73, + "learning_rate": 4.84699964985067e-07, + "logits/chosen": -1.7139695882797241, + "logits/rejected": -1.7778276205062866, + "logps/chosen": -165.024169921875, + "logps/rejected": -80.01484680175781, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.518780708312988, + "rewards/margins": 4.308711528778076, + "rewards/rejected": 5.210069179534912, + "step": 10633 + }, + { + "epoch": 1.73, + "learning_rate": 4.841356299689359e-07, + "logits/chosen": -0.7992249727249146, + "logits/rejected": -0.7730336785316467, + "logps/chosen": -42.60816192626953, + "logps/rejected": -21.542869567871094, + "loss": 0.774, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7846527099609375, + "rewards/margins": -0.3009093999862671, + "rewards/rejected": 1.0855621099472046, + "step": 10634 + }, + { + "epoch": 1.73, + "learning_rate": 4.835716069563046e-07, + "logits/chosen": -1.2304388284683228, + "logits/rejected": -1.2295377254486084, + "logps/chosen": -4.603633403778076, + "logps/rejected": -6.685519695281982, + "loss": 0.3671, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1952148973941803, + "rewards/margins": 0.12107572704553604, + "rewards/rejected": 0.07413917034864426, + "step": 10635 + }, + { + "epoch": 1.73, + "learning_rate": 4.830078959861395e-07, + "logits/chosen": -1.568074107170105, + "logits/rejected": -0.968116044998169, + "logps/chosen": -178.30397033691406, + "logps/rejected": -95.66064453125, + "loss": 0.6153, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.2740373611450195, + "rewards/margins": 2.987401008605957, + "rewards/rejected": 3.2866363525390625, + "step": 10636 + }, + { + "epoch": 1.73, + "learning_rate": 4.824444970973891e-07, + "logits/chosen": -1.156362533569336, + "logits/rejected": -1.0127997398376465, + "logps/chosen": -56.376708984375, + "logps/rejected": -17.453205108642578, + "loss": 0.1858, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6600807309150696, + "rewards/margins": 0.8279860615730286, + "rewards/rejected": -0.16790533065795898, + "step": 10637 + }, + { + "epoch": 1.73, + "learning_rate": 4.818814103289782e-07, + "logits/chosen": -1.2586253881454468, + "logits/rejected": -1.2966479063034058, + "logps/chosen": -40.031288146972656, + "logps/rejected": -103.72181701660156, + "loss": 2.3628, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.770338535308838, + "rewards/margins": -4.699937343597412, + "rewards/rejected": 8.47027587890625, + "step": 10638 + }, + { + "epoch": 1.73, + "learning_rate": 4.813186357198113e-07, + "logits/chosen": -1.3752027750015259, + "logits/rejected": -1.2023481130599976, + "logps/chosen": -79.72026824951172, + "logps/rejected": -113.8309326171875, + "loss": 1.2571, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.276975154876709, + "rewards/margins": -0.2765922546386719, + "rewards/rejected": 4.553567409515381, + "step": 10639 + }, + { + "epoch": 1.73, + "learning_rate": 4.807561733087695e-07, + "logits/chosen": -1.2283875942230225, + "logits/rejected": -1.2397361993789673, + "logps/chosen": -58.315128326416016, + "logps/rejected": -13.929144859313965, + "loss": 0.4078, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1756908893585205, + "rewards/margins": 0.4368909001350403, + "rewards/rejected": 0.7387999892234802, + "step": 10640 + }, + { + "epoch": 1.73, + "learning_rate": 4.801940231347164e-07, + "logits/chosen": -1.2716176509857178, + "logits/rejected": -1.1998881101608276, + "logps/chosen": -49.48265838623047, + "logps/rejected": -30.678232192993164, + "loss": 0.0627, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.203533172607422, + "rewards/margins": 2.729051113128662, + "rewards/rejected": 0.47448214888572693, + "step": 10641 + }, + { + "epoch": 1.73, + "learning_rate": 4.796321852364877e-07, + "logits/chosen": -1.2539476156234741, + "logits/rejected": -1.0334742069244385, + "logps/chosen": -58.237403869628906, + "logps/rejected": -44.16115951538086, + "loss": 0.2513, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8879966735839844, + "rewards/margins": 2.279026746749878, + "rewards/rejected": 0.6089698672294617, + "step": 10642 + }, + { + "epoch": 1.73, + "learning_rate": 4.790706596529043e-07, + "logits/chosen": -0.7175810933113098, + "logits/rejected": -0.7629578113555908, + "logps/chosen": -93.12360382080078, + "logps/rejected": -78.21601867675781, + "loss": 0.7053, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.355787754058838, + "rewards/margins": -0.7109389305114746, + "rewards/rejected": 3.0667266845703125, + "step": 10643 + }, + { + "epoch": 1.73, + "learning_rate": 4.785094464227597e-07, + "logits/chosen": -1.5815502405166626, + "logits/rejected": -1.5867795944213867, + "logps/chosen": -84.87594604492188, + "logps/rejected": -49.20361328125, + "loss": 0.3316, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8872551918029785, + "rewards/margins": 0.23950743675231934, + "rewards/rejected": 2.647747755050659, + "step": 10644 + }, + { + "epoch": 1.73, + "learning_rate": 4.77948545584831e-07, + "logits/chosen": -1.203460454940796, + "logits/rejected": -1.1772754192352295, + "logps/chosen": -82.37828826904297, + "logps/rejected": -38.01408004760742, + "loss": 0.8472, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.19217848777771, + "rewards/margins": -1.4843173027038574, + "rewards/rejected": 3.6764957904815674, + "step": 10645 + }, + { + "epoch": 1.73, + "learning_rate": 4.773879571778684e-07, + "logits/chosen": -1.4279507398605347, + "logits/rejected": -1.4319536685943604, + "logps/chosen": -163.88641357421875, + "logps/rejected": -83.0960693359375, + "loss": 0.1118, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.775494337081909, + "rewards/margins": 1.5593338012695312, + "rewards/rejected": 2.216160535812378, + "step": 10646 + }, + { + "epoch": 1.73, + "learning_rate": 4.7682768124060476e-07, + "logits/chosen": -1.2215625047683716, + "logits/rejected": -1.1383705139160156, + "logps/chosen": -34.38981628417969, + "logps/rejected": -16.201431274414062, + "loss": 0.3192, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.430561065673828, + "rewards/margins": 0.15100312232971191, + "rewards/rejected": 2.279557943344116, + "step": 10647 + }, + { + "epoch": 1.73, + "learning_rate": 4.762677178117503e-07, + "logits/chosen": -1.476819396018982, + "logits/rejected": -1.508293867111206, + "logps/chosen": -39.47133255004883, + "logps/rejected": -107.4859848022461, + "loss": 2.639, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.592620611190796, + "rewards/margins": -1.8823392391204834, + "rewards/rejected": 5.474959850311279, + "step": 10648 + }, + { + "epoch": 1.73, + "learning_rate": 4.757080669299924e-07, + "logits/chosen": -1.368751049041748, + "logits/rejected": -1.1742323637008667, + "logps/chosen": -162.74964904785156, + "logps/rejected": -204.39224243164062, + "loss": 0.4521, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.291984558105469, + "rewards/margins": -0.011052131652832031, + "rewards/rejected": 6.303036689758301, + "step": 10649 + }, + { + "epoch": 1.73, + "learning_rate": 4.7514872863399787e-07, + "logits/chosen": -1.1875507831573486, + "logits/rejected": -1.1615177392959595, + "logps/chosen": -72.20442199707031, + "logps/rejected": -23.10932159423828, + "loss": 0.9873, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2200927734375, + "rewards/margins": -1.8197579383850098, + "rewards/rejected": 4.03985071182251, + "step": 10650 + }, + { + "epoch": 1.73, + "learning_rate": 4.745897029624114e-07, + "logits/chosen": -1.1651344299316406, + "logits/rejected": -1.167690396308899, + "logps/chosen": -26.451379776000977, + "logps/rejected": -24.058303833007812, + "loss": 0.4259, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9262232184410095, + "rewards/margins": 0.1226457953453064, + "rewards/rejected": 0.8035774230957031, + "step": 10651 + }, + { + "epoch": 1.73, + "learning_rate": 4.74030989953857e-07, + "logits/chosen": -0.9839181900024414, + "logits/rejected": -1.011457085609436, + "logps/chosen": -48.57038116455078, + "logps/rejected": -72.98867797851562, + "loss": 0.282, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.681567430496216, + "rewards/margins": 0.3573188781738281, + "rewards/rejected": 2.3242485523223877, + "step": 10652 + }, + { + "epoch": 1.73, + "learning_rate": 4.7347258964693553e-07, + "logits/chosen": -1.020275592803955, + "logits/rejected": -1.020275592803955, + "logps/chosen": -54.06725311279297, + "logps/rejected": -54.06725311279297, + "loss": 0.3589, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8955705165863037, + "rewards/margins": 0.0, + "rewards/rejected": 2.8955705165863037, + "step": 10653 + }, + { + "epoch": 1.73, + "learning_rate": 4.7291450208022836e-07, + "logits/chosen": -0.8326171636581421, + "logits/rejected": -0.8326171636581421, + "logps/chosen": -61.43037796020508, + "logps/rejected": -61.43037796020508, + "loss": 1.8104, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7138142585754395, + "rewards/margins": 0.0, + "rewards/rejected": 2.7138142585754395, + "step": 10654 + }, + { + "epoch": 1.73, + "learning_rate": 4.72356727292293e-07, + "logits/chosen": -1.5466582775115967, + "logits/rejected": -1.5231951475143433, + "logps/chosen": -119.22959899902344, + "logps/rejected": -97.38839721679688, + "loss": 0.5973, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.328451633453369, + "rewards/margins": -0.18595266342163086, + "rewards/rejected": 2.514404296875, + "step": 10655 + }, + { + "epoch": 1.73, + "learning_rate": 4.7179926532166744e-07, + "logits/chosen": -1.2600531578063965, + "logits/rejected": -1.3197911977767944, + "logps/chosen": -61.185081481933594, + "logps/rejected": -53.657745361328125, + "loss": 0.5626, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.544689178466797, + "rewards/margins": 0.17220830917358398, + "rewards/rejected": 2.372480869293213, + "step": 10656 + }, + { + "epoch": 1.73, + "learning_rate": 4.7124211620686533e-07, + "logits/chosen": -1.3314039707183838, + "logits/rejected": -1.2623310089111328, + "logps/chosen": -68.24142456054688, + "logps/rejected": -31.818462371826172, + "loss": 0.6393, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3270881175994873, + "rewards/margins": -0.0766446590423584, + "rewards/rejected": 2.4037327766418457, + "step": 10657 + }, + { + "epoch": 1.73, + "learning_rate": 4.706852799863826e-07, + "logits/chosen": -1.378301739692688, + "logits/rejected": -1.3364437818527222, + "logps/chosen": -108.40326690673828, + "logps/rejected": -91.49824523925781, + "loss": 0.307, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9922311305999756, + "rewards/margins": 0.19478225708007812, + "rewards/rejected": 2.7974488735198975, + "step": 10658 + }, + { + "epoch": 1.73, + "learning_rate": 4.701287566986895e-07, + "logits/chosen": -1.2158395051956177, + "logits/rejected": -1.2033131122589111, + "logps/chosen": -46.06482696533203, + "logps/rejected": -84.62059020996094, + "loss": 1.7613, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.64226233959198, + "rewards/margins": 0.2955803871154785, + "rewards/rejected": 1.3466819524765015, + "step": 10659 + }, + { + "epoch": 1.73, + "learning_rate": 4.695725463822376e-07, + "logits/chosen": -1.1432322263717651, + "logits/rejected": -1.091536521911621, + "logps/chosen": -104.54676818847656, + "logps/rejected": -51.742347717285156, + "loss": 0.1485, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.844505310058594, + "rewards/margins": 4.109941005706787, + "rewards/rejected": 1.734564185142517, + "step": 10660 + }, + { + "epoch": 1.73, + "learning_rate": 4.69016649075455e-07, + "logits/chosen": -0.7206127643585205, + "logits/rejected": -0.7193549871444702, + "logps/chosen": -1.6294060945510864, + "logps/rejected": -4.324997901916504, + "loss": 0.3635, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.26597291231155396, + "rewards/margins": -0.013468414545059204, + "rewards/rejected": 0.27944132685661316, + "step": 10661 + }, + { + "epoch": 1.73, + "learning_rate": 4.6846106481675035e-07, + "logits/chosen": -1.299057126045227, + "logits/rejected": -1.3199520111083984, + "logps/chosen": -66.95794677734375, + "logps/rejected": -97.31310272216797, + "loss": 0.6354, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8134872913360596, + "rewards/margins": 1.8992087841033936, + "rewards/rejected": 0.9142784476280212, + "step": 10662 + }, + { + "epoch": 1.73, + "learning_rate": 4.6790579364450694e-07, + "logits/chosen": -1.649194359779358, + "logits/rejected": -1.6864901781082153, + "logps/chosen": -78.17362976074219, + "logps/rejected": -75.43527221679688, + "loss": 0.2873, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9440956115722656, + "rewards/margins": 0.39023351669311523, + "rewards/rejected": 2.5538620948791504, + "step": 10663 + }, + { + "epoch": 1.73, + "learning_rate": 4.673508355970918e-07, + "logits/chosen": -1.4606318473815918, + "logits/rejected": -1.4774389266967773, + "logps/chosen": -152.5376739501953, + "logps/rejected": -25.045801162719727, + "loss": 0.0591, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.442169189453125, + "rewards/margins": 2.104649066925049, + "rewards/rejected": 0.33752021193504333, + "step": 10664 + }, + { + "epoch": 1.73, + "learning_rate": 4.667961907128438e-07, + "logits/chosen": -1.325792670249939, + "logits/rejected": -1.3908926248550415, + "logps/chosen": -93.47531127929688, + "logps/rejected": -96.70804595947266, + "loss": 1.1041, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.8837921619415283, + "rewards/margins": -0.027907609939575195, + "rewards/rejected": 3.9116997718811035, + "step": 10665 + }, + { + "epoch": 1.73, + "learning_rate": 4.6624185903008713e-07, + "logits/chosen": -0.8790669441223145, + "logits/rejected": -0.8790669441223145, + "logps/chosen": -2.1025044918060303, + "logps/rejected": -2.1025044918060303, + "loss": 0.894, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.28730309009552, + "rewards/margins": 0.0, + "rewards/rejected": 0.28730309009552, + "step": 10666 + }, + { + "epoch": 1.73, + "learning_rate": 4.656878405871179e-07, + "logits/chosen": -1.0352978706359863, + "logits/rejected": -1.0381426811218262, + "logps/chosen": -6.309914588928223, + "logps/rejected": -7.082230091094971, + "loss": 0.5096, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.45495644211769104, + "rewards/margins": -0.16853931546211243, + "rewards/rejected": 0.6234957575798035, + "step": 10667 + }, + { + "epoch": 1.73, + "learning_rate": 4.6513413542221554e-07, + "logits/chosen": -0.8819365501403809, + "logits/rejected": -0.9586806893348694, + "logps/chosen": -53.027469635009766, + "logps/rejected": -35.00385284423828, + "loss": 0.7668, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2858288288116455, + "rewards/margins": -0.5428668260574341, + "rewards/rejected": 1.8286956548690796, + "step": 10668 + }, + { + "epoch": 1.73, + "learning_rate": 4.645807435736355e-07, + "logits/chosen": -0.9522435069084167, + "logits/rejected": -0.9467426538467407, + "logps/chosen": -1.4535751342773438, + "logps/rejected": -10.249022483825684, + "loss": 0.273, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38047486543655396, + "rewards/margins": 0.33775463700294495, + "rewards/rejected": 0.04272022470831871, + "step": 10669 + }, + { + "epoch": 1.73, + "learning_rate": 4.640276650796105e-07, + "logits/chosen": -1.210658311843872, + "logits/rejected": -1.0485512018203735, + "logps/chosen": -74.9117202758789, + "logps/rejected": -11.77987289428711, + "loss": 0.852, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.647813558578491, + "rewards/margins": 3.2389109134674072, + "rewards/rejected": 0.40890273451805115, + "step": 10670 + }, + { + "epoch": 1.73, + "learning_rate": 4.63474899978355e-07, + "logits/chosen": -1.2219303846359253, + "logits/rejected": -1.208395004272461, + "logps/chosen": -41.010440826416016, + "logps/rejected": -77.03494262695312, + "loss": 0.2212, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.31235933303833, + "rewards/margins": 1.6505208015441895, + "rewards/rejected": 1.6618385314941406, + "step": 10671 + }, + { + "epoch": 1.73, + "learning_rate": 4.62922448308058e-07, + "logits/chosen": -1.9476395845413208, + "logits/rejected": -1.9090914726257324, + "logps/chosen": -47.66004943847656, + "logps/rejected": -178.323974609375, + "loss": 2.6657, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5871803760528564, + "rewards/margins": -4.54872989654541, + "rewards/rejected": 7.1359100341796875, + "step": 10672 + }, + { + "epoch": 1.73, + "learning_rate": 4.6237031010689047e-07, + "logits/chosen": -1.5123802423477173, + "logits/rejected": -1.4689457416534424, + "logps/chosen": -53.22129821777344, + "logps/rejected": -80.91679382324219, + "loss": 0.2882, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.867315649986267, + "rewards/margins": 0.2878890037536621, + "rewards/rejected": 1.579426646232605, + "step": 10673 + }, + { + "epoch": 1.73, + "learning_rate": 4.618184854129981e-07, + "logits/chosen": -1.139055848121643, + "logits/rejected": -1.139055848121643, + "logps/chosen": -12.442521095275879, + "logps/rejected": -12.442521095275879, + "loss": 0.3659, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1319631338119507, + "rewards/margins": 0.0, + "rewards/rejected": 1.1319631338119507, + "step": 10674 + }, + { + "epoch": 1.73, + "learning_rate": 4.6126697426450873e-07, + "logits/chosen": -1.5759353637695312, + "logits/rejected": -1.425362229347229, + "logps/chosen": -54.89889144897461, + "logps/rejected": -70.33513641357422, + "loss": 1.4699, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9941914081573486, + "rewards/margins": -2.6395442485809326, + "rewards/rejected": 4.633735656738281, + "step": 10675 + }, + { + "epoch": 1.73, + "learning_rate": 4.607157766995246e-07, + "logits/chosen": -1.4964723587036133, + "logits/rejected": -1.4803581237792969, + "logps/chosen": -89.23103332519531, + "logps/rejected": -72.46467590332031, + "loss": 1.6496, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.360160827636719, + "rewards/margins": 1.1853225231170654, + "rewards/rejected": 3.1748383045196533, + "step": 10676 + }, + { + "epoch": 1.73, + "learning_rate": 4.6016489275612977e-07, + "logits/chosen": -1.1245594024658203, + "logits/rejected": -1.1245594024658203, + "logps/chosen": -123.8007583618164, + "logps/rejected": -123.8007583618164, + "loss": 0.3578, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7999749183654785, + "rewards/margins": 0.0, + "rewards/rejected": 2.7999749183654785, + "step": 10677 + }, + { + "epoch": 1.73, + "learning_rate": 4.596143224723842e-07, + "logits/chosen": -1.3544745445251465, + "logits/rejected": -1.337897539138794, + "logps/chosen": -79.16484069824219, + "logps/rejected": -122.67005920410156, + "loss": 0.2607, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.600727915763855, + "rewards/margins": 0.9873444437980652, + "rewards/rejected": 0.6133834719657898, + "step": 10678 + }, + { + "epoch": 1.73, + "learning_rate": 4.5906406588632766e-07, + "logits/chosen": -1.1542389392852783, + "logits/rejected": -0.9475511312484741, + "logps/chosen": -57.6246337890625, + "logps/rejected": -11.692729949951172, + "loss": 0.0995, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6788766384124756, + "rewards/margins": 1.9072175025939941, + "rewards/rejected": 0.7716590762138367, + "step": 10679 + }, + { + "epoch": 1.73, + "learning_rate": 4.5851412303597685e-07, + "logits/chosen": -1.1270766258239746, + "logits/rejected": -1.1270766258239746, + "logps/chosen": -15.898200988769531, + "logps/rejected": -15.898200988769531, + "loss": 0.3581, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4456583261489868, + "rewards/margins": 0.0, + "rewards/rejected": 1.4456583261489868, + "step": 10680 + }, + { + "epoch": 1.73, + "learning_rate": 4.579644939593292e-07, + "logits/chosen": -1.3832275867462158, + "logits/rejected": -1.2844607830047607, + "logps/chosen": -173.0738983154297, + "logps/rejected": -95.82707214355469, + "loss": 0.2749, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.804103374481201, + "rewards/margins": 0.6740221977233887, + "rewards/rejected": 7.1300811767578125, + "step": 10681 + }, + { + "epoch": 1.73, + "learning_rate": 4.57415178694357e-07, + "logits/chosen": -1.0471357107162476, + "logits/rejected": -0.9794835448265076, + "logps/chosen": -86.34347534179688, + "logps/rejected": -50.834869384765625, + "loss": 0.5448, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5000221729278564, + "rewards/margins": -0.35839009284973145, + "rewards/rejected": 2.858412265777588, + "step": 10682 + }, + { + "epoch": 1.73, + "learning_rate": 4.568661772790145e-07, + "logits/chosen": -1.5967934131622314, + "logits/rejected": -1.5956770181655884, + "logps/chosen": -44.330841064453125, + "logps/rejected": -62.014530181884766, + "loss": 0.4146, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7831337451934814, + "rewards/margins": 0.9089428186416626, + "rewards/rejected": 0.8741909265518188, + "step": 10683 + }, + { + "epoch": 1.73, + "learning_rate": 4.563174897512307e-07, + "logits/chosen": -1.5273171663284302, + "logits/rejected": -1.6474581956863403, + "logps/chosen": -183.63560485839844, + "logps/rejected": -172.23968505859375, + "loss": 0.6333, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.857853889465332, + "rewards/margins": 0.1988048553466797, + "rewards/rejected": 8.659049034118652, + "step": 10684 + }, + { + "epoch": 1.73, + "learning_rate": 4.5576911614891637e-07, + "logits/chosen": -1.1117351055145264, + "logits/rejected": -1.1596695184707642, + "logps/chosen": -108.21626281738281, + "logps/rejected": -127.23182678222656, + "loss": 0.8207, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.98101806640625, + "rewards/margins": 1.6430083513259888, + "rewards/rejected": 1.3380097150802612, + "step": 10685 + }, + { + "epoch": 1.73, + "learning_rate": 4.55221056509958e-07, + "logits/chosen": -1.3673168420791626, + "logits/rejected": -1.2995798587799072, + "logps/chosen": -146.44479370117188, + "logps/rejected": -45.230674743652344, + "loss": 1.3867, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.2347259521484375, + "rewards/margins": 0.8435173034667969, + "rewards/rejected": 4.391208648681641, + "step": 10686 + }, + { + "epoch": 1.73, + "learning_rate": 4.5467331087222177e-07, + "logits/chosen": -1.5611135959625244, + "logits/rejected": -1.6324232816696167, + "logps/chosen": -103.0594482421875, + "logps/rejected": -145.68788146972656, + "loss": 0.9991, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.357141017913818, + "rewards/margins": -1.8019061088562012, + "rewards/rejected": 9.15904712677002, + "step": 10687 + }, + { + "epoch": 1.73, + "learning_rate": 4.5412587927355033e-07, + "logits/chosen": -1.3244616985321045, + "logits/rejected": -1.3244616985321045, + "logps/chosen": -47.645538330078125, + "logps/rejected": -47.645538330078125, + "loss": 0.4042, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.39314690232276917, + "rewards/margins": 0.0, + "rewards/rejected": 0.39314690232276917, + "step": 10688 + }, + { + "epoch": 1.73, + "learning_rate": 4.535787617517684e-07, + "logits/chosen": -1.5348200798034668, + "logits/rejected": -1.5541913509368896, + "logps/chosen": -12.513381004333496, + "logps/rejected": -18.799848556518555, + "loss": 0.6775, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6937355995178223, + "rewards/margins": -0.94565749168396, + "rewards/rejected": 1.6393930912017822, + "step": 10689 + }, + { + "epoch": 1.74, + "learning_rate": 4.5303195834467463e-07, + "logits/chosen": -1.251897931098938, + "logits/rejected": -1.2330021858215332, + "logps/chosen": -52.81196212768555, + "logps/rejected": -84.29983520507812, + "loss": 0.5539, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3310048580169678, + "rewards/margins": 1.2115544080734253, + "rewards/rejected": 1.1194504499435425, + "step": 10690 + }, + { + "epoch": 1.74, + "learning_rate": 4.5248546909004984e-07, + "logits/chosen": -1.3444958925247192, + "logits/rejected": -1.459324598312378, + "logps/chosen": -64.40276336669922, + "logps/rejected": -70.84542846679688, + "loss": 2.6868, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.452693223953247, + "rewards/margins": -3.785423517227173, + "rewards/rejected": 6.23811674118042, + "step": 10691 + }, + { + "epoch": 1.74, + "learning_rate": 4.5193929402564894e-07, + "logits/chosen": -1.547837257385254, + "logits/rejected": -1.4670125246047974, + "logps/chosen": -78.85432434082031, + "logps/rejected": -14.285791397094727, + "loss": 0.045, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.721486806869507, + "rewards/margins": 2.4337780475616455, + "rewards/rejected": 0.2877088487148285, + "step": 10692 + }, + { + "epoch": 1.74, + "learning_rate": 4.5139343318920946e-07, + "logits/chosen": -1.5066823959350586, + "logits/rejected": -1.4965541362762451, + "logps/chosen": -55.55243682861328, + "logps/rejected": -55.51688003540039, + "loss": 0.3233, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0166404247283936, + "rewards/margins": 0.7570657730102539, + "rewards/rejected": 2.2595746517181396, + "step": 10693 + }, + { + "epoch": 1.74, + "learning_rate": 4.508478866184435e-07, + "logits/chosen": -1.0445082187652588, + "logits/rejected": -1.0332729816436768, + "logps/chosen": -48.02572250366211, + "logps/rejected": -54.44256591796875, + "loss": 0.4285, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.188854694366455, + "rewards/margins": -0.26533079147338867, + "rewards/rejected": 2.4541854858398438, + "step": 10694 + }, + { + "epoch": 1.74, + "learning_rate": 4.5030265435104523e-07, + "logits/chosen": -0.8113870024681091, + "logits/rejected": -0.8113870024681091, + "logps/chosen": -0.20799168944358826, + "logps/rejected": -0.20799168944358826, + "loss": 0.4752, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.10455633699893951, + "rewards/margins": 0.0, + "rewards/rejected": 0.10455633699893951, + "step": 10695 + }, + { + "epoch": 1.74, + "learning_rate": 4.49757736424683e-07, + "logits/chosen": -1.1653746366500854, + "logits/rejected": -1.106590747833252, + "logps/chosen": -65.7847671508789, + "logps/rejected": -71.76075744628906, + "loss": 0.8706, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.781482696533203, + "rewards/margins": -1.2181823253631592, + "rewards/rejected": 3.9996650218963623, + "step": 10696 + }, + { + "epoch": 1.74, + "learning_rate": 4.492131328770072e-07, + "logits/chosen": -1.283508539199829, + "logits/rejected": -1.2873104810714722, + "logps/chosen": -58.44812774658203, + "logps/rejected": -51.85807800292969, + "loss": 1.1109, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8279266357421875, + "rewards/margins": 0.3866363763809204, + "rewards/rejected": 1.441290259361267, + "step": 10697 + }, + { + "epoch": 1.74, + "learning_rate": 4.4866884374564267e-07, + "logits/chosen": -1.190504550933838, + "logits/rejected": -1.1946848630905151, + "logps/chosen": -71.93276977539062, + "logps/rejected": -73.19611358642578, + "loss": 0.5568, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.18974232673645, + "rewards/margins": -0.7061607837677002, + "rewards/rejected": 3.8959031105041504, + "step": 10698 + }, + { + "epoch": 1.74, + "learning_rate": 4.4812486906819774e-07, + "logits/chosen": -1.447317361831665, + "logits/rejected": -1.3578598499298096, + "logps/chosen": -174.576904296875, + "logps/rejected": -71.96802520751953, + "loss": 0.1138, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.951241970062256, + "rewards/margins": 4.319761276245117, + "rewards/rejected": 3.6314804553985596, + "step": 10699 + }, + { + "epoch": 1.74, + "learning_rate": 4.4758120888225234e-07, + "logits/chosen": -1.4310500621795654, + "logits/rejected": -1.22581946849823, + "logps/chosen": -75.50387573242188, + "logps/rejected": -33.839176177978516, + "loss": 0.0696, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.337116241455078, + "rewards/margins": 3.314976453781128, + "rewards/rejected": -0.9778602719306946, + "step": 10700 + }, + { + "epoch": 1.74, + "learning_rate": 4.470378632253708e-07, + "logits/chosen": -1.0921823978424072, + "logits/rejected": -1.1900010108947754, + "logps/chosen": -57.71912384033203, + "logps/rejected": -52.569122314453125, + "loss": 0.614, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.000263214111328, + "rewards/margins": -0.3322021961212158, + "rewards/rejected": 3.332465410232544, + "step": 10701 + }, + { + "epoch": 1.74, + "learning_rate": 4.4649483213509257e-07, + "logits/chosen": -1.2930089235305786, + "logits/rejected": -1.2546186447143555, + "logps/chosen": -40.20942306518555, + "logps/rejected": -56.624664306640625, + "loss": 0.2084, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9200589656829834, + "rewards/margins": 0.7477734088897705, + "rewards/rejected": 2.172285556793213, + "step": 10702 + }, + { + "epoch": 1.74, + "learning_rate": 4.459521156489355e-07, + "logits/chosen": -1.3650459051132202, + "logits/rejected": -1.240421175956726, + "logps/chosen": -114.78550720214844, + "logps/rejected": -115.12548065185547, + "loss": 0.0746, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.858165264129639, + "rewards/margins": 1.8324930667877197, + "rewards/rejected": 3.025672197341919, + "step": 10703 + }, + { + "epoch": 1.74, + "learning_rate": 4.454097138043967e-07, + "logits/chosen": -1.6602967977523804, + "logits/rejected": -1.5001928806304932, + "logps/chosen": -121.34841918945312, + "logps/rejected": -15.354421615600586, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.51821756362915, + "rewards/margins": 4.51608419418335, + "rewards/rejected": 1.0021333694458008, + "step": 10704 + }, + { + "epoch": 1.74, + "learning_rate": 4.448676266389501e-07, + "logits/chosen": -1.3398361206054688, + "logits/rejected": -1.2635211944580078, + "logps/chosen": -48.045867919921875, + "logps/rejected": -25.710857391357422, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.07504415512085, + "rewards/margins": 3.8345279693603516, + "rewards/rejected": 1.2405163049697876, + "step": 10705 + }, + { + "epoch": 1.74, + "learning_rate": 4.443258541900508e-07, + "logits/chosen": -1.2717359066009521, + "logits/rejected": -1.2717359066009521, + "logps/chosen": -64.13594055175781, + "logps/rejected": -64.13594055175781, + "loss": 1.3887, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.144903659820557, + "rewards/margins": 0.0, + "rewards/rejected": 4.144903659820557, + "step": 10706 + }, + { + "epoch": 1.74, + "learning_rate": 4.4378439649512716e-07, + "logits/chosen": -1.5260857343673706, + "logits/rejected": -1.3854535818099976, + "logps/chosen": -167.35916137695312, + "logps/rejected": -180.56626892089844, + "loss": 0.2738, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.2150163650512695, + "rewards/margins": 0.3648042678833008, + "rewards/rejected": 6.850212097167969, + "step": 10707 + }, + { + "epoch": 1.74, + "learning_rate": 4.4324325359159146e-07, + "logits/chosen": -1.2966228723526, + "logits/rejected": -1.3079357147216797, + "logps/chosen": -92.19064331054688, + "logps/rejected": -81.51847839355469, + "loss": 0.6082, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.957684338092804, + "rewards/margins": -0.8536773324012756, + "rewards/rejected": 1.8113616704940796, + "step": 10708 + }, + { + "epoch": 1.74, + "learning_rate": 4.4270242551682993e-07, + "logits/chosen": -1.0320463180541992, + "logits/rejected": -1.0276615619659424, + "logps/chosen": -3.0143637657165527, + "logps/rejected": -1.7766107320785522, + "loss": 0.6686, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33783864974975586, + "rewards/margins": 0.10089902579784393, + "rewards/rejected": 0.23693962395191193, + "step": 10709 + }, + { + "epoch": 1.74, + "learning_rate": 4.421619123082099e-07, + "logits/chosen": -1.3899050951004028, + "logits/rejected": -1.2811510562896729, + "logps/chosen": -98.36334228515625, + "logps/rejected": -18.570457458496094, + "loss": 0.9095, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.366198778152466, + "rewards/margins": 2.2478480339050293, + "rewards/rejected": 0.1183507964015007, + "step": 10710 + }, + { + "epoch": 1.74, + "learning_rate": 4.416217140030743e-07, + "logits/chosen": -1.6046746969223022, + "logits/rejected": -1.586133360862732, + "logps/chosen": -57.30167770385742, + "logps/rejected": -62.65498352050781, + "loss": 0.4367, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.104985475540161, + "rewards/margins": 0.09552574157714844, + "rewards/rejected": 2.0094597339630127, + "step": 10711 + }, + { + "epoch": 1.74, + "learning_rate": 4.4108183063874765e-07, + "logits/chosen": -1.2810815572738647, + "logits/rejected": -1.1028798818588257, + "logps/chosen": -110.3272476196289, + "logps/rejected": -43.02198028564453, + "loss": 0.1551, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.337107181549072, + "rewards/margins": 3.7747650146484375, + "rewards/rejected": 2.5623421669006348, + "step": 10712 + }, + { + "epoch": 1.74, + "learning_rate": 4.4054226225252847e-07, + "logits/chosen": -1.0115333795547485, + "logits/rejected": -0.9570935368537903, + "logps/chosen": -64.75353240966797, + "logps/rejected": -97.10646057128906, + "loss": 1.4227, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.020918369293213, + "rewards/margins": -0.29896092414855957, + "rewards/rejected": 3.3198792934417725, + "step": 10713 + }, + { + "epoch": 1.74, + "learning_rate": 4.4000300888169753e-07, + "logits/chosen": -1.4145921468734741, + "logits/rejected": -1.410791277885437, + "logps/chosen": -60.96300506591797, + "logps/rejected": -52.863800048828125, + "loss": 0.4586, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2305610179901123, + "rewards/margins": 0.2469397783279419, + "rewards/rejected": 1.9836212396621704, + "step": 10714 + }, + { + "epoch": 1.74, + "learning_rate": 4.3946407056351115e-07, + "logits/chosen": -0.9707031846046448, + "logits/rejected": -0.9707031846046448, + "logps/chosen": -21.524171829223633, + "logps/rejected": -21.524171829223633, + "loss": 0.8049, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.442056179046631, + "rewards/margins": 0.0, + "rewards/rejected": 2.442056179046631, + "step": 10715 + }, + { + "epoch": 1.74, + "learning_rate": 4.3892544733520617e-07, + "logits/chosen": -1.6083862781524658, + "logits/rejected": -1.4941471815109253, + "logps/chosen": -86.89568328857422, + "logps/rejected": -161.74880981445312, + "loss": 0.9526, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.348701000213623, + "rewards/margins": -1.6969900131225586, + "rewards/rejected": 7.045691013336182, + "step": 10716 + }, + { + "epoch": 1.74, + "learning_rate": 4.3838713923399443e-07, + "logits/chosen": -1.5247948169708252, + "logits/rejected": -1.4873906373977661, + "logps/chosen": -65.5516586303711, + "logps/rejected": -58.42820739746094, + "loss": 0.3723, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.56575083732605, + "rewards/margins": 0.17468023300170898, + "rewards/rejected": 2.391070604324341, + "step": 10717 + }, + { + "epoch": 1.74, + "learning_rate": 4.3784914629706964e-07, + "logits/chosen": -0.9020330905914307, + "logits/rejected": -0.8946773409843445, + "logps/chosen": -1.274284839630127, + "logps/rejected": -2.0751469135284424, + "loss": 0.3921, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2681696116924286, + "rewards/margins": 0.08426158130168915, + "rewards/rejected": 0.18390803039073944, + "step": 10718 + }, + { + "epoch": 1.74, + "learning_rate": 4.373114685616009e-07, + "logits/chosen": -1.084159016609192, + "logits/rejected": -1.0661907196044922, + "logps/chosen": -55.90900802612305, + "logps/rejected": -30.137842178344727, + "loss": 0.6375, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.035634994506836, + "rewards/margins": -0.5397558212280273, + "rewards/rejected": 2.5753908157348633, + "step": 10719 + }, + { + "epoch": 1.74, + "learning_rate": 4.367741060647379e-07, + "logits/chosen": -1.0475666522979736, + "logits/rejected": -1.0495750904083252, + "logps/chosen": -52.093231201171875, + "logps/rejected": -106.9498519897461, + "loss": 0.1062, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1947219371795654, + "rewards/margins": 1.716375708580017, + "rewards/rejected": 0.4783462584018707, + "step": 10720 + }, + { + "epoch": 1.74, + "learning_rate": 4.3623705884360544e-07, + "logits/chosen": -1.3257988691329956, + "logits/rejected": -1.2958704233169556, + "logps/chosen": -83.17108154296875, + "logps/rejected": -51.488712310791016, + "loss": 0.6213, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4356887340545654, + "rewards/margins": -0.5143461227416992, + "rewards/rejected": 2.9500348567962646, + "step": 10721 + }, + { + "epoch": 1.74, + "learning_rate": 4.357003269353105e-07, + "logits/chosen": -1.3932474851608276, + "logits/rejected": -1.3245725631713867, + "logps/chosen": -110.4460220336914, + "logps/rejected": -61.93576431274414, + "loss": 0.0457, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.790865421295166, + "rewards/margins": 2.392058849334717, + "rewards/rejected": 4.398806571960449, + "step": 10722 + }, + { + "epoch": 1.74, + "learning_rate": 4.351639103769345e-07, + "logits/chosen": -1.4240614175796509, + "logits/rejected": -1.368031620979309, + "logps/chosen": -66.1741943359375, + "logps/rejected": -44.292537689208984, + "loss": 0.044, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.187652587890625, + "rewards/margins": 2.425645112991333, + "rewards/rejected": 2.762007474899292, + "step": 10723 + }, + { + "epoch": 1.74, + "learning_rate": 4.3462780920554e-07, + "logits/chosen": -1.165704369544983, + "logits/rejected": -1.0842254161834717, + "logps/chosen": -90.91856384277344, + "logps/rejected": -88.51554107666016, + "loss": 0.453, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.419694662094116, + "rewards/margins": -0.3216848373413086, + "rewards/rejected": 2.741379499435425, + "step": 10724 + }, + { + "epoch": 1.74, + "learning_rate": 4.340920234581653e-07, + "logits/chosen": -1.7050238847732544, + "logits/rejected": -1.6656330823898315, + "logps/chosen": -65.21731567382812, + "logps/rejected": -70.40303039550781, + "loss": 0.112, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.670114278793335, + "rewards/margins": 1.3917512893676758, + "rewards/rejected": 2.278362989425659, + "step": 10725 + }, + { + "epoch": 1.74, + "learning_rate": 4.3355655317182944e-07, + "logits/chosen": -1.6111267805099487, + "logits/rejected": -1.5011917352676392, + "logps/chosen": -67.80078125, + "logps/rejected": -44.580902099609375, + "loss": 0.3124, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2300087213516235, + "rewards/margins": 0.7392898797988892, + "rewards/rejected": 0.4907188415527344, + "step": 10726 + }, + { + "epoch": 1.74, + "learning_rate": 4.3302139838352687e-07, + "logits/chosen": -1.4243216514587402, + "logits/rejected": -1.3520961999893188, + "logps/chosen": -134.52377319335938, + "logps/rejected": -78.02603149414062, + "loss": 0.1502, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.311259746551514, + "rewards/margins": 1.0634827613830566, + "rewards/rejected": 5.247776985168457, + "step": 10727 + }, + { + "epoch": 1.74, + "learning_rate": 4.3248655913023353e-07, + "logits/chosen": -1.1276935338974, + "logits/rejected": -1.1350220441818237, + "logps/chosen": -52.174434661865234, + "logps/rejected": -79.89375305175781, + "loss": 0.2409, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9631344079971313, + "rewards/margins": 0.7309825420379639, + "rewards/rejected": 1.2321518659591675, + "step": 10728 + }, + { + "epoch": 1.74, + "learning_rate": 4.319520354488993e-07, + "logits/chosen": -1.6281061172485352, + "logits/rejected": -1.793489933013916, + "logps/chosen": -130.519287109375, + "logps/rejected": -229.46043395996094, + "loss": 2.7604, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.18544340133667, + "rewards/margins": -3.7964491844177246, + "rewards/rejected": 7.9818925857543945, + "step": 10729 + }, + { + "epoch": 1.74, + "learning_rate": 4.3141782737645743e-07, + "logits/chosen": -0.9124028086662292, + "logits/rejected": -0.9498122930526733, + "logps/chosen": -6.336236000061035, + "logps/rejected": -28.043201446533203, + "loss": 0.5142, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.41141054034233093, + "rewards/margins": -0.4930221736431122, + "rewards/rejected": 0.9044327139854431, + "step": 10730 + }, + { + "epoch": 1.74, + "learning_rate": 4.308839349498145e-07, + "logits/chosen": -1.4356657266616821, + "logits/rejected": -1.450701355934143, + "logps/chosen": -108.36103057861328, + "logps/rejected": -84.6402587890625, + "loss": 0.6828, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9804253578186035, + "rewards/margins": -0.4387390613555908, + "rewards/rejected": 3.4191644191741943, + "step": 10731 + }, + { + "epoch": 1.74, + "learning_rate": 4.3035035820585935e-07, + "logits/chosen": -0.8161901831626892, + "logits/rejected": -0.8006702065467834, + "logps/chosen": -40.375526428222656, + "logps/rejected": -75.5294189453125, + "loss": 0.3812, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7608788013458252, + "rewards/margins": 0.038210272789001465, + "rewards/rejected": 1.7226685285568237, + "step": 10732 + }, + { + "epoch": 1.74, + "learning_rate": 4.298170971814547e-07, + "logits/chosen": -1.409779667854309, + "logits/rejected": -1.3778990507125854, + "logps/chosen": -38.44999313354492, + "logps/rejected": -39.944374084472656, + "loss": 0.3015, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.679917573928833, + "rewards/margins": 0.3020205497741699, + "rewards/rejected": 2.377897024154663, + "step": 10733 + }, + { + "epoch": 1.74, + "learning_rate": 4.2928415191344664e-07, + "logits/chosen": -1.3043121099472046, + "logits/rejected": -1.3066437244415283, + "logps/chosen": -69.47010803222656, + "logps/rejected": -107.45732116699219, + "loss": 0.402, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.331231594085693, + "rewards/margins": -0.20013904571533203, + "rewards/rejected": 6.531370639801025, + "step": 10734 + }, + { + "epoch": 1.74, + "learning_rate": 4.2875152243865347e-07, + "logits/chosen": -1.2836734056472778, + "logits/rejected": -1.2620803117752075, + "logps/chosen": -35.798221588134766, + "logps/rejected": -12.877374649047852, + "loss": 0.7833, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.454480767250061, + "rewards/margins": 0.4556400179862976, + "rewards/rejected": 0.9988407492637634, + "step": 10735 + }, + { + "epoch": 1.74, + "learning_rate": 4.28219208793878e-07, + "logits/chosen": -1.5216165781021118, + "logits/rejected": -1.4798736572265625, + "logps/chosen": -121.43767547607422, + "logps/rejected": -84.45249938964844, + "loss": 0.1494, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.726180553436279, + "rewards/margins": 1.2150568962097168, + "rewards/rejected": 4.5111236572265625, + "step": 10736 + }, + { + "epoch": 1.74, + "learning_rate": 4.276872110158958e-07, + "logits/chosen": -0.9774675965309143, + "logits/rejected": -0.9498783946037292, + "logps/chosen": -43.81619644165039, + "logps/rejected": -84.88850402832031, + "loss": 0.1263, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4668545722961426, + "rewards/margins": 1.8949825763702393, + "rewards/rejected": 0.5718719363212585, + "step": 10737 + }, + { + "epoch": 1.74, + "learning_rate": 4.271555291414636e-07, + "logits/chosen": -1.6006314754486084, + "logits/rejected": -1.4068005084991455, + "logps/chosen": -169.49725341796875, + "logps/rejected": -90.8387680053711, + "loss": 0.0925, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.326266765594482, + "rewards/margins": 1.5946006774902344, + "rewards/rejected": 4.731666088104248, + "step": 10738 + }, + { + "epoch": 1.74, + "learning_rate": 4.2662416320731645e-07, + "logits/chosen": -1.7240358591079712, + "logits/rejected": -1.7011380195617676, + "logps/chosen": -33.58014678955078, + "logps/rejected": -15.513442039489746, + "loss": 0.1218, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.301008701324463, + "rewards/margins": 1.348425269126892, + "rewards/rejected": 0.9525834321975708, + "step": 10739 + }, + { + "epoch": 1.74, + "learning_rate": 4.2609311325016503e-07, + "logits/chosen": -1.0448708534240723, + "logits/rejected": -1.0770227909088135, + "logps/chosen": -66.84504699707031, + "logps/rejected": -97.13957214355469, + "loss": 0.0584, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.964885711669922, + "rewards/margins": 2.732199192047119, + "rewards/rejected": 1.2326866388320923, + "step": 10740 + }, + { + "epoch": 1.74, + "learning_rate": 4.2556237930670153e-07, + "logits/chosen": -1.4879547357559204, + "logits/rejected": -1.4453843832015991, + "logps/chosen": -186.78323364257812, + "logps/rejected": -24.863636016845703, + "loss": 0.0697, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.560736656188965, + "rewards/margins": 7.86461067199707, + "rewards/rejected": 0.6961258053779602, + "step": 10741 + }, + { + "epoch": 1.74, + "learning_rate": 4.250319614135934e-07, + "logits/chosen": -1.6409649848937988, + "logits/rejected": -1.7070140838623047, + "logps/chosen": -181.40158081054688, + "logps/rejected": -78.67190551757812, + "loss": 0.1603, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.348483562469482, + "rewards/margins": 3.0036609172821045, + "rewards/rejected": 3.344822645187378, + "step": 10742 + }, + { + "epoch": 1.74, + "learning_rate": 4.2450185960748847e-07, + "logits/chosen": -0.7641183137893677, + "logits/rejected": -0.789301872253418, + "logps/chosen": -6.931506156921387, + "logps/rejected": -47.26203155517578, + "loss": 0.4566, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.28575000166893005, + "rewards/margins": -0.38604411482810974, + "rewards/rejected": 0.6717941164970398, + "step": 10743 + }, + { + "epoch": 1.74, + "learning_rate": 4.239720739250108e-07, + "logits/chosen": -1.0499159097671509, + "logits/rejected": -1.0322691202163696, + "logps/chosen": -46.33771514892578, + "logps/rejected": -15.60981273651123, + "loss": 0.4483, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6498172879219055, + "rewards/margins": -0.025337696075439453, + "rewards/rejected": 0.675154983997345, + "step": 10744 + }, + { + "epoch": 1.74, + "learning_rate": 4.2344260440276455e-07, + "logits/chosen": -1.7809984683990479, + "logits/rejected": -1.5977116823196411, + "logps/chosen": -128.98480224609375, + "logps/rejected": -57.61004638671875, + "loss": 0.0591, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.857120037078857, + "rewards/margins": 2.212472915649414, + "rewards/rejected": 4.644647121429443, + "step": 10745 + }, + { + "epoch": 1.74, + "learning_rate": 4.229134510773297e-07, + "logits/chosen": -1.4824345111846924, + "logits/rejected": -1.374982476234436, + "logps/chosen": -173.26284790039062, + "logps/rejected": -22.57806968688965, + "loss": 0.114, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.922253608703613, + "rewards/margins": 6.2299113273620605, + "rewards/rejected": 0.6923424005508423, + "step": 10746 + }, + { + "epoch": 1.74, + "learning_rate": 4.223846139852678e-07, + "logits/chosen": -0.8071216940879822, + "logits/rejected": -0.8847324848175049, + "logps/chosen": -74.1966323852539, + "logps/rejected": -53.80131530761719, + "loss": 0.3667, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.449608564376831, + "rewards/margins": -0.058643341064453125, + "rewards/rejected": 2.508251905441284, + "step": 10747 + }, + { + "epoch": 1.74, + "learning_rate": 4.2185609316311383e-07, + "logits/chosen": -1.526497721672058, + "logits/rejected": -1.5302613973617554, + "logps/chosen": -113.83621978759766, + "logps/rejected": -57.02879333496094, + "loss": 0.2149, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.160354137420654, + "rewards/margins": 0.625704288482666, + "rewards/rejected": 5.534649848937988, + "step": 10748 + }, + { + "epoch": 1.74, + "learning_rate": 4.213278886473865e-07, + "logits/chosen": -1.1254218816757202, + "logits/rejected": -1.132000207901001, + "logps/chosen": -32.59956359863281, + "logps/rejected": -55.361873626708984, + "loss": 0.7799, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1046369075775146, + "rewards/margins": -0.3213026523590088, + "rewards/rejected": 2.4259395599365234, + "step": 10749 + }, + { + "epoch": 1.74, + "learning_rate": 4.2080000047457716e-07, + "logits/chosen": -1.2111462354660034, + "logits/rejected": -1.2200745344161987, + "logps/chosen": -50.07149887084961, + "logps/rejected": -72.54312133789062, + "loss": 1.5489, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6874606609344482, + "rewards/margins": -0.06959056854248047, + "rewards/rejected": 2.7570512294769287, + "step": 10750 + }, + { + "epoch": 1.75, + "learning_rate": 4.202724286811599e-07, + "logits/chosen": -1.4925774335861206, + "logits/rejected": -1.4207184314727783, + "logps/chosen": -149.67344665527344, + "logps/rejected": -122.59219360351562, + "loss": 1.5839, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.09567403793335, + "rewards/margins": 0.03971672058105469, + "rewards/rejected": 7.055957317352295, + "step": 10751 + }, + { + "epoch": 1.75, + "learning_rate": 4.197451733035829e-07, + "logits/chosen": -1.58106529712677, + "logits/rejected": -1.5753529071807861, + "logps/chosen": -53.918739318847656, + "logps/rejected": -51.718326568603516, + "loss": 0.8378, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.621106743812561, + "rewards/margins": -1.465816855430603, + "rewards/rejected": 3.086923599243164, + "step": 10752 + }, + { + "epoch": 1.75, + "learning_rate": 4.1921823437827694e-07, + "logits/chosen": -0.7979653477668762, + "logits/rejected": -0.6998756527900696, + "logps/chosen": -55.952842712402344, + "logps/rejected": -55.807823181152344, + "loss": 0.447, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8134591579437256, + "rewards/margins": 0.3707313537597656, + "rewards/rejected": 2.44272780418396, + "step": 10753 + }, + { + "epoch": 1.75, + "learning_rate": 4.1869161194164565e-07, + "logits/chosen": -1.0505399703979492, + "logits/rejected": -1.0332975387573242, + "logps/chosen": -50.83778381347656, + "logps/rejected": -98.44631958007812, + "loss": 0.2824, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4223495721817017, + "rewards/margins": 1.372381567955017, + "rewards/rejected": 0.04996795579791069, + "step": 10754 + }, + { + "epoch": 1.75, + "learning_rate": 4.1816530603007667e-07, + "logits/chosen": -1.2736635208129883, + "logits/rejected": -1.2802026271820068, + "logps/chosen": -81.21086120605469, + "logps/rejected": -134.39845275878906, + "loss": 1.6204, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8181328773498535, + "rewards/margins": -2.4661307334899902, + "rewards/rejected": 5.284263610839844, + "step": 10755 + }, + { + "epoch": 1.75, + "learning_rate": 4.176393166799303e-07, + "logits/chosen": -1.3111557960510254, + "logits/rejected": -1.1427098512649536, + "logps/chosen": -68.48149108886719, + "logps/rejected": -8.310029983520508, + "loss": 0.4249, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5280654430389404, + "rewards/margins": 1.8540520668029785, + "rewards/rejected": 0.6740134358406067, + "step": 10756 + }, + { + "epoch": 1.75, + "learning_rate": 4.171136439275497e-07, + "logits/chosen": -1.478165626525879, + "logits/rejected": -1.4472005367279053, + "logps/chosen": -59.17424774169922, + "logps/rejected": -67.93301391601562, + "loss": 1.5296, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2458138465881348, + "rewards/margins": -0.7894377708435059, + "rewards/rejected": 4.035251617431641, + "step": 10757 + }, + { + "epoch": 1.75, + "learning_rate": 4.1658828780925145e-07, + "logits/chosen": -1.4697902202606201, + "logits/rejected": -1.3352733850479126, + "logps/chosen": -76.84691619873047, + "logps/rejected": -33.05868148803711, + "loss": 0.1526, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2888023853302, + "rewards/margins": 1.0618629455566406, + "rewards/rejected": 1.2269394397735596, + "step": 10758 + }, + { + "epoch": 1.75, + "learning_rate": 4.1606324836133474e-07, + "logits/chosen": -1.2886296510696411, + "logits/rejected": -1.3454865217208862, + "logps/chosen": -44.12188720703125, + "logps/rejected": -80.95189666748047, + "loss": 0.3279, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.6414618492126465, + "rewards/margins": 2.1552932262420654, + "rewards/rejected": 2.486168622970581, + "step": 10759 + }, + { + "epoch": 1.75, + "learning_rate": 4.155385256200728e-07, + "logits/chosen": -1.152165174484253, + "logits/rejected": -1.0180881023406982, + "logps/chosen": -24.504714965820312, + "logps/rejected": -12.154609680175781, + "loss": 0.2278, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8600791692733765, + "rewards/margins": 0.874198853969574, + "rewards/rejected": 0.9858803153038025, + "step": 10760 + }, + { + "epoch": 1.75, + "learning_rate": 4.150141196217217e-07, + "logits/chosen": -1.3936389684677124, + "logits/rejected": -1.070648193359375, + "logps/chosen": -9.638535499572754, + "logps/rejected": -172.26400756835938, + "loss": 3.9637, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.058672141283750534, + "rewards/margins": -5.249003887176514, + "rewards/rejected": 5.190331935882568, + "step": 10761 + }, + { + "epoch": 1.75, + "learning_rate": 4.144900304025101e-07, + "logits/chosen": -1.1225190162658691, + "logits/rejected": -1.1225190162658691, + "logps/chosen": -38.158355712890625, + "logps/rejected": -38.158355712890625, + "loss": 0.3596, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.02877950668335, + "rewards/margins": 0.0, + "rewards/rejected": 4.02877950668335, + "step": 10762 + }, + { + "epoch": 1.75, + "learning_rate": 4.1396625799864975e-07, + "logits/chosen": -1.5072075128555298, + "logits/rejected": -1.4973835945129395, + "logps/chosen": -64.4732666015625, + "logps/rejected": -105.8427734375, + "loss": 2.4526, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.176751136779785, + "rewards/margins": -4.870645523071289, + "rewards/rejected": 9.047396659851074, + "step": 10763 + }, + { + "epoch": 1.75, + "learning_rate": 4.134428024463266e-07, + "logits/chosen": -1.3526636362075806, + "logits/rejected": -1.074668288230896, + "logps/chosen": -132.68563842773438, + "logps/rejected": -74.04488372802734, + "loss": 1.5168, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.762336730957031, + "rewards/margins": -0.4625053405761719, + "rewards/rejected": 5.224842071533203, + "step": 10764 + }, + { + "epoch": 1.75, + "learning_rate": 4.1291966378170846e-07, + "logits/chosen": -1.6785300970077515, + "logits/rejected": -1.5431338548660278, + "logps/chosen": -136.60659790039062, + "logps/rejected": -76.89242553710938, + "loss": 0.6078, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.4133195877075195, + "rewards/margins": 2.891927480697632, + "rewards/rejected": 1.5213921070098877, + "step": 10765 + }, + { + "epoch": 1.75, + "learning_rate": 4.1239684204093747e-07, + "logits/chosen": -1.1194509267807007, + "logits/rejected": -1.094414472579956, + "logps/chosen": -64.61695861816406, + "logps/rejected": -73.95234680175781, + "loss": 0.1828, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5020997524261475, + "rewards/margins": 0.900955319404602, + "rewards/rejected": 1.6011444330215454, + "step": 10766 + }, + { + "epoch": 1.75, + "learning_rate": 4.1187433726013695e-07, + "logits/chosen": -1.2228376865386963, + "logits/rejected": -1.1558321714401245, + "logps/chosen": -61.73262405395508, + "logps/rejected": -77.34625244140625, + "loss": 0.6503, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3368122577667236, + "rewards/margins": 0.9740269184112549, + "rewards/rejected": 2.3627853393554688, + "step": 10767 + }, + { + "epoch": 1.75, + "learning_rate": 4.1135214947540524e-07, + "logits/chosen": -1.2772047519683838, + "logits/rejected": -1.2502069473266602, + "logps/chosen": -81.93445587158203, + "logps/rejected": -118.09033966064453, + "loss": 0.5313, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3517234325408936, + "rewards/margins": -0.6264512538909912, + "rewards/rejected": 2.9781746864318848, + "step": 10768 + }, + { + "epoch": 1.75, + "learning_rate": 4.1083027872282354e-07, + "logits/chosen": -1.266424298286438, + "logits/rejected": -1.3179718255996704, + "logps/chosen": -59.12093734741211, + "logps/rejected": -75.18521118164062, + "loss": 0.4725, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4937808513641357, + "rewards/margins": 0.16387653350830078, + "rewards/rejected": 3.329904317855835, + "step": 10769 + }, + { + "epoch": 1.75, + "learning_rate": 4.103087250384452e-07, + "logits/chosen": -1.3656119108200073, + "logits/rejected": -1.4118716716766357, + "logps/chosen": -178.95423889160156, + "logps/rejected": -99.10676574707031, + "loss": 2.0738, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3748185634613037, + "rewards/margins": -2.0170562267303467, + "rewards/rejected": 5.39187479019165, + "step": 10770 + }, + { + "epoch": 1.75, + "learning_rate": 4.097874884583064e-07, + "logits/chosen": -1.350282073020935, + "logits/rejected": -1.283797025680542, + "logps/chosen": -113.48799133300781, + "logps/rejected": -74.90743255615234, + "loss": 1.2818, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.627681016921997, + "rewards/margins": 1.6680198907852173, + "rewards/rejected": 0.9596611261367798, + "step": 10771 + }, + { + "epoch": 1.75, + "learning_rate": 4.092665690184189e-07, + "logits/chosen": -1.33356773853302, + "logits/rejected": -1.3014870882034302, + "logps/chosen": -78.75520324707031, + "logps/rejected": -58.56231689453125, + "loss": 0.6049, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.778059482574463, + "rewards/margins": -0.12343597412109375, + "rewards/rejected": 2.9014954566955566, + "step": 10772 + }, + { + "epoch": 1.75, + "learning_rate": 4.087459667547733e-07, + "logits/chosen": -1.2106211185455322, + "logits/rejected": -1.202157974243164, + "logps/chosen": -63.179283142089844, + "logps/rejected": -53.170650482177734, + "loss": 0.2756, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.243725776672363, + "rewards/margins": 2.550118923187256, + "rewards/rejected": 1.693606972694397, + "step": 10773 + }, + { + "epoch": 1.75, + "learning_rate": 4.082256817033392e-07, + "logits/chosen": -1.2855110168457031, + "logits/rejected": -1.2070472240447998, + "logps/chosen": -55.480411529541016, + "logps/rejected": -22.45987892150879, + "loss": 0.4388, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3650128841400146, + "rewards/margins": 0.622477650642395, + "rewards/rejected": 1.7425352334976196, + "step": 10774 + }, + { + "epoch": 1.75, + "learning_rate": 4.077057139000629e-07, + "logits/chosen": -1.2308658361434937, + "logits/rejected": -1.1741840839385986, + "logps/chosen": -77.2306900024414, + "logps/rejected": -24.514169692993164, + "loss": 0.525, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0928025245666504, + "rewards/margins": -0.5637199878692627, + "rewards/rejected": 2.656522512435913, + "step": 10775 + }, + { + "epoch": 1.75, + "learning_rate": 4.0718606338086884e-07, + "logits/chosen": -1.5848796367645264, + "logits/rejected": -1.6081979274749756, + "logps/chosen": -168.72117614746094, + "logps/rejected": -119.18632507324219, + "loss": 0.1678, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.278373718261719, + "rewards/margins": 1.000727653503418, + "rewards/rejected": 7.277646064758301, + "step": 10776 + }, + { + "epoch": 1.75, + "learning_rate": 4.066667301816596e-07, + "logits/chosen": -1.3443522453308105, + "logits/rejected": -1.150996208190918, + "logps/chosen": -239.53445434570312, + "logps/rejected": -111.74089050292969, + "loss": 0.1551, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.605987548828125, + "rewards/margins": 1.1338224411010742, + "rewards/rejected": 5.472165107727051, + "step": 10777 + }, + { + "epoch": 1.75, + "learning_rate": 4.06147714338318e-07, + "logits/chosen": -1.3258569240570068, + "logits/rejected": -1.1586353778839111, + "logps/chosen": -50.613895416259766, + "logps/rejected": -23.460418701171875, + "loss": 0.1462, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5032627582550049, + "rewards/margins": 1.6227689981460571, + "rewards/rejected": -0.11950626224279404, + "step": 10778 + }, + { + "epoch": 1.75, + "learning_rate": 4.0562901588670044e-07, + "logits/chosen": -1.5750832557678223, + "logits/rejected": -1.4931434392929077, + "logps/chosen": -157.72076416015625, + "logps/rejected": -62.26219940185547, + "loss": 0.2385, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.055037021636963, + "rewards/margins": 0.8885002136230469, + "rewards/rejected": 2.166536808013916, + "step": 10779 + }, + { + "epoch": 1.75, + "learning_rate": 4.0511063486264655e-07, + "logits/chosen": -1.3687944412231445, + "logits/rejected": -1.3719745874404907, + "logps/chosen": -24.408687591552734, + "logps/rejected": -62.025901794433594, + "loss": 0.7147, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6274276971817017, + "rewards/margins": -1.0850058794021606, + "rewards/rejected": 2.7124335765838623, + "step": 10780 + }, + { + "epoch": 1.75, + "learning_rate": 4.0459257130196995e-07, + "logits/chosen": -1.4579319953918457, + "logits/rejected": -1.4126635789871216, + "logps/chosen": -36.83872604370117, + "logps/rejected": -148.18280029296875, + "loss": 0.7101, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.382056951522827, + "rewards/margins": 0.7446668148040771, + "rewards/rejected": 1.63739013671875, + "step": 10781 + }, + { + "epoch": 1.75, + "learning_rate": 4.0407482524046527e-07, + "logits/chosen": -1.203100323677063, + "logits/rejected": -1.204125165939331, + "logps/chosen": -3.643500328063965, + "logps/rejected": -41.57622528076172, + "loss": 0.4949, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5559412240982056, + "rewards/margins": 0.14064085483551025, + "rewards/rejected": 0.4153003692626953, + "step": 10782 + }, + { + "epoch": 1.75, + "learning_rate": 4.0355739671390226e-07, + "logits/chosen": -1.4362480640411377, + "logits/rejected": -1.4779027700424194, + "logps/chosen": -78.57518005371094, + "logps/rejected": -62.83503723144531, + "loss": 1.7125, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7639449834823608, + "rewards/margins": -2.3374648094177246, + "rewards/rejected": 4.101409912109375, + "step": 10783 + }, + { + "epoch": 1.75, + "learning_rate": 4.0304028575803177e-07, + "logits/chosen": -1.1227667331695557, + "logits/rejected": -1.1809996366500854, + "logps/chosen": -67.4034423828125, + "logps/rejected": -84.85066223144531, + "loss": 2.0006, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.054114580154419, + "rewards/margins": -1.458467960357666, + "rewards/rejected": 3.512582540512085, + "step": 10784 + }, + { + "epoch": 1.75, + "learning_rate": 4.0252349240858026e-07, + "logits/chosen": -1.1866525411605835, + "logits/rejected": -1.2413092851638794, + "logps/chosen": -39.627010345458984, + "logps/rejected": -51.30109405517578, + "loss": 0.6721, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7269229888916016, + "rewards/margins": -0.6521632671356201, + "rewards/rejected": 3.3790862560272217, + "step": 10785 + }, + { + "epoch": 1.75, + "learning_rate": 4.020070167012541e-07, + "logits/chosen": -1.635316252708435, + "logits/rejected": -1.5785337686538696, + "logps/chosen": -171.16799926757812, + "logps/rejected": -26.971473693847656, + "loss": 0.033, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.0924530029296875, + "rewards/margins": 6.2683868408203125, + "rewards/rejected": -0.17593364417552948, + "step": 10786 + }, + { + "epoch": 1.75, + "learning_rate": 4.014908586717359e-07, + "logits/chosen": -1.2067338228225708, + "logits/rejected": -1.2067338228225708, + "logps/chosen": -17.163238525390625, + "logps/rejected": -17.163238525390625, + "loss": 0.8067, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.663639783859253, + "rewards/margins": 0.0, + "rewards/rejected": 2.663639783859253, + "step": 10787 + }, + { + "epoch": 1.75, + "learning_rate": 4.0097501835568874e-07, + "logits/chosen": -1.2653553485870361, + "logits/rejected": -1.2316138744354248, + "logps/chosen": -45.53786849975586, + "logps/rejected": -33.26405334472656, + "loss": 0.2319, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3663876056671143, + "rewards/margins": 0.5488636493682861, + "rewards/rejected": 1.8175239562988281, + "step": 10788 + }, + { + "epoch": 1.75, + "learning_rate": 4.004594957887503e-07, + "logits/chosen": -1.097114086151123, + "logits/rejected": -1.1238938570022583, + "logps/chosen": -25.730031967163086, + "logps/rejected": -89.45243835449219, + "loss": 1.708, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1998804807662964, + "rewards/margins": 0.29292017221450806, + "rewards/rejected": 0.9069603085517883, + "step": 10789 + }, + { + "epoch": 1.75, + "learning_rate": 3.9994429100654044e-07, + "logits/chosen": -0.7363104820251465, + "logits/rejected": -0.636794924736023, + "logps/chosen": -40.7049560546875, + "logps/rejected": -0.9258345365524292, + "loss": 0.5471, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2444190979003906, + "rewards/margins": 0.7697098255157471, + "rewards/rejected": 0.47470924258232117, + "step": 10790 + }, + { + "epoch": 1.75, + "learning_rate": 3.9942940404465333e-07, + "logits/chosen": -1.1857552528381348, + "logits/rejected": -1.2776391506195068, + "logps/chosen": -71.139404296875, + "logps/rejected": -127.32374572753906, + "loss": 1.4102, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.342181444168091, + "rewards/margins": -2.5402605533599854, + "rewards/rejected": 5.882441997528076, + "step": 10791 + }, + { + "epoch": 1.75, + "learning_rate": 3.98914834938664e-07, + "logits/chosen": -1.2690932750701904, + "logits/rejected": -1.1821329593658447, + "logps/chosen": -35.838157653808594, + "logps/rejected": -68.05934143066406, + "loss": 1.6914, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.074526309967041, + "rewards/margins": -0.010303497314453125, + "rewards/rejected": 3.084829807281494, + "step": 10792 + }, + { + "epoch": 1.75, + "learning_rate": 3.984005837241228e-07, + "logits/chosen": -1.5218111276626587, + "logits/rejected": -1.4372339248657227, + "logps/chosen": -95.49807739257812, + "logps/rejected": -66.24058532714844, + "loss": 0.7099, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4543198347091675, + "rewards/margins": -0.06595838069915771, + "rewards/rejected": 1.5202782154083252, + "step": 10793 + }, + { + "epoch": 1.75, + "learning_rate": 3.9788665043656083e-07, + "logits/chosen": -1.0986404418945312, + "logits/rejected": -1.1337732076644897, + "logps/chosen": -35.62275695800781, + "logps/rejected": -72.70068359375, + "loss": 0.5346, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6363651752471924, + "rewards/margins": -0.014677047729492188, + "rewards/rejected": 1.6510422229766846, + "step": 10794 + }, + { + "epoch": 1.75, + "learning_rate": 3.973730351114863e-07, + "logits/chosen": -1.2839815616607666, + "logits/rejected": -1.2015550136566162, + "logps/chosen": -33.40501403808594, + "logps/rejected": -14.720464706420898, + "loss": 0.6373, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7729988098144531, + "rewards/margins": -0.1264505386352539, + "rewards/rejected": 0.899449348449707, + "step": 10795 + }, + { + "epoch": 1.75, + "learning_rate": 3.9685973778438413e-07, + "logits/chosen": -1.0780115127563477, + "logits/rejected": -1.0575305223464966, + "logps/chosen": -74.00831604003906, + "logps/rejected": -57.90936279296875, + "loss": 0.4411, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3405158519744873, + "rewards/margins": -0.28302550315856934, + "rewards/rejected": 3.6235413551330566, + "step": 10796 + }, + { + "epoch": 1.75, + "learning_rate": 3.9634675849071934e-07, + "logits/chosen": -1.380739450454712, + "logits/rejected": -1.3064528703689575, + "logps/chosen": -75.8145751953125, + "logps/rejected": -49.49275207519531, + "loss": 0.9286, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0282211303710938, + "rewards/margins": -0.5924363136291504, + "rewards/rejected": 2.620657444000244, + "step": 10797 + }, + { + "epoch": 1.75, + "learning_rate": 3.9583409726593246e-07, + "logits/chosen": -1.163496494293213, + "logits/rejected": -1.3450281620025635, + "logps/chosen": -74.34008026123047, + "logps/rejected": -184.43145751953125, + "loss": 1.7227, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.9518487453460693, + "rewards/margins": -3.4110357761383057, + "rewards/rejected": 7.362884521484375, + "step": 10798 + }, + { + "epoch": 1.75, + "learning_rate": 3.9532175414544517e-07, + "logits/chosen": -1.5190857648849487, + "logits/rejected": -1.5731531381607056, + "logps/chosen": -163.2686004638672, + "logps/rejected": -59.28015899658203, + "loss": 0.0167, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.821742534637451, + "rewards/margins": 3.3905489444732666, + "rewards/rejected": 1.4311935901641846, + "step": 10799 + }, + { + "epoch": 1.75, + "learning_rate": 3.948097291646541e-07, + "logits/chosen": -1.3655418157577515, + "logits/rejected": -1.333596110343933, + "logps/chosen": -111.09820556640625, + "logps/rejected": -81.32218170166016, + "loss": 0.3182, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.284732341766357, + "rewards/margins": 2.060967445373535, + "rewards/rejected": 4.223764896392822, + "step": 10800 + }, + { + "epoch": 1.75, + "learning_rate": 3.942980223589371e-07, + "logits/chosen": -1.3682228326797485, + "logits/rejected": -1.3648653030395508, + "logps/chosen": -98.84235382080078, + "logps/rejected": -56.582763671875, + "loss": 0.3662, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.047994375228882, + "rewards/margins": 0.5433725118637085, + "rewards/rejected": 1.5046218633651733, + "step": 10801 + }, + { + "epoch": 1.75, + "learning_rate": 3.937866337636459e-07, + "logits/chosen": -1.0557804107666016, + "logits/rejected": -1.1039189100265503, + "logps/chosen": -49.287864685058594, + "logps/rejected": -43.01167297363281, + "loss": 2.4183, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7210686206817627, + "rewards/margins": 0.6342003345489502, + "rewards/rejected": 2.0868682861328125, + "step": 10802 + }, + { + "epoch": 1.75, + "learning_rate": 3.93275563414115e-07, + "logits/chosen": -1.504698634147644, + "logits/rejected": -1.4103056192398071, + "logps/chosen": -66.97296905517578, + "logps/rejected": -20.90550994873047, + "loss": 0.1015, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.704319715499878, + "rewards/margins": 1.6960856914520264, + "rewards/rejected": 1.0082340240478516, + "step": 10803 + }, + { + "epoch": 1.75, + "learning_rate": 3.9276481134565224e-07, + "logits/chosen": -1.1070942878723145, + "logits/rejected": -1.1690025329589844, + "logps/chosen": -89.19502258300781, + "logps/rejected": -43.511993408203125, + "loss": 0.4, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7928237915039062, + "rewards/margins": -0.03940272331237793, + "rewards/rejected": 2.832226514816284, + "step": 10804 + }, + { + "epoch": 1.75, + "learning_rate": 3.9225437759354836e-07, + "logits/chosen": -1.5019201040267944, + "logits/rejected": -1.4928616285324097, + "logps/chosen": -144.5287322998047, + "logps/rejected": -108.55404663085938, + "loss": 0.1547, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.0024871826171875, + "rewards/margins": 1.3055343627929688, + "rewards/rejected": 5.696952819824219, + "step": 10805 + }, + { + "epoch": 1.75, + "learning_rate": 3.917442621930673e-07, + "logits/chosen": -1.1598882675170898, + "logits/rejected": -1.1598882675170898, + "logps/chosen": -66.56450653076172, + "logps/rejected": -66.56450653076172, + "loss": 0.874, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.7369918823242188, + "rewards/margins": 0.0, + "rewards/rejected": 3.7369918823242188, + "step": 10806 + }, + { + "epoch": 1.75, + "learning_rate": 3.912344651794542e-07, + "logits/chosen": -1.322554349899292, + "logits/rejected": -1.2788759469985962, + "logps/chosen": -144.18771362304688, + "logps/rejected": -72.4427490234375, + "loss": 0.3347, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7723464965820312, + "rewards/margins": 1.139715552330017, + "rewards/rejected": 1.6326309442520142, + "step": 10807 + }, + { + "epoch": 1.75, + "learning_rate": 3.9072498658793036e-07, + "logits/chosen": -1.2475392818450928, + "logits/rejected": -1.187997579574585, + "logps/chosen": -114.06401824951172, + "logps/rejected": -90.072021484375, + "loss": 0.7828, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.959747314453125, + "rewards/margins": -1.060272216796875, + "rewards/rejected": 2.02001953125, + "step": 10808 + }, + { + "epoch": 1.75, + "learning_rate": 3.902158264536976e-07, + "logits/chosen": -1.3363250494003296, + "logits/rejected": -1.467714786529541, + "logps/chosen": -31.33237075805664, + "logps/rejected": -119.91567993164062, + "loss": 3.5144, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.360365629196167, + "rewards/margins": -5.076172828674316, + "rewards/rejected": 7.4365386962890625, + "step": 10809 + }, + { + "epoch": 1.75, + "learning_rate": 3.8970698481193225e-07, + "logits/chosen": -0.9713796973228455, + "logits/rejected": -1.0484669208526611, + "logps/chosen": -73.549560546875, + "logps/rejected": -109.93821716308594, + "loss": 0.839, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2009963989257812, + "rewards/margins": -1.4355697631835938, + "rewards/rejected": 3.636566162109375, + "step": 10810 + }, + { + "epoch": 1.75, + "learning_rate": 3.891984616977923e-07, + "logits/chosen": -1.4427400827407837, + "logits/rejected": -1.259569764137268, + "logps/chosen": -116.66134643554688, + "logps/rejected": -70.69206237792969, + "loss": 0.0527, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.429483413696289, + "rewards/margins": 5.484358787536621, + "rewards/rejected": 2.945124864578247, + "step": 10811 + }, + { + "epoch": 1.75, + "learning_rate": 3.8869025714640965e-07, + "logits/chosen": -1.4577735662460327, + "logits/rejected": -1.2826344966888428, + "logps/chosen": -114.46499633789062, + "logps/rejected": -42.786231994628906, + "loss": 0.1383, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.914105415344238, + "rewards/margins": 4.483615875244141, + "rewards/rejected": 1.4304897785186768, + "step": 10812 + }, + { + "epoch": 1.76, + "learning_rate": 3.8818237119289836e-07, + "logits/chosen": -1.1457165479660034, + "logits/rejected": -1.0768967866897583, + "logps/chosen": -40.53587341308594, + "logps/rejected": -27.98797607421875, + "loss": 0.4522, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.329977035522461, + "rewards/margins": -0.308047890663147, + "rewards/rejected": 1.638024926185608, + "step": 10813 + }, + { + "epoch": 1.76, + "learning_rate": 3.8767480387234714e-07, + "logits/chosen": -1.2925693988800049, + "logits/rejected": -1.305151104927063, + "logps/chosen": -110.359619140625, + "logps/rejected": -144.83856201171875, + "loss": 0.773, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.782958984375, + "rewards/margins": -1.3013839721679688, + "rewards/rejected": 9.084342956542969, + "step": 10814 + }, + { + "epoch": 1.76, + "learning_rate": 3.871675552198256e-07, + "logits/chosen": -1.2233856916427612, + "logits/rejected": -1.056187391281128, + "logps/chosen": -66.33702087402344, + "logps/rejected": -45.24267578125, + "loss": 0.2089, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.273846387863159, + "rewards/margins": 1.008856177330017, + "rewards/rejected": 1.264990210533142, + "step": 10815 + }, + { + "epoch": 1.76, + "learning_rate": 3.8666062527037804e-07, + "logits/chosen": -1.328133225440979, + "logits/rejected": -1.2785223722457886, + "logps/chosen": -106.31129455566406, + "logps/rejected": -45.636619567871094, + "loss": 0.3433, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6158173084259033, + "rewards/margins": 0.8265464305877686, + "rewards/rejected": 2.7892708778381348, + "step": 10816 + }, + { + "epoch": 1.76, + "learning_rate": 3.861540140590303e-07, + "logits/chosen": -1.0283787250518799, + "logits/rejected": -0.8367457389831543, + "logps/chosen": -87.241943359375, + "logps/rejected": -59.353736877441406, + "loss": 0.0553, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.660083770751953, + "rewards/margins": 3.3070883750915527, + "rewards/rejected": 0.3529953062534332, + "step": 10817 + }, + { + "epoch": 1.76, + "learning_rate": 3.8564772162078325e-07, + "logits/chosen": -1.2090908288955688, + "logits/rejected": -1.0686205625534058, + "logps/chosen": -58.1544189453125, + "logps/rejected": -32.923789978027344, + "loss": 0.234, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0877243280410767, + "rewards/margins": 1.7744297981262207, + "rewards/rejected": -0.6867054104804993, + "step": 10818 + }, + { + "epoch": 1.76, + "learning_rate": 3.851417479906172e-07, + "logits/chosen": -1.2210005521774292, + "logits/rejected": -1.1326062679290771, + "logps/chosen": -112.55384826660156, + "logps/rejected": -68.29182434082031, + "loss": 1.3296, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.795983910560608, + "rewards/margins": 1.111036777496338, + "rewards/rejected": 0.6849471926689148, + "step": 10819 + }, + { + "epoch": 1.76, + "learning_rate": 3.846360932034898e-07, + "logits/chosen": -1.3338148593902588, + "logits/rejected": -1.2947062253952026, + "logps/chosen": -52.11402130126953, + "logps/rejected": -44.566619873046875, + "loss": 0.7505, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8043686151504517, + "rewards/margins": -0.9509469270706177, + "rewards/rejected": 2.7553155422210693, + "step": 10820 + }, + { + "epoch": 1.76, + "learning_rate": 3.841307572943381e-07, + "logits/chosen": -1.5275856256484985, + "logits/rejected": -1.4829347133636475, + "logps/chosen": -72.56381225585938, + "logps/rejected": -68.50425720214844, + "loss": 0.1934, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.81524658203125, + "rewards/margins": 0.8418121337890625, + "rewards/rejected": 3.9734344482421875, + "step": 10821 + }, + { + "epoch": 1.76, + "learning_rate": 3.8362574029807477e-07, + "logits/chosen": -1.1515353918075562, + "logits/rejected": -1.1360111236572266, + "logps/chosen": -89.36820220947266, + "logps/rejected": -74.32099914550781, + "loss": 0.2204, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.958838701248169, + "rewards/margins": 1.0842399597167969, + "rewards/rejected": 1.874598741531372, + "step": 10822 + }, + { + "epoch": 1.76, + "learning_rate": 3.831210422495929e-07, + "logits/chosen": -1.0295854806900024, + "logits/rejected": -1.0993990898132324, + "logps/chosen": -44.16954040527344, + "logps/rejected": -53.96502685546875, + "loss": 0.9286, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.019461154937744, + "rewards/margins": -0.984548807144165, + "rewards/rejected": 3.004009962081909, + "step": 10823 + }, + { + "epoch": 1.76, + "learning_rate": 3.8261666318376077e-07, + "logits/chosen": -1.3555421829223633, + "logits/rejected": -1.310581922531128, + "logps/chosen": -188.663330078125, + "logps/rejected": -67.90853881835938, + "loss": 0.2497, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.087497234344482, + "rewards/margins": 3.6261186599731445, + "rewards/rejected": 2.461378574371338, + "step": 10824 + }, + { + "epoch": 1.76, + "learning_rate": 3.8211260313542775e-07, + "logits/chosen": -1.0210984945297241, + "logits/rejected": -1.0210984945297241, + "logps/chosen": -48.30126190185547, + "logps/rejected": -48.30126190185547, + "loss": 0.4557, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2037228345870972, + "rewards/margins": 0.0, + "rewards/rejected": 1.2037228345870972, + "step": 10825 + }, + { + "epoch": 1.76, + "learning_rate": 3.816088621394187e-07, + "logits/chosen": -1.150514841079712, + "logits/rejected": -1.1383730173110962, + "logps/chosen": -186.38851928710938, + "logps/rejected": -84.91519927978516, + "loss": 0.0219, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.019717693328857, + "rewards/margins": 3.5689432621002197, + "rewards/rejected": 2.4507744312286377, + "step": 10826 + }, + { + "epoch": 1.76, + "learning_rate": 3.811054402305381e-07, + "logits/chosen": -1.5985568761825562, + "logits/rejected": -1.623549222946167, + "logps/chosen": -87.64228820800781, + "logps/rejected": -80.52323913574219, + "loss": 1.2406, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.66140604019165, + "rewards/margins": -1.9898896217346191, + "rewards/rejected": 7.6512956619262695, + "step": 10827 + }, + { + "epoch": 1.76, + "learning_rate": 3.8060233744356634e-07, + "logits/chosen": -1.6231318712234497, + "logits/rejected": -1.5267016887664795, + "logps/chosen": -39.21961975097656, + "logps/rejected": -9.712875366210938, + "loss": 0.3994, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.641277313232422, + "rewards/margins": 2.215207815170288, + "rewards/rejected": 0.4260694682598114, + "step": 10828 + }, + { + "epoch": 1.76, + "learning_rate": 3.80099553813264e-07, + "logits/chosen": -0.9738601446151733, + "logits/rejected": -1.0373488664627075, + "logps/chosen": -50.44730758666992, + "logps/rejected": -45.200889587402344, + "loss": 0.2996, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0685641765594482, + "rewards/margins": 0.3951137065887451, + "rewards/rejected": 1.6734504699707031, + "step": 10829 + }, + { + "epoch": 1.76, + "learning_rate": 3.7959708937436946e-07, + "logits/chosen": -1.056604266166687, + "logits/rejected": -1.0455118417739868, + "logps/chosen": -15.941861152648926, + "logps/rejected": -5.819546222686768, + "loss": 0.4374, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5205731391906738, + "rewards/margins": 0.21696171164512634, + "rewards/rejected": 0.3036114275455475, + "step": 10830 + }, + { + "epoch": 1.76, + "learning_rate": 3.7909494416159655e-07, + "logits/chosen": -1.348964810371399, + "logits/rejected": -1.348964810371399, + "logps/chosen": -60.42593002319336, + "logps/rejected": -60.42593002319336, + "loss": 0.4599, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.443201065063477, + "rewards/margins": 0.0, + "rewards/rejected": 4.443201065063477, + "step": 10831 + }, + { + "epoch": 1.76, + "learning_rate": 3.7859311820964027e-07, + "logits/chosen": -1.049154281616211, + "logits/rejected": -1.068394422531128, + "logps/chosen": -39.59918212890625, + "logps/rejected": -57.48609161376953, + "loss": 0.9206, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6603341102600098, + "rewards/margins": -1.028010368347168, + "rewards/rejected": 4.688344478607178, + "step": 10832 + }, + { + "epoch": 1.76, + "learning_rate": 3.7809161155317073e-07, + "logits/chosen": -1.202656865119934, + "logits/rejected": -1.2469878196716309, + "logps/chosen": -44.03670883178711, + "logps/rejected": -88.77765655517578, + "loss": 0.5621, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0651336908340454, + "rewards/margins": 0.11279678344726562, + "rewards/rejected": 0.9523369073867798, + "step": 10833 + }, + { + "epoch": 1.76, + "learning_rate": 3.775904242268391e-07, + "logits/chosen": -1.1002657413482666, + "logits/rejected": -1.1354495286941528, + "logps/chosen": -22.506380081176758, + "logps/rejected": -46.86469268798828, + "loss": 0.4271, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6772162914276123, + "rewards/margins": 0.08024287223815918, + "rewards/rejected": 2.596973419189453, + "step": 10834 + }, + { + "epoch": 1.76, + "learning_rate": 3.770895562652699e-07, + "logits/chosen": -0.9913496375083923, + "logits/rejected": -1.0469027757644653, + "logps/chosen": -61.4034423828125, + "logps/rejected": -76.9000244140625, + "loss": 0.8876, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.927725315093994, + "rewards/margins": -0.8483932018280029, + "rewards/rejected": 3.776118516921997, + "step": 10835 + }, + { + "epoch": 1.76, + "learning_rate": 3.765890077030715e-07, + "logits/chosen": -1.2512874603271484, + "logits/rejected": -1.2567073106765747, + "logps/chosen": -25.43344497680664, + "logps/rejected": -41.02660369873047, + "loss": 0.326, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2792224884033203, + "rewards/margins": 0.16281390190124512, + "rewards/rejected": 2.116408586502075, + "step": 10836 + }, + { + "epoch": 1.76, + "learning_rate": 3.760887785748241e-07, + "logits/chosen": -1.4900524616241455, + "logits/rejected": -1.4238649606704712, + "logps/chosen": -88.36272430419922, + "logps/rejected": -83.824462890625, + "loss": 0.4142, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3933968544006348, + "rewards/margins": -0.0714104175567627, + "rewards/rejected": 2.4648072719573975, + "step": 10837 + }, + { + "epoch": 1.76, + "learning_rate": 3.7558886891509163e-07, + "logits/chosen": -1.2044529914855957, + "logits/rejected": -1.2386107444763184, + "logps/chosen": -93.30531311035156, + "logps/rejected": -68.52703094482422, + "loss": 0.7025, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9209916591644287, + "rewards/margins": -1.0809624195098877, + "rewards/rejected": 4.001954078674316, + "step": 10838 + }, + { + "epoch": 1.76, + "learning_rate": 3.750892787584104e-07, + "logits/chosen": -1.261978030204773, + "logits/rejected": -1.2296946048736572, + "logps/chosen": -135.20291137695312, + "logps/rejected": -96.29728698730469, + "loss": 1.1935, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.930462837219238, + "rewards/margins": -2.1755919456481934, + "rewards/rejected": 7.106054782867432, + "step": 10839 + }, + { + "epoch": 1.76, + "learning_rate": 3.745900081393e-07, + "logits/chosen": -0.839954137802124, + "logits/rejected": -0.8423547744750977, + "logps/chosen": -1.960262417793274, + "logps/rejected": -11.092827796936035, + "loss": 0.752, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3193584382534027, + "rewards/margins": -0.32008180022239685, + "rewards/rejected": 0.6394402384757996, + "step": 10840 + }, + { + "epoch": 1.76, + "learning_rate": 3.740910570922529e-07, + "logits/chosen": -1.3077720403671265, + "logits/rejected": -1.2609343528747559, + "logps/chosen": -84.30940246582031, + "logps/rejected": -52.95946502685547, + "loss": 0.4621, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7159096002578735, + "rewards/margins": 0.11071622371673584, + "rewards/rejected": 1.6051933765411377, + "step": 10841 + }, + { + "epoch": 1.76, + "learning_rate": 3.7359242565174427e-07, + "logits/chosen": -1.4067963361740112, + "logits/rejected": -1.2547134160995483, + "logps/chosen": -88.80110931396484, + "logps/rejected": -52.80884552001953, + "loss": 0.0539, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.148238658905029, + "rewards/margins": 3.787997007369995, + "rewards/rejected": 2.360241651535034, + "step": 10842 + }, + { + "epoch": 1.76, + "learning_rate": 3.7309411385222204e-07, + "logits/chosen": -1.5162456035614014, + "logits/rejected": -1.5953330993652344, + "logps/chosen": -42.57054138183594, + "logps/rejected": -106.44866943359375, + "loss": 2.1744, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.961008548736572, + "rewards/margins": -4.1682610511779785, + "rewards/rejected": 9.12926959991455, + "step": 10843 + }, + { + "epoch": 1.76, + "learning_rate": 3.7259612172811767e-07, + "logits/chosen": -1.2061210870742798, + "logits/rejected": -1.1957253217697144, + "logps/chosen": -77.64398193359375, + "logps/rejected": -71.35333251953125, + "loss": 0.3867, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.620534658432007, + "rewards/margins": -0.11333537101745605, + "rewards/rejected": 2.733870029449463, + "step": 10844 + }, + { + "epoch": 1.76, + "learning_rate": 3.720984493138352e-07, + "logits/chosen": -1.4061180353164673, + "logits/rejected": -1.1851838827133179, + "logps/chosen": -112.55081176757812, + "logps/rejected": -73.29158020019531, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.294772624969482, + "rewards/margins": 4.466136455535889, + "rewards/rejected": 2.8286361694335938, + "step": 10845 + }, + { + "epoch": 1.76, + "learning_rate": 3.716010966437611e-07, + "logits/chosen": -1.0263941287994385, + "logits/rejected": -0.9653593897819519, + "logps/chosen": -60.163978576660156, + "logps/rejected": -65.82077026367188, + "loss": 0.6119, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.335071563720703, + "rewards/margins": -0.45073771476745605, + "rewards/rejected": 2.785809278488159, + "step": 10846 + }, + { + "epoch": 1.76, + "learning_rate": 3.7110406375225616e-07, + "logits/chosen": -1.2151446342468262, + "logits/rejected": -1.2151446342468262, + "logps/chosen": -46.08527374267578, + "logps/rejected": -46.08527374267578, + "loss": 0.3472, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.8444743156433105, + "rewards/margins": 0.0, + "rewards/rejected": 4.8444743156433105, + "step": 10847 + }, + { + "epoch": 1.76, + "learning_rate": 3.706073506736624e-07, + "logits/chosen": -1.336572527885437, + "logits/rejected": -1.263132929801941, + "logps/chosen": -51.071319580078125, + "logps/rejected": -75.0633773803711, + "loss": 0.4908, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.156729221343994, + "rewards/margins": -0.4679985046386719, + "rewards/rejected": 2.624727725982666, + "step": 10848 + }, + { + "epoch": 1.76, + "learning_rate": 3.701109574422962e-07, + "logits/chosen": -1.5531013011932373, + "logits/rejected": -1.5531013011932373, + "logps/chosen": -68.35484313964844, + "logps/rejected": -68.35484313964844, + "loss": 0.3504, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.8424391746521, + "rewards/margins": 0.0, + "rewards/rejected": 4.8424391746521, + "step": 10849 + }, + { + "epoch": 1.76, + "learning_rate": 3.6961488409245515e-07, + "logits/chosen": -1.1531040668487549, + "logits/rejected": -1.109713077545166, + "logps/chosen": -36.627593994140625, + "logps/rejected": -25.257909774780273, + "loss": 0.0978, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.117737293243408, + "rewards/margins": 1.5441575050354004, + "rewards/rejected": 0.5735797882080078, + "step": 10850 + }, + { + "epoch": 1.76, + "learning_rate": 3.6911913065841187e-07, + "logits/chosen": -1.066863775253296, + "logits/rejected": -1.0668301582336426, + "logps/chosen": -17.45579719543457, + "logps/rejected": -14.508855819702148, + "loss": 0.3841, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8041338324546814, + "rewards/margins": -0.010736644268035889, + "rewards/rejected": 0.8148704767227173, + "step": 10851 + }, + { + "epoch": 1.76, + "learning_rate": 3.6862369717441895e-07, + "logits/chosen": -1.198839545249939, + "logits/rejected": -1.2446640729904175, + "logps/chosen": -52.619346618652344, + "logps/rejected": -105.65516662597656, + "loss": 0.1944, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.772667646408081, + "rewards/margins": 0.9550292491912842, + "rewards/rejected": 1.8176383972167969, + "step": 10852 + }, + { + "epoch": 1.76, + "learning_rate": 3.6812858367470616e-07, + "logits/chosen": -1.461498498916626, + "logits/rejected": -1.4663166999816895, + "logps/chosen": -91.79766845703125, + "logps/rejected": -102.1562728881836, + "loss": 0.3473, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0820701122283936, + "rewards/margins": 0.04020833969116211, + "rewards/rejected": 2.0418617725372314, + "step": 10853 + }, + { + "epoch": 1.76, + "learning_rate": 3.676337901934812e-07, + "logits/chosen": -1.4525339603424072, + "logits/rejected": -1.5730568170547485, + "logps/chosen": -153.5110321044922, + "logps/rejected": -181.3055877685547, + "loss": 2.491, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.233378887176514, + "rewards/margins": -2.3213648796081543, + "rewards/rejected": 9.554743766784668, + "step": 10854 + }, + { + "epoch": 1.76, + "learning_rate": 3.67139316764929e-07, + "logits/chosen": -1.5619804859161377, + "logits/rejected": -1.5619804859161377, + "logps/chosen": -77.16973876953125, + "logps/rejected": -77.16973876953125, + "loss": 2.9728, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.307921886444092, + "rewards/margins": 0.0, + "rewards/rejected": 4.307921886444092, + "step": 10855 + }, + { + "epoch": 1.76, + "learning_rate": 3.6664516342321434e-07, + "logits/chosen": -1.7546494007110596, + "logits/rejected": -1.7214609384536743, + "logps/chosen": -101.31880950927734, + "logps/rejected": -61.173065185546875, + "loss": 0.1735, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1457481384277344, + "rewards/margins": 0.9532233476638794, + "rewards/rejected": 1.192524790763855, + "step": 10856 + }, + { + "epoch": 1.76, + "learning_rate": 3.661513302024766e-07, + "logits/chosen": -1.0880461931228638, + "logits/rejected": -0.9652395844459534, + "logps/chosen": -70.14400482177734, + "logps/rejected": -33.06532669067383, + "loss": 0.1867, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.253524303436279, + "rewards/margins": 2.2756218910217285, + "rewards/rejected": 1.9779022932052612, + "step": 10857 + }, + { + "epoch": 1.76, + "learning_rate": 3.656578171368369e-07, + "logits/chosen": -1.2683638334274292, + "logits/rejected": -1.3363980054855347, + "logps/chosen": -68.55174255371094, + "logps/rejected": -121.23049926757812, + "loss": 1.1961, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4738190174102783, + "rewards/margins": -2.259460687637329, + "rewards/rejected": 4.733279705047607, + "step": 10858 + }, + { + "epoch": 1.76, + "learning_rate": 3.6516462426039123e-07, + "logits/chosen": -1.2447679042816162, + "logits/rejected": -1.2447679042816162, + "logps/chosen": -53.91135787963867, + "logps/rejected": -53.91135787963867, + "loss": 1.6962, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4596638679504395, + "rewards/margins": 0.0, + "rewards/rejected": 3.4596638679504395, + "step": 10859 + }, + { + "epoch": 1.76, + "learning_rate": 3.6467175160721515e-07, + "logits/chosen": -1.3432773351669312, + "logits/rejected": -1.3432773351669312, + "logps/chosen": -33.07440948486328, + "logps/rejected": -33.07440948486328, + "loss": 1.7024, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.839625597000122, + "rewards/margins": 0.0, + "rewards/rejected": 3.839625597000122, + "step": 10860 + }, + { + "epoch": 1.76, + "learning_rate": 3.6417919921136027e-07, + "logits/chosen": -1.4740204811096191, + "logits/rejected": -1.3092966079711914, + "logps/chosen": -118.00333404541016, + "logps/rejected": -110.62361145019531, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.100673675537109, + "rewards/margins": 4.77550745010376, + "rewards/rejected": 1.3251663446426392, + "step": 10861 + }, + { + "epoch": 1.76, + "learning_rate": 3.636869671068588e-07, + "logits/chosen": -1.0767784118652344, + "logits/rejected": -0.898445188999176, + "logps/chosen": -106.23880767822266, + "logps/rejected": -15.606170654296875, + "loss": 0.0235, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3836236000061035, + "rewards/margins": 3.1048080921173096, + "rewards/rejected": 0.27881547808647156, + "step": 10862 + }, + { + "epoch": 1.76, + "learning_rate": 3.631950553277186e-07, + "logits/chosen": -1.0851162672042847, + "logits/rejected": -1.1049987077713013, + "logps/chosen": -57.5257453918457, + "logps/rejected": -107.55342102050781, + "loss": 0.3742, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8125828504562378, + "rewards/margins": 0.5963482856750488, + "rewards/rejected": 1.216234564781189, + "step": 10863 + }, + { + "epoch": 1.76, + "learning_rate": 3.627034639079258e-07, + "logits/chosen": -1.142235279083252, + "logits/rejected": -1.142235279083252, + "logps/chosen": -63.55166244506836, + "logps/rejected": -63.55166244506836, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.5121235847473145, + "rewards/margins": 0.0, + "rewards/rejected": 3.5121235847473145, + "step": 10864 + }, + { + "epoch": 1.76, + "learning_rate": 3.6221219288144596e-07, + "logits/chosen": -1.358930230140686, + "logits/rejected": -0.9115872383117676, + "logps/chosen": -82.18489074707031, + "logps/rejected": -61.816810607910156, + "loss": 0.605, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6402740478515625, + "rewards/margins": 1.154168725013733, + "rewards/rejected": 1.4861053228378296, + "step": 10865 + }, + { + "epoch": 1.76, + "learning_rate": 3.6172124228221914e-07, + "logits/chosen": -1.2247337102890015, + "logits/rejected": -1.1297917366027832, + "logps/chosen": -54.06626892089844, + "logps/rejected": -105.04805755615234, + "loss": 0.3071, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7317872047424316, + "rewards/margins": 0.7271537780761719, + "rewards/rejected": 2.0046334266662598, + "step": 10866 + }, + { + "epoch": 1.76, + "learning_rate": 3.612306121441683e-07, + "logits/chosen": -0.6495659947395325, + "logits/rejected": -0.6838775873184204, + "logps/chosen": -19.241249084472656, + "logps/rejected": -42.416717529296875, + "loss": 2.1831, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4392692744731903, + "rewards/margins": -1.7733848094940186, + "rewards/rejected": 2.2126541137695312, + "step": 10867 + }, + { + "epoch": 1.76, + "learning_rate": 3.607403025011885e-07, + "logits/chosen": -0.8798518180847168, + "logits/rejected": -0.8823743462562561, + "logps/chosen": -45.86939239501953, + "logps/rejected": -39.337547302246094, + "loss": 0.2607, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3433799743652344, + "rewards/margins": 0.42301249504089355, + "rewards/rejected": 2.920367479324341, + "step": 10868 + }, + { + "epoch": 1.76, + "learning_rate": 3.602503133871577e-07, + "logits/chosen": -1.4387917518615723, + "logits/rejected": -1.5860097408294678, + "logps/chosen": -83.10269927978516, + "logps/rejected": -36.96708297729492, + "loss": 0.3265, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.090256452560425, + "rewards/margins": 3.180690050125122, + "rewards/rejected": -0.0904335007071495, + "step": 10869 + }, + { + "epoch": 1.76, + "learning_rate": 3.5976064483592754e-07, + "logits/chosen": -1.223193645477295, + "logits/rejected": -1.1495987176895142, + "logps/chosen": -57.399681091308594, + "logps/rejected": -22.19432258605957, + "loss": 0.7762, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.920334577560425, + "rewards/margins": 2.814865827560425, + "rewards/rejected": 0.10546875, + "step": 10870 + }, + { + "epoch": 1.76, + "learning_rate": 3.592712968813311e-07, + "logits/chosen": -1.1528719663619995, + "logits/rejected": -1.185362696647644, + "logps/chosen": -61.30792999267578, + "logps/rejected": -113.48272705078125, + "loss": 0.5715, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.8869781494140625, + "rewards/margins": -0.7581772804260254, + "rewards/rejected": 6.645155429840088, + "step": 10871 + }, + { + "epoch": 1.76, + "learning_rate": 3.587822695571763e-07, + "logits/chosen": -1.3290374279022217, + "logits/rejected": -1.3199775218963623, + "logps/chosen": -61.27288055419922, + "logps/rejected": -59.2530403137207, + "loss": 0.431, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8010414838790894, + "rewards/margins": 1.0210933685302734, + "rewards/rejected": 0.7799480557441711, + "step": 10872 + }, + { + "epoch": 1.76, + "learning_rate": 3.582935628972523e-07, + "logits/chosen": -1.5383572578430176, + "logits/rejected": -1.6620285511016846, + "logps/chosen": -30.924528121948242, + "logps/rejected": -143.57565307617188, + "loss": 2.1076, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3104944229125977, + "rewards/margins": -4.164730072021484, + "rewards/rejected": 6.475224494934082, + "step": 10873 + }, + { + "epoch": 1.76, + "learning_rate": 3.578051769353219e-07, + "logits/chosen": -1.3243929147720337, + "logits/rejected": -1.1489285230636597, + "logps/chosen": -169.4804229736328, + "logps/rejected": -16.253101348876953, + "loss": 0.0277, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.118455410003662, + "rewards/margins": 5.944974422454834, + "rewards/rejected": 1.1734808683395386, + "step": 10874 + }, + { + "epoch": 1.77, + "learning_rate": 3.573171117051294e-07, + "logits/chosen": -1.7936514616012573, + "logits/rejected": -1.7681221961975098, + "logps/chosen": -128.43722534179688, + "logps/rejected": -145.4722442626953, + "loss": 0.3059, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.300776481628418, + "rewards/margins": 0.19133377075195312, + "rewards/rejected": 8.109442710876465, + "step": 10875 + }, + { + "epoch": 1.77, + "learning_rate": 3.5682936724039497e-07, + "logits/chosen": -1.3205058574676514, + "logits/rejected": -1.338286280632019, + "logps/chosen": -47.255035400390625, + "logps/rejected": -86.78863525390625, + "loss": 0.7432, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.077221632003784, + "rewards/margins": -0.003488302230834961, + "rewards/rejected": 3.080709934234619, + "step": 10876 + }, + { + "epoch": 1.77, + "learning_rate": 3.5634194357481724e-07, + "logits/chosen": -1.1811468601226807, + "logits/rejected": -1.4018193483352661, + "logps/chosen": -32.738773345947266, + "logps/rejected": -123.46044158935547, + "loss": 2.5569, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.597726821899414, + "rewards/margins": -4.900484085083008, + "rewards/rejected": 7.498210906982422, + "step": 10877 + }, + { + "epoch": 1.77, + "learning_rate": 3.55854840742072e-07, + "logits/chosen": -1.3558257818222046, + "logits/rejected": -1.2962065935134888, + "logps/chosen": -62.08064651489258, + "logps/rejected": -59.123504638671875, + "loss": 0.6229, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8345439434051514, + "rewards/margins": -0.8572454452514648, + "rewards/rejected": 3.691789388656616, + "step": 10878 + }, + { + "epoch": 1.77, + "learning_rate": 3.553680587758146e-07, + "logits/chosen": -1.0353195667266846, + "logits/rejected": -0.9994885921478271, + "logps/chosen": -65.5843505859375, + "logps/rejected": -48.92383575439453, + "loss": 0.1429, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.299093008041382, + "rewards/margins": 1.187462329864502, + "rewards/rejected": 1.1116306781768799, + "step": 10879 + }, + { + "epoch": 1.77, + "learning_rate": 3.548815977096759e-07, + "logits/chosen": -1.1858631372451782, + "logits/rejected": -1.2049508094787598, + "logps/chosen": -65.91205596923828, + "logps/rejected": -123.38993072509766, + "loss": 0.5798, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8490402698516846, + "rewards/margins": 0.5204429626464844, + "rewards/rejected": 1.3285973072052002, + "step": 10880 + }, + { + "epoch": 1.77, + "learning_rate": 3.543954575772668e-07, + "logits/chosen": -1.1005362272262573, + "logits/rejected": -1.096988320350647, + "logps/chosen": -93.07252502441406, + "logps/rejected": -77.83999633789062, + "loss": 0.93, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7156509160995483, + "rewards/margins": -1.0132721662521362, + "rewards/rejected": 2.7289230823516846, + "step": 10881 + }, + { + "epoch": 1.77, + "learning_rate": 3.539096384121743e-07, + "logits/chosen": -1.2790724039077759, + "logits/rejected": -1.289027214050293, + "logps/chosen": -73.87553405761719, + "logps/rejected": -66.10513305664062, + "loss": 0.3215, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.13181471824646, + "rewards/margins": 0.11444401741027832, + "rewards/rejected": 3.0173707008361816, + "step": 10882 + }, + { + "epoch": 1.77, + "learning_rate": 3.534241402479643e-07, + "logits/chosen": -1.3609297275543213, + "logits/rejected": -1.261707067489624, + "logps/chosen": -108.45030212402344, + "logps/rejected": -34.9134407043457, + "loss": 0.032, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8075501918792725, + "rewards/margins": 2.9205288887023926, + "rewards/rejected": -0.1129787489771843, + "step": 10883 + }, + { + "epoch": 1.77, + "learning_rate": 3.5293896311817955e-07, + "logits/chosen": -1.4940776824951172, + "logits/rejected": -1.4752287864685059, + "logps/chosen": -54.3165397644043, + "logps/rejected": -61.21672439575195, + "loss": 0.7183, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.198850154876709, + "rewards/margins": -0.09825563430786133, + "rewards/rejected": 4.29710578918457, + "step": 10884 + }, + { + "epoch": 1.77, + "learning_rate": 3.5245410705634085e-07, + "logits/chosen": -0.8967084884643555, + "logits/rejected": -0.9658462405204773, + "logps/chosen": -26.633129119873047, + "logps/rejected": -68.75833892822266, + "loss": 0.5072, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6323028802871704, + "rewards/margins": -0.18641430139541626, + "rewards/rejected": 0.8187171816825867, + "step": 10885 + }, + { + "epoch": 1.77, + "learning_rate": 3.5196957209594875e-07, + "logits/chosen": -1.3632179498672485, + "logits/rejected": -1.2862577438354492, + "logps/chosen": -33.94950866699219, + "logps/rejected": -21.73771095275879, + "loss": 0.0854, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2422127723693848, + "rewards/margins": 2.477325439453125, + "rewards/rejected": 0.7648874521255493, + "step": 10886 + }, + { + "epoch": 1.77, + "learning_rate": 3.5148535827047856e-07, + "logits/chosen": -1.6733968257904053, + "logits/rejected": -1.5888561010360718, + "logps/chosen": -158.45977783203125, + "logps/rejected": -158.39645385742188, + "loss": 0.5252, + "rewards/accuracies": 0.0, + "rewards/chosen": 8.200186729431152, + "rewards/margins": -0.6164484024047852, + "rewards/rejected": 8.816635131835938, + "step": 10887 + }, + { + "epoch": 1.77, + "learning_rate": 3.5100146561338645e-07, + "logits/chosen": -1.3874931335449219, + "logits/rejected": -1.3948713541030884, + "logps/chosen": -106.77699279785156, + "logps/rejected": -91.2906723022461, + "loss": 0.5759, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.448431491851807, + "rewards/margins": -0.17699337005615234, + "rewards/rejected": 6.625424861907959, + "step": 10888 + }, + { + "epoch": 1.77, + "learning_rate": 3.505178941581028e-07, + "logits/chosen": -1.7385509014129639, + "logits/rejected": -1.61155366897583, + "logps/chosen": -133.56777954101562, + "logps/rejected": -90.97293090820312, + "loss": 0.4857, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.200644016265869, + "rewards/margins": 0.9635648727416992, + "rewards/rejected": 6.23707914352417, + "step": 10889 + }, + { + "epoch": 1.77, + "learning_rate": 3.500346439380398e-07, + "logits/chosen": -1.4752370119094849, + "logits/rejected": -1.5451550483703613, + "logps/chosen": -60.43396759033203, + "logps/rejected": -69.19589233398438, + "loss": 2.1982, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1178977489471436, + "rewards/margins": -2.7439401149749756, + "rewards/rejected": 4.861837863922119, + "step": 10890 + }, + { + "epoch": 1.77, + "learning_rate": 3.4955171498658345e-07, + "logits/chosen": -1.496891975402832, + "logits/rejected": -1.45811927318573, + "logps/chosen": -81.77024841308594, + "logps/rejected": -130.70391845703125, + "loss": 1.4114, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.685161113739014, + "rewards/margins": -1.8140349388122559, + "rewards/rejected": 6.4991960525512695, + "step": 10891 + }, + { + "epoch": 1.77, + "learning_rate": 3.490691073371016e-07, + "logits/chosen": -1.2678560018539429, + "logits/rejected": -1.2791945934295654, + "logps/chosen": -66.87796020507812, + "logps/rejected": -55.19136047363281, + "loss": 0.4313, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.212982177734375, + "rewards/margins": 0.993804931640625, + "rewards/rejected": 2.21917724609375, + "step": 10892 + }, + { + "epoch": 1.77, + "learning_rate": 3.4858682102293594e-07, + "logits/chosen": -1.1247013807296753, + "logits/rejected": -1.1334911584854126, + "logps/chosen": -47.58235168457031, + "logps/rejected": -90.50991821289062, + "loss": 0.5206, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2628071308135986, + "rewards/margins": 0.4862492084503174, + "rewards/rejected": 0.7765579223632812, + "step": 10893 + }, + { + "epoch": 1.77, + "learning_rate": 3.4810485607740975e-07, + "logits/chosen": -1.5608446598052979, + "logits/rejected": -1.4643951654434204, + "logps/chosen": -76.99446105957031, + "logps/rejected": -53.43754196166992, + "loss": 0.0495, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8662126064300537, + "rewards/margins": 2.3051342964172363, + "rewards/rejected": 1.5610783100128174, + "step": 10894 + }, + { + "epoch": 1.77, + "learning_rate": 3.476232125338208e-07, + "logits/chosen": -1.3034294843673706, + "logits/rejected": -1.2879525423049927, + "logps/chosen": -60.41463851928711, + "logps/rejected": -13.399306297302246, + "loss": 0.3905, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.806925654411316, + "rewards/margins": 1.3157281875610352, + "rewards/rejected": 0.49119749665260315, + "step": 10895 + }, + { + "epoch": 1.77, + "learning_rate": 3.4714189042544755e-07, + "logits/chosen": -1.308321475982666, + "logits/rejected": -1.3256404399871826, + "logps/chosen": -71.07679748535156, + "logps/rejected": -72.3862533569336, + "loss": 0.18, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.836935520172119, + "rewards/margins": 2.0491600036621094, + "rewards/rejected": 1.7877753973007202, + "step": 10896 + }, + { + "epoch": 1.77, + "learning_rate": 3.466608897855428e-07, + "logits/chosen": -1.4772553443908691, + "logits/rejected": -1.2866371870040894, + "logps/chosen": -118.39266967773438, + "logps/rejected": -64.07735443115234, + "loss": 0.5274, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.740914821624756, + "rewards/margins": 6.521481990814209, + "rewards/rejected": 1.2194328308105469, + "step": 10897 + }, + { + "epoch": 1.77, + "learning_rate": 3.461802106473411e-07, + "logits/chosen": -1.2142724990844727, + "logits/rejected": -1.2326223850250244, + "logps/chosen": -66.59265899658203, + "logps/rejected": -76.58238983154297, + "loss": 0.5296, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4612510204315186, + "rewards/margins": 0.7064635753631592, + "rewards/rejected": 2.7547874450683594, + "step": 10898 + }, + { + "epoch": 1.77, + "learning_rate": 3.456998530440514e-07, + "logits/chosen": -1.2070472240447998, + "logits/rejected": -1.0781128406524658, + "logps/chosen": -91.54896545410156, + "logps/rejected": -68.76631927490234, + "loss": 0.2559, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.1242356300354, + "rewards/margins": 0.9825034141540527, + "rewards/rejected": 4.141732215881348, + "step": 10899 + }, + { + "epoch": 1.77, + "learning_rate": 3.4521981700886276e-07, + "logits/chosen": -0.920947790145874, + "logits/rejected": -0.920947790145874, + "logps/chosen": -55.64502716064453, + "logps/rejected": -55.64502716064453, + "loss": 1.1817, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.2852654457092285, + "rewards/margins": 0.0, + "rewards/rejected": 4.2852654457092285, + "step": 10900 + }, + { + "epoch": 1.77, + "learning_rate": 3.447401025749403e-07, + "logits/chosen": -1.2913724184036255, + "logits/rejected": -1.2883130311965942, + "logps/chosen": -40.50953674316406, + "logps/rejected": -52.35029983520508, + "loss": 0.3241, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4116661548614502, + "rewards/margins": 0.6012119650840759, + "rewards/rejected": 0.8104541897773743, + "step": 10901 + }, + { + "epoch": 1.77, + "learning_rate": 3.4426070977542914e-07, + "logits/chosen": -0.968277633190155, + "logits/rejected": -0.968277633190155, + "logps/chosen": -53.462440490722656, + "logps/rejected": -53.462440490722656, + "loss": 0.8301, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1596046686172485, + "rewards/margins": 0.0, + "rewards/rejected": 1.1596046686172485, + "step": 10902 + }, + { + "epoch": 1.77, + "learning_rate": 3.4378163864344946e-07, + "logits/chosen": -0.8935820460319519, + "logits/rejected": -0.8892764449119568, + "logps/chosen": -49.17517852783203, + "logps/rejected": -64.7191162109375, + "loss": 0.2135, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.115072727203369, + "rewards/margins": 0.9921455383300781, + "rewards/rejected": 2.122927188873291, + "step": 10903 + }, + { + "epoch": 1.77, + "learning_rate": 3.43302889212101e-07, + "logits/chosen": -1.0063544511795044, + "logits/rejected": -1.0508787631988525, + "logps/chosen": -33.09270477294922, + "logps/rejected": -107.42425537109375, + "loss": 0.0963, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0150628089904785, + "rewards/margins": 1.561673879623413, + "rewards/rejected": 0.4533889889717102, + "step": 10904 + }, + { + "epoch": 1.77, + "learning_rate": 3.4282446151446047e-07, + "logits/chosen": -1.5583183765411377, + "logits/rejected": -1.5068250894546509, + "logps/chosen": -104.54109954833984, + "logps/rejected": -67.73428344726562, + "loss": 0.7654, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5750114917755127, + "rewards/margins": 1.3300384283065796, + "rewards/rejected": 1.244973063468933, + "step": 10905 + }, + { + "epoch": 1.77, + "learning_rate": 3.423463555835843e-07, + "logits/chosen": -1.1389731168746948, + "logits/rejected": -0.6717286109924316, + "logps/chosen": -53.40056610107422, + "logps/rejected": -61.08274459838867, + "loss": 0.1647, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.916948676109314, + "rewards/margins": 1.0433323383331299, + "rewards/rejected": 0.8736163973808289, + "step": 10906 + }, + { + "epoch": 1.77, + "learning_rate": 3.418685714525022e-07, + "logits/chosen": -1.4288336038589478, + "logits/rejected": -1.5118361711502075, + "logps/chosen": -67.64933013916016, + "logps/rejected": -92.37959289550781, + "loss": 0.8153, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4578559398651123, + "rewards/margins": -1.3897287845611572, + "rewards/rejected": 4.8475847244262695, + "step": 10907 + }, + { + "epoch": 1.77, + "learning_rate": 3.4139110915422767e-07, + "logits/chosen": -1.7479145526885986, + "logits/rejected": -1.7881609201431274, + "logps/chosen": -74.97527313232422, + "logps/rejected": -92.10869598388672, + "loss": 0.5995, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3629082441329956, + "rewards/margins": -0.7771719694137573, + "rewards/rejected": 2.140080213546753, + "step": 10908 + }, + { + "epoch": 1.77, + "learning_rate": 3.409139687217461e-07, + "logits/chosen": -1.0374351739883423, + "logits/rejected": -1.036224603652954, + "logps/chosen": -1.924467921257019, + "logps/rejected": -2.5705933570861816, + "loss": 0.6306, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4761922061443329, + "rewards/margins": 0.05233454704284668, + "rewards/rejected": 0.4238576591014862, + "step": 10909 + }, + { + "epoch": 1.77, + "learning_rate": 3.4043715018802547e-07, + "logits/chosen": -1.5401198863983154, + "logits/rejected": -1.5476281642913818, + "logps/chosen": -121.09642028808594, + "logps/rejected": -101.52180480957031, + "loss": 0.2037, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.845576763153076, + "rewards/margins": 0.8015365600585938, + "rewards/rejected": 6.044040203094482, + "step": 10910 + }, + { + "epoch": 1.77, + "learning_rate": 3.399606535860078e-07, + "logits/chosen": -1.1571334600448608, + "logits/rejected": -1.1954902410507202, + "logps/chosen": -82.48330688476562, + "logps/rejected": -112.37591552734375, + "loss": 0.0561, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.032963514328003, + "rewards/margins": 2.336566925048828, + "rewards/rejected": 0.6963966488838196, + "step": 10911 + }, + { + "epoch": 1.77, + "learning_rate": 3.3948447894861624e-07, + "logits/chosen": -1.607374906539917, + "logits/rejected": -1.7926738262176514, + "logps/chosen": -210.75108337402344, + "logps/rejected": -226.328125, + "loss": 0.4507, + "rewards/accuracies": 0.0, + "rewards/chosen": 8.975404739379883, + "rewards/margins": -0.3516855239868164, + "rewards/rejected": 9.3270902633667, + "step": 10912 + }, + { + "epoch": 1.77, + "learning_rate": 3.3900862630874775e-07, + "logits/chosen": -1.5171046257019043, + "logits/rejected": -1.6189601421356201, + "logps/chosen": -177.43722534179688, + "logps/rejected": -196.0350799560547, + "loss": 1.1332, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.823359966278076, + "rewards/margins": 1.7399721145629883, + "rewards/rejected": 6.083387851715088, + "step": 10913 + }, + { + "epoch": 1.77, + "learning_rate": 3.385330956992816e-07, + "logits/chosen": -1.4744330644607544, + "logits/rejected": -1.5978872776031494, + "logps/chosen": -63.500022888183594, + "logps/rejected": -37.51152801513672, + "loss": 0.1267, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4160804748535156, + "rewards/margins": 3.3038737773895264, + "rewards/rejected": 0.11220665276050568, + "step": 10914 + }, + { + "epoch": 1.77, + "learning_rate": 3.380578871530704e-07, + "logits/chosen": -1.1462128162384033, + "logits/rejected": -1.085204005241394, + "logps/chosen": -62.72184753417969, + "logps/rejected": -37.279972076416016, + "loss": 0.6341, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.270782470703125, + "rewards/margins": -0.7704167366027832, + "rewards/rejected": 3.041199207305908, + "step": 10915 + }, + { + "epoch": 1.77, + "learning_rate": 3.3758300070294847e-07, + "logits/chosen": -0.8940314054489136, + "logits/rejected": -1.0765514373779297, + "logps/chosen": -40.7963981628418, + "logps/rejected": -243.95152282714844, + "loss": 0.6305, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1559722423553467, + "rewards/margins": -0.8787858486175537, + "rewards/rejected": 3.0347580909729004, + "step": 10916 + }, + { + "epoch": 1.77, + "learning_rate": 3.3710843638172394e-07, + "logits/chosen": -1.540898323059082, + "logits/rejected": -1.5790536403656006, + "logps/chosen": -66.10303497314453, + "logps/rejected": -72.8394546508789, + "loss": 0.9405, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.805193305015564, + "rewards/margins": -0.06964266300201416, + "rewards/rejected": 1.8748359680175781, + "step": 10917 + }, + { + "epoch": 1.77, + "learning_rate": 3.366341942221868e-07, + "logits/chosen": -0.9387064576148987, + "logits/rejected": -0.9192999601364136, + "logps/chosen": -25.502492904663086, + "logps/rejected": -2.3647730350494385, + "loss": 0.403, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4695201814174652, + "rewards/margins": 0.015826702117919922, + "rewards/rejected": 0.4536934792995453, + "step": 10918 + }, + { + "epoch": 1.77, + "learning_rate": 3.3616027425710076e-07, + "logits/chosen": -1.2765343189239502, + "logits/rejected": -1.4193909168243408, + "logps/chosen": -48.432254791259766, + "logps/rejected": -98.97222900390625, + "loss": 2.4646, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.951772689819336, + "rewards/margins": -4.763190269470215, + "rewards/rejected": 8.71496295928955, + "step": 10919 + }, + { + "epoch": 1.77, + "learning_rate": 3.3568667651921015e-07, + "logits/chosen": -1.4037097692489624, + "logits/rejected": -1.4037097692489624, + "logps/chosen": -26.369977951049805, + "logps/rejected": -26.369977951049805, + "loss": 0.5328, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.7528581619262695, + "rewards/margins": 0.0, + "rewards/rejected": 4.7528581619262695, + "step": 10920 + }, + { + "epoch": 1.77, + "learning_rate": 3.352134010412367e-07, + "logits/chosen": -1.1024160385131836, + "logits/rejected": -1.1502554416656494, + "logps/chosen": -41.05613708496094, + "logps/rejected": -63.967185974121094, + "loss": 0.2344, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6147507429122925, + "rewards/margins": 0.607068657875061, + "rewards/rejected": 1.0076820850372314, + "step": 10921 + }, + { + "epoch": 1.77, + "learning_rate": 3.34740447855878e-07, + "logits/chosen": -1.2593944072723389, + "logits/rejected": -1.2982455492019653, + "logps/chosen": -79.39469909667969, + "logps/rejected": -98.14115905761719, + "loss": 0.6588, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4196746349334717, + "rewards/margins": 0.867457389831543, + "rewards/rejected": 2.5522172451019287, + "step": 10922 + }, + { + "epoch": 1.77, + "learning_rate": 3.342678169958119e-07, + "logits/chosen": -1.0971243381500244, + "logits/rejected": -1.0879487991333008, + "logps/chosen": -40.7994384765625, + "logps/rejected": -1.5472263097763062, + "loss": 2.3822, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.038418959826231, + "rewards/margins": -0.5858253836631775, + "rewards/rejected": 0.6242443323135376, + "step": 10923 + }, + { + "epoch": 1.77, + "learning_rate": 3.3379550849369114e-07, + "logits/chosen": -1.2839696407318115, + "logits/rejected": -1.4439221620559692, + "logps/chosen": -64.82205200195312, + "logps/rejected": -113.18436431884766, + "loss": 2.1833, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.100860595703125, + "rewards/margins": -3.1652979850769043, + "rewards/rejected": 6.266158580780029, + "step": 10924 + }, + { + "epoch": 1.77, + "learning_rate": 3.333235223821502e-07, + "logits/chosen": -1.177422285079956, + "logits/rejected": -1.083085298538208, + "logps/chosen": -52.99019241333008, + "logps/rejected": -30.648683547973633, + "loss": 1.5785, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.478994369506836, + "rewards/margins": 0.7799031734466553, + "rewards/rejected": 1.6990911960601807, + "step": 10925 + }, + { + "epoch": 1.77, + "learning_rate": 3.3285185869379567e-07, + "logits/chosen": -1.522377371788025, + "logits/rejected": -1.3064922094345093, + "logps/chosen": -197.182373046875, + "logps/rejected": -38.6683235168457, + "loss": 0.1344, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.4768218994140625, + "rewards/margins": 3.9456660747528076, + "rewards/rejected": 0.5311557650566101, + "step": 10926 + }, + { + "epoch": 1.77, + "learning_rate": 3.3238051746121827e-07, + "logits/chosen": -1.0779035091400146, + "logits/rejected": -0.9987031817436218, + "logps/chosen": -44.173240661621094, + "logps/rejected": -58.651241302490234, + "loss": 0.448, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.01265025138855, + "rewards/margins": -0.35367250442504883, + "rewards/rejected": 3.3663227558135986, + "step": 10927 + }, + { + "epoch": 1.77, + "learning_rate": 3.319094987169802e-07, + "logits/chosen": -1.3830478191375732, + "logits/rejected": -1.3238297700881958, + "logps/chosen": -61.397064208984375, + "logps/rejected": -21.132612228393555, + "loss": 0.2648, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.60888671875, + "rewards/margins": 0.39160823822021484, + "rewards/rejected": 1.2172784805297852, + "step": 10928 + }, + { + "epoch": 1.77, + "learning_rate": 3.3143880249362714e-07, + "logits/chosen": -1.2070640325546265, + "logits/rejected": -1.2770217657089233, + "logps/chosen": -25.423213958740234, + "logps/rejected": -64.12972259521484, + "loss": 1.2142, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.104536771774292, + "rewards/margins": -1.836465835571289, + "rewards/rejected": 3.941002607345581, + "step": 10929 + }, + { + "epoch": 1.77, + "learning_rate": 3.309684288236775e-07, + "logits/chosen": -1.4229611158370972, + "logits/rejected": -1.464535117149353, + "logps/chosen": -96.57405853271484, + "logps/rejected": -242.30299377441406, + "loss": 0.2169, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.322863578796387, + "rewards/margins": 0.6663222312927246, + "rewards/rejected": 7.656541347503662, + "step": 10930 + }, + { + "epoch": 1.77, + "learning_rate": 3.30498377739632e-07, + "logits/chosen": -1.0186277627944946, + "logits/rejected": -1.0186277627944946, + "logps/chosen": -32.739044189453125, + "logps/rejected": -32.739044189453125, + "loss": 0.6805, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0468018054962158, + "rewards/margins": 0.0, + "rewards/rejected": 1.0468018054962158, + "step": 10931 + }, + { + "epoch": 1.77, + "learning_rate": 3.3002864927396404e-07, + "logits/chosen": -1.0499602556228638, + "logits/rejected": -1.0371427536010742, + "logps/chosen": -36.83660888671875, + "logps/rejected": -40.626502990722656, + "loss": 0.2129, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.52714467048645, + "rewards/margins": 1.2280113697052002, + "rewards/rejected": 1.29913330078125, + "step": 10932 + }, + { + "epoch": 1.77, + "learning_rate": 3.295592434591294e-07, + "logits/chosen": -1.4128220081329346, + "logits/rejected": -1.5077745914459229, + "logps/chosen": -71.765625, + "logps/rejected": -91.39563751220703, + "loss": 0.5587, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.443730115890503, + "rewards/margins": 0.47412335872650146, + "rewards/rejected": 1.9696067571640015, + "step": 10933 + }, + { + "epoch": 1.77, + "learning_rate": 3.290901603275587e-07, + "logits/chosen": -1.297857403755188, + "logits/rejected": -1.2617756128311157, + "logps/chosen": -111.90426635742188, + "logps/rejected": -114.37877655029297, + "loss": 0.5899, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.05930495262146, + "rewards/margins": 0.26488423347473145, + "rewards/rejected": 2.7944207191467285, + "step": 10934 + }, + { + "epoch": 1.77, + "learning_rate": 3.2862139991166165e-07, + "logits/chosen": -1.0681167840957642, + "logits/rejected": -1.14067542552948, + "logps/chosen": -18.26883316040039, + "logps/rejected": -49.42718505859375, + "loss": 0.8889, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7354564666748047, + "rewards/margins": -1.5357251167297363, + "rewards/rejected": 3.271181583404541, + "step": 10935 + }, + { + "epoch": 1.78, + "learning_rate": 3.28152962243824e-07, + "logits/chosen": -1.319532871246338, + "logits/rejected": -1.2548812627792358, + "logps/chosen": -66.92596435546875, + "logps/rejected": -43.39204025268555, + "loss": 0.8606, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.103541612625122, + "rewards/margins": -1.3162035942077637, + "rewards/rejected": 3.4197452068328857, + "step": 10936 + }, + { + "epoch": 1.78, + "learning_rate": 3.27684847356412e-07, + "logits/chosen": -1.0528761148452759, + "logits/rejected": -0.900136411190033, + "logps/chosen": -49.65111541748047, + "logps/rejected": -17.332622528076172, + "loss": 0.0548, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.831667423248291, + "rewards/margins": 2.40997314453125, + "rewards/rejected": 0.42169418931007385, + "step": 10937 + }, + { + "epoch": 1.78, + "learning_rate": 3.2721705528176715e-07, + "logits/chosen": -1.535093069076538, + "logits/rejected": -1.4410651922225952, + "logps/chosen": -28.804180145263672, + "logps/rejected": -34.795841217041016, + "loss": 1.6322, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.292132616043091, + "rewards/margins": 1.90570068359375, + "rewards/rejected": 0.38643190264701843, + "step": 10938 + }, + { + "epoch": 1.78, + "learning_rate": 3.2674958605220965e-07, + "logits/chosen": -1.4804669618606567, + "logits/rejected": -1.4804669618606567, + "logps/chosen": -74.92794036865234, + "logps/rejected": -74.92794036865234, + "loss": 0.366, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.311326026916504, + "rewards/margins": 0.0, + "rewards/rejected": 7.311326026916504, + "step": 10939 + }, + { + "epoch": 1.78, + "learning_rate": 3.2628243970003635e-07, + "logits/chosen": -1.294782280921936, + "logits/rejected": -1.2248016595840454, + "logps/chosen": -135.44268798828125, + "logps/rejected": -85.58795928955078, + "loss": 0.9136, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.506967067718506, + "rewards/margins": -0.5468316078186035, + "rewards/rejected": 6.053798675537109, + "step": 10940 + }, + { + "epoch": 1.78, + "learning_rate": 3.2581561625752435e-07, + "logits/chosen": -0.8271697759628296, + "logits/rejected": -0.8316076397895813, + "logps/chosen": -18.008390426635742, + "logps/rejected": -24.02039337158203, + "loss": 0.5958, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7563284039497375, + "rewards/margins": -0.45208650827407837, + "rewards/rejected": 1.208414912223816, + "step": 10941 + }, + { + "epoch": 1.78, + "learning_rate": 3.253491157569244e-07, + "logits/chosen": -1.0627330541610718, + "logits/rejected": -1.0010530948638916, + "logps/chosen": -59.70275115966797, + "logps/rejected": -116.0394287109375, + "loss": 0.5176, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2072746753692627, + "rewards/margins": 0.21467053890228271, + "rewards/rejected": 1.99260413646698, + "step": 10942 + }, + { + "epoch": 1.78, + "learning_rate": 3.2488293823046967e-07, + "logits/chosen": -0.9086506962776184, + "logits/rejected": -0.9081320762634277, + "logps/chosen": -2.0225048065185547, + "logps/rejected": -0.6438630223274231, + "loss": 0.4863, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.18849559128284454, + "rewards/margins": -0.036340922117233276, + "rewards/rejected": 0.22483651340007782, + "step": 10943 + }, + { + "epoch": 1.78, + "learning_rate": 3.244170837103672e-07, + "logits/chosen": -1.6827988624572754, + "logits/rejected": -1.681565523147583, + "logps/chosen": -93.80513000488281, + "logps/rejected": -99.10638427734375, + "loss": 0.7707, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.877825975418091, + "rewards/margins": -0.8407974243164062, + "rewards/rejected": 3.718623399734497, + "step": 10944 + }, + { + "epoch": 1.78, + "learning_rate": 3.2395155222880335e-07, + "logits/chosen": -1.0689269304275513, + "logits/rejected": -0.942753791809082, + "logps/chosen": -48.10686492919922, + "logps/rejected": -35.18262481689453, + "loss": 1.3262, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5894906520843506, + "rewards/margins": 1.2177151441574097, + "rewards/rejected": 1.371775507926941, + "step": 10945 + }, + { + "epoch": 1.78, + "learning_rate": 3.23486343817942e-07, + "logits/chosen": -1.268106460571289, + "logits/rejected": -1.2026798725128174, + "logps/chosen": -49.15216064453125, + "logps/rejected": -29.308990478515625, + "loss": 0.9498, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3987159729003906, + "rewards/margins": -0.742476224899292, + "rewards/rejected": 2.1411921977996826, + "step": 10946 + }, + { + "epoch": 1.78, + "learning_rate": 3.2302145850992504e-07, + "logits/chosen": -1.4581279754638672, + "logits/rejected": -1.4958916902542114, + "logps/chosen": -138.81124877929688, + "logps/rejected": -134.8321990966797, + "loss": 0.7592, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.697699069976807, + "rewards/margins": -0.7286515235900879, + "rewards/rejected": 7.4263505935668945, + "step": 10947 + }, + { + "epoch": 1.78, + "learning_rate": 3.2255689633687084e-07, + "logits/chosen": -1.504554271697998, + "logits/rejected": -1.527424693107605, + "logps/chosen": -32.60063171386719, + "logps/rejected": -67.61978149414062, + "loss": 0.2166, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5923194885253906, + "rewards/margins": 0.6152877807617188, + "rewards/rejected": 1.9770317077636719, + "step": 10948 + }, + { + "epoch": 1.78, + "learning_rate": 3.220926573308775e-07, + "logits/chosen": -1.5586605072021484, + "logits/rejected": -1.6075371503829956, + "logps/chosen": -75.30984497070312, + "logps/rejected": -156.9908905029297, + "loss": 2.5301, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.547486782073975, + "rewards/margins": -4.854645252227783, + "rewards/rejected": 9.402132034301758, + "step": 10949 + }, + { + "epoch": 1.78, + "learning_rate": 3.2162874152401824e-07, + "logits/chosen": -1.606677770614624, + "logits/rejected": -1.582925796508789, + "logps/chosen": -52.43600082397461, + "logps/rejected": -73.72947692871094, + "loss": 0.2499, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.665382146835327, + "rewards/margins": 0.5442395210266113, + "rewards/rejected": 2.121142625808716, + "step": 10950 + }, + { + "epoch": 1.78, + "learning_rate": 3.211651489483458e-07, + "logits/chosen": -1.2270846366882324, + "logits/rejected": -1.2176247835159302, + "logps/chosen": -71.03089141845703, + "logps/rejected": -72.44522857666016, + "loss": 0.1814, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6558678150177, + "rewards/margins": 0.843543291091919, + "rewards/rejected": 2.8123245239257812, + "step": 10951 + }, + { + "epoch": 1.78, + "learning_rate": 3.207018796358902e-07, + "logits/chosen": -1.3028351068496704, + "logits/rejected": -1.347183108329773, + "logps/chosen": -38.97126007080078, + "logps/rejected": -70.9585952758789, + "loss": 0.9442, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1696979999542236, + "rewards/margins": -1.2567341327667236, + "rewards/rejected": 2.4264321327209473, + "step": 10952 + }, + { + "epoch": 1.78, + "learning_rate": 3.20238933618659e-07, + "logits/chosen": -1.5566660165786743, + "logits/rejected": -1.5486005544662476, + "logps/chosen": -77.66471862792969, + "logps/rejected": -93.6436996459961, + "loss": 0.7523, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.6368727684021, + "rewards/margins": 1.9127461910247803, + "rewards/rejected": 2.7241265773773193, + "step": 10953 + }, + { + "epoch": 1.78, + "learning_rate": 3.1977631092863613e-07, + "logits/chosen": -1.7336496114730835, + "logits/rejected": -1.6417120695114136, + "logps/chosen": -99.79032135009766, + "logps/rejected": -47.73017883300781, + "loss": 0.7155, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6880897283554077, + "rewards/margins": -1.082037329673767, + "rewards/rejected": 2.770127058029175, + "step": 10954 + }, + { + "epoch": 1.78, + "learning_rate": 3.1931401159778663e-07, + "logits/chosen": -1.4060168266296387, + "logits/rejected": -1.2110896110534668, + "logps/chosen": -148.3221893310547, + "logps/rejected": -187.17189025878906, + "loss": 0.113, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.755366802215576, + "rewards/margins": 1.3825013637542725, + "rewards/rejected": 3.3728654384613037, + "step": 10955 + }, + { + "epoch": 1.78, + "learning_rate": 3.1885203565804936e-07, + "logits/chosen": -1.5699840784072876, + "logits/rejected": -1.5566006898880005, + "logps/chosen": -115.31375122070312, + "logps/rejected": -159.0033721923828, + "loss": 0.0688, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.502781867980957, + "rewards/margins": 1.9491639137268066, + "rewards/rejected": 6.55361795425415, + "step": 10956 + }, + { + "epoch": 1.78, + "learning_rate": 3.1839038314134263e-07, + "logits/chosen": -1.4111099243164062, + "logits/rejected": -1.5154125690460205, + "logps/chosen": -53.56221008300781, + "logps/rejected": -96.45869445800781, + "loss": 0.4629, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.8047072887420654, + "rewards/margins": -0.36835646629333496, + "rewards/rejected": 4.1730637550354, + "step": 10957 + }, + { + "epoch": 1.78, + "learning_rate": 3.179290540795632e-07, + "logits/chosen": -1.5501118898391724, + "logits/rejected": -1.3063664436340332, + "logps/chosen": -113.77548217773438, + "logps/rejected": -21.89426040649414, + "loss": 0.0208, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5670135021209717, + "rewards/margins": 3.2789599895477295, + "rewards/rejected": 0.2880535125732422, + "step": 10958 + }, + { + "epoch": 1.78, + "learning_rate": 3.1746804850458336e-07, + "logits/chosen": -1.6705619096755981, + "logits/rejected": -1.6734391450881958, + "logps/chosen": -80.92211151123047, + "logps/rejected": -153.03396606445312, + "loss": 2.2366, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.823122501373291, + "rewards/margins": -4.412242889404297, + "rewards/rejected": 7.235365390777588, + "step": 10959 + }, + { + "epoch": 1.78, + "learning_rate": 3.170073664482559e-07, + "logits/chosen": -1.3604389429092407, + "logits/rejected": -1.335694432258606, + "logps/chosen": -57.55738830566406, + "logps/rejected": -57.278717041015625, + "loss": 1.0038, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5180511474609375, + "rewards/margins": -1.849853515625, + "rewards/rejected": 4.3679046630859375, + "step": 10960 + }, + { + "epoch": 1.78, + "learning_rate": 3.1654700794240765e-07, + "logits/chosen": -1.3227043151855469, + "logits/rejected": -1.2790398597717285, + "logps/chosen": -64.77465057373047, + "logps/rejected": -56.000465393066406, + "loss": 0.4766, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.694150686264038, + "rewards/margins": 1.1846879720687866, + "rewards/rejected": 1.5094627141952515, + "step": 10961 + }, + { + "epoch": 1.78, + "learning_rate": 3.160869730188465e-07, + "logits/chosen": -1.2363626956939697, + "logits/rejected": -1.2078406810760498, + "logps/chosen": -98.9500961303711, + "logps/rejected": -67.42916870117188, + "loss": 1.2933, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1108330488204956, + "rewards/margins": -0.9911490678787231, + "rewards/rejected": 2.1019821166992188, + "step": 10962 + }, + { + "epoch": 1.78, + "learning_rate": 3.156272617093553e-07, + "logits/chosen": -0.9277005195617676, + "logits/rejected": -0.952913224697113, + "logps/chosen": -50.04475402832031, + "logps/rejected": -79.35258483886719, + "loss": 0.5329, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.226983666419983, + "rewards/margins": -0.4068237543106079, + "rewards/rejected": 1.6338074207305908, + "step": 10963 + }, + { + "epoch": 1.78, + "learning_rate": 3.1516787404569706e-07, + "logits/chosen": -1.2663592100143433, + "logits/rejected": -1.3509771823883057, + "logps/chosen": -133.63555908203125, + "logps/rejected": -108.06092834472656, + "loss": 0.8932, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.5337066650390625, + "rewards/margins": -1.5589923858642578, + "rewards/rejected": 8.09269905090332, + "step": 10964 + }, + { + "epoch": 1.78, + "learning_rate": 3.1470881005960964e-07, + "logits/chosen": -1.276928424835205, + "logits/rejected": -1.2558891773223877, + "logps/chosen": -29.92889404296875, + "logps/rejected": -55.575416564941406, + "loss": 0.7912, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8066391944885254, + "rewards/margins": -1.0736892223358154, + "rewards/rejected": 3.880328416824341, + "step": 10965 + }, + { + "epoch": 1.78, + "learning_rate": 3.142500697828116e-07, + "logits/chosen": -1.3972896337509155, + "logits/rejected": -1.3972896337509155, + "logps/chosen": -64.44715881347656, + "logps/rejected": -64.44715881347656, + "loss": 0.8342, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.799002170562744, + "rewards/margins": 0.0, + "rewards/rejected": 2.799002170562744, + "step": 10966 + }, + { + "epoch": 1.78, + "learning_rate": 3.137916532469959e-07, + "logits/chosen": -1.042528748512268, + "logits/rejected": -1.027126669883728, + "logps/chosen": -63.17836380004883, + "logps/rejected": -50.29823303222656, + "loss": 0.2756, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.527900457382202, + "rewards/margins": 2.028127908706665, + "rewards/rejected": 0.4997726380825043, + "step": 10967 + }, + { + "epoch": 1.78, + "learning_rate": 3.1333356048383666e-07, + "logits/chosen": -1.2425718307495117, + "logits/rejected": -1.2044974565505981, + "logps/chosen": -77.27528381347656, + "logps/rejected": -51.61228561401367, + "loss": 0.902, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.440300703048706, + "rewards/margins": 1.2124195098876953, + "rewards/rejected": 2.2278811931610107, + "step": 10968 + }, + { + "epoch": 1.78, + "learning_rate": 3.128757915249814e-07, + "logits/chosen": -1.3906155824661255, + "logits/rejected": -1.337270736694336, + "logps/chosen": -76.70074462890625, + "logps/rejected": -67.58180236816406, + "loss": 0.5116, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5641487836837769, + "rewards/margins": -0.5110260248184204, + "rewards/rejected": 2.0751748085021973, + "step": 10969 + }, + { + "epoch": 1.78, + "learning_rate": 3.1241834640206025e-07, + "logits/chosen": -1.401784896850586, + "logits/rejected": -1.4316779375076294, + "logps/chosen": -72.02656555175781, + "logps/rejected": -121.58283996582031, + "loss": 1.7301, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7679741382598877, + "rewards/margins": 0.013254642486572266, + "rewards/rejected": 2.7547194957733154, + "step": 10970 + }, + { + "epoch": 1.78, + "learning_rate": 3.1196122514667635e-07, + "logits/chosen": -1.2180836200714111, + "logits/rejected": -1.217560887336731, + "logps/chosen": -49.71700668334961, + "logps/rejected": -101.08038330078125, + "loss": 0.6615, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4255177974700928, + "rewards/margins": -0.8651027679443359, + "rewards/rejected": 3.2906205654144287, + "step": 10971 + }, + { + "epoch": 1.78, + "learning_rate": 3.115044277904139e-07, + "logits/chosen": -1.3131824731826782, + "logits/rejected": -1.3561502695083618, + "logps/chosen": -48.29557800292969, + "logps/rejected": -67.51155853271484, + "loss": 1.8903, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9322526454925537, + "rewards/margins": -3.6585028171539307, + "rewards/rejected": 6.590755462646484, + "step": 10972 + }, + { + "epoch": 1.78, + "learning_rate": 3.110479543648315e-07, + "logits/chosen": -1.378319263458252, + "logits/rejected": -1.430608868598938, + "logps/chosen": -69.27000427246094, + "logps/rejected": -53.23231506347656, + "loss": 0.1946, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2046432495117188, + "rewards/margins": 0.8337295055389404, + "rewards/rejected": 2.3709137439727783, + "step": 10973 + }, + { + "epoch": 1.78, + "learning_rate": 3.105918049014689e-07, + "logits/chosen": -0.9731219410896301, + "logits/rejected": -0.9725675582885742, + "logps/chosen": -10.867105484008789, + "logps/rejected": -5.295690536499023, + "loss": 0.3925, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.12101965397596359, + "rewards/margins": -0.1340102255344391, + "rewards/rejected": 0.2550298869609833, + "step": 10974 + }, + { + "epoch": 1.78, + "learning_rate": 3.101359794318404e-07, + "logits/chosen": -1.2098525762557983, + "logits/rejected": -1.1748685836791992, + "logps/chosen": -45.7950325012207, + "logps/rejected": -16.390491485595703, + "loss": 0.4757, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6223713159561157, + "rewards/margins": -0.033121466636657715, + "rewards/rejected": 0.6554927825927734, + "step": 10975 + }, + { + "epoch": 1.78, + "learning_rate": 3.096804779874407e-07, + "logits/chosen": -1.468105673789978, + "logits/rejected": -1.120840072631836, + "logps/chosen": -114.54046630859375, + "logps/rejected": -56.314979553222656, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.639420986175537, + "rewards/margins": 5.9065470695495605, + "rewards/rejected": 0.7328739166259766, + "step": 10976 + }, + { + "epoch": 1.78, + "learning_rate": 3.0922530059973863e-07, + "logits/chosen": -1.2281044721603394, + "logits/rejected": -1.2281044721603394, + "logps/chosen": -56.198814392089844, + "logps/rejected": -56.198814392089844, + "loss": 0.4556, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.263902187347412, + "rewards/margins": 0.0, + "rewards/rejected": 4.263902187347412, + "step": 10977 + }, + { + "epoch": 1.78, + "learning_rate": 3.0877044730018515e-07, + "logits/chosen": -1.3606661558151245, + "logits/rejected": -1.5386813879013062, + "logps/chosen": -122.67546081542969, + "logps/rejected": -104.24783325195312, + "loss": 0.5569, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.328578472137451, + "rewards/margins": 2.4159913063049316, + "rewards/rejected": 4.9125871658325195, + "step": 10978 + }, + { + "epoch": 1.78, + "learning_rate": 3.083159181202039e-07, + "logits/chosen": -1.2713069915771484, + "logits/rejected": -1.125933289527893, + "logps/chosen": -133.32894897460938, + "logps/rejected": -103.19373321533203, + "loss": 0.5407, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.301175117492676, + "rewards/margins": 3.0358285903930664, + "rewards/rejected": 3.2653465270996094, + "step": 10979 + }, + { + "epoch": 1.78, + "learning_rate": 3.0786171309119985e-07, + "logits/chosen": -0.9675487279891968, + "logits/rejected": -0.9913957715034485, + "logps/chosen": -46.18263626098633, + "logps/rejected": -36.29203414916992, + "loss": 1.1804, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6277241706848145, + "rewards/margins": 1.106084942817688, + "rewards/rejected": 1.5216392278671265, + "step": 10980 + }, + { + "epoch": 1.78, + "learning_rate": 3.0740783224455397e-07, + "logits/chosen": -0.9212191700935364, + "logits/rejected": -1.0545903444290161, + "logps/chosen": -73.80236053466797, + "logps/rejected": -85.86412048339844, + "loss": 2.942, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8623565435409546, + "rewards/margins": -2.4890947341918945, + "rewards/rejected": 4.351451396942139, + "step": 10981 + }, + { + "epoch": 1.78, + "learning_rate": 3.0695427561162563e-07, + "logits/chosen": -1.1635303497314453, + "logits/rejected": -1.1635303497314453, + "logps/chosen": -54.04046630859375, + "logps/rejected": -54.04046630859375, + "loss": 0.4382, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9365272521972656, + "rewards/margins": 0.0, + "rewards/rejected": 2.9365272521972656, + "step": 10982 + }, + { + "epoch": 1.78, + "learning_rate": 3.0650104322374976e-07, + "logits/chosen": -1.0504364967346191, + "logits/rejected": -1.0504364967346191, + "logps/chosen": -2.2532904148101807, + "logps/rejected": -2.2532904148101807, + "loss": 0.3479, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3382229506969452, + "rewards/margins": 0.0, + "rewards/rejected": 0.3382229506969452, + "step": 10983 + }, + { + "epoch": 1.78, + "learning_rate": 3.0604813511224243e-07, + "logits/chosen": -1.0541073083877563, + "logits/rejected": -1.0634455680847168, + "logps/chosen": -93.77332305908203, + "logps/rejected": -83.40461730957031, + "loss": 0.0456, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8568763732910156, + "rewards/margins": 2.6939496994018555, + "rewards/rejected": 1.1629265546798706, + "step": 10984 + }, + { + "epoch": 1.78, + "learning_rate": 3.0559555130839357e-07, + "logits/chosen": -1.3486210107803345, + "logits/rejected": -1.1812814474105835, + "logps/chosen": -60.743919372558594, + "logps/rejected": -30.437435150146484, + "loss": 0.8535, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5673164129257202, + "rewards/margins": 2.1956911087036133, + "rewards/rejected": -0.6283746957778931, + "step": 10985 + }, + { + "epoch": 1.78, + "learning_rate": 3.0514329184347437e-07, + "logits/chosen": -1.4776920080184937, + "logits/rejected": -1.3654825687408447, + "logps/chosen": -90.17862701416016, + "logps/rejected": -87.73014831542969, + "loss": 0.022, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.034021854400635, + "rewards/margins": 3.1576592922210693, + "rewards/rejected": 2.8763625621795654, + "step": 10986 + }, + { + "epoch": 1.78, + "learning_rate": 3.0469135674872975e-07, + "logits/chosen": -0.655794620513916, + "logits/rejected": -0.655794620513916, + "logps/chosen": -60.63860321044922, + "logps/rejected": -60.63860321044922, + "loss": 2.3415, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.690900444984436, + "rewards/margins": 0.0, + "rewards/rejected": 1.690900444984436, + "step": 10987 + }, + { + "epoch": 1.78, + "learning_rate": 3.042397460553853e-07, + "logits/chosen": -1.1001486778259277, + "logits/rejected": -1.07852041721344, + "logps/chosen": -38.321868896484375, + "logps/rejected": -16.23638916015625, + "loss": 0.2878, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7831100821495056, + "rewards/margins": 0.2685800790786743, + "rewards/rejected": 0.5145300030708313, + "step": 10988 + }, + { + "epoch": 1.78, + "learning_rate": 3.0378845979464276e-07, + "logits/chosen": -1.238163948059082, + "logits/rejected": -1.295278549194336, + "logps/chosen": -110.13505554199219, + "logps/rejected": -102.50233459472656, + "loss": 0.3796, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.231431484222412, + "rewards/margins": -0.11163663864135742, + "rewards/rejected": 7.3430681228637695, + "step": 10989 + }, + { + "epoch": 1.78, + "learning_rate": 3.033374979976811e-07, + "logits/chosen": -0.9932044744491577, + "logits/rejected": -0.9703707098960876, + "logps/chosen": -74.80606079101562, + "logps/rejected": -17.018447875976562, + "loss": 0.3755, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6091926693916321, + "rewards/margins": -0.06944447755813599, + "rewards/rejected": 0.6786371469497681, + "step": 10990 + }, + { + "epoch": 1.78, + "learning_rate": 3.028868606956592e-07, + "logits/chosen": -1.025151014328003, + "logits/rejected": -1.1149345636367798, + "logps/chosen": -35.20789337158203, + "logps/rejected": -89.18891906738281, + "loss": 0.6372, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0566468238830566, + "rewards/margins": -0.6155149936676025, + "rewards/rejected": 2.672161817550659, + "step": 10991 + }, + { + "epoch": 1.78, + "learning_rate": 3.0243654791970945e-07, + "logits/chosen": -1.3257976770401, + "logits/rejected": -1.3070610761642456, + "logps/chosen": -26.48764419555664, + "logps/rejected": -106.0303726196289, + "loss": 0.8526, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4936695098876953, + "rewards/margins": 0.38784360885620117, + "rewards/rejected": 2.105825901031494, + "step": 10992 + }, + { + "epoch": 1.78, + "learning_rate": 3.0198655970094637e-07, + "logits/chosen": -0.6056399345397949, + "logits/rejected": -0.6073104739189148, + "logps/chosen": -3.238238573074341, + "logps/rejected": -1.2168205976486206, + "loss": 0.7162, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.49760743975639343, + "rewards/margins": -0.09025785326957703, + "rewards/rejected": 0.5878652930259705, + "step": 10993 + }, + { + "epoch": 1.78, + "learning_rate": 3.015368960704584e-07, + "logits/chosen": -1.3232417106628418, + "logits/rejected": -1.2914519309997559, + "logps/chosen": -101.24502563476562, + "logps/rejected": -180.76930236816406, + "loss": 0.2035, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.267809867858887, + "rewards/margins": 0.8406767845153809, + "rewards/rejected": 7.427133083343506, + "step": 10994 + }, + { + "epoch": 1.78, + "learning_rate": 3.0108755705931415e-07, + "logits/chosen": -1.1276702880859375, + "logits/rejected": -1.1076000928878784, + "logps/chosen": -60.84607696533203, + "logps/rejected": -27.03213119506836, + "loss": 0.3788, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.181241512298584, + "rewards/margins": 2.3685240745544434, + "rewards/rejected": 1.8127174377441406, + "step": 10995 + }, + { + "epoch": 1.78, + "learning_rate": 3.006385426985575e-07, + "logits/chosen": -1.1991099119186401, + "logits/rejected": -1.1995702981948853, + "logps/chosen": -67.50114440917969, + "logps/rejected": -80.64490509033203, + "loss": 0.287, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.47857666015625, + "rewards/margins": 0.3006095886230469, + "rewards/rejected": 2.177967071533203, + "step": 10996 + }, + { + "epoch": 1.78, + "learning_rate": 3.0018985301921265e-07, + "logits/chosen": -1.6560704708099365, + "logits/rejected": -1.4930524826049805, + "logps/chosen": -147.625732421875, + "logps/rejected": -21.358800888061523, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8836076259613037, + "rewards/margins": 3.545386552810669, + "rewards/rejected": 0.3382209837436676, + "step": 10997 + }, + { + "epoch": 1.79, + "learning_rate": 2.997414880522781e-07, + "logits/chosen": -1.0291810035705566, + "logits/rejected": -1.0080410242080688, + "logps/chosen": -73.91718292236328, + "logps/rejected": -92.24506378173828, + "loss": 0.2086, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7085899114608765, + "rewards/margins": 1.172705054283142, + "rewards/rejected": 0.5358848571777344, + "step": 10998 + }, + { + "epoch": 1.79, + "learning_rate": 2.992934478287335e-07, + "logits/chosen": -1.2624915838241577, + "logits/rejected": -0.9071478843688965, + "logps/chosen": -75.39119720458984, + "logps/rejected": -84.79931640625, + "loss": 0.3274, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.0768256187438965, + "rewards/margins": 0.10483455657958984, + "rewards/rejected": 5.971991062164307, + "step": 10999 + }, + { + "epoch": 1.79, + "learning_rate": 2.9884573237953183e-07, + "logits/chosen": -1.2313660383224487, + "logits/rejected": -1.1943103075027466, + "logps/chosen": -45.09819412231445, + "logps/rejected": -18.319202423095703, + "loss": 0.2027, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7318668365478516, + "rewards/margins": 1.4388819932937622, + "rewards/rejected": 1.2929848432540894, + "step": 11000 + }, + { + "epoch": 1.79, + "learning_rate": 2.9839834173560845e-07, + "logits/chosen": -1.0896669626235962, + "logits/rejected": -1.0830429792404175, + "logps/chosen": -18.732046127319336, + "logps/rejected": -31.024776458740234, + "loss": 0.2866, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.553013801574707, + "rewards/margins": 0.9735651016235352, + "rewards/rejected": -0.4205513000488281, + "step": 11001 + }, + { + "epoch": 1.79, + "learning_rate": 2.979512759278719e-07, + "logits/chosen": -1.414406180381775, + "logits/rejected": -1.3528456687927246, + "logps/chosen": -115.50414276123047, + "logps/rejected": -108.05411529541016, + "loss": 0.1091, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.460193634033203, + "rewards/margins": 1.4204390048980713, + "rewards/rejected": 3.039754629135132, + "step": 11002 + }, + { + "epoch": 1.79, + "learning_rate": 2.9750453498721186e-07, + "logits/chosen": -1.1920385360717773, + "logits/rejected": -1.2011560201644897, + "logps/chosen": -45.5734748840332, + "logps/rejected": -73.38203430175781, + "loss": 0.1899, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7764973640441895, + "rewards/margins": 1.4087414741516113, + "rewards/rejected": 1.3677558898925781, + "step": 11003 + }, + { + "epoch": 1.79, + "learning_rate": 2.970581189444921e-07, + "logits/chosen": -1.192337989807129, + "logits/rejected": -1.1497046947479248, + "logps/chosen": -16.243881225585938, + "logps/rejected": -18.229272842407227, + "loss": 0.7981, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5939819812774658, + "rewards/margins": -0.846644401550293, + "rewards/rejected": 2.440626382827759, + "step": 11004 + }, + { + "epoch": 1.79, + "learning_rate": 2.966120278305573e-07, + "logits/chosen": -1.302801251411438, + "logits/rejected": -1.268411636352539, + "logps/chosen": -82.7030029296875, + "logps/rejected": -89.39202117919922, + "loss": 0.3329, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5131210088729858, + "rewards/margins": 0.1208869218826294, + "rewards/rejected": 1.3922340869903564, + "step": 11005 + }, + { + "epoch": 1.79, + "learning_rate": 2.961662616762273e-07, + "logits/chosen": -1.1274040937423706, + "logits/rejected": -1.0064936876296997, + "logps/chosen": -72.657470703125, + "logps/rejected": -19.397136688232422, + "loss": 0.3076, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6082122325897217, + "rewards/margins": 0.4245166778564453, + "rewards/rejected": 2.1836955547332764, + "step": 11006 + }, + { + "epoch": 1.79, + "learning_rate": 2.957208205123008e-07, + "logits/chosen": -1.523396372795105, + "logits/rejected": -1.548769235610962, + "logps/chosen": -142.36648559570312, + "logps/rejected": -183.4669189453125, + "loss": 2.0631, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.130887031555176, + "rewards/margins": -4.091466903686523, + "rewards/rejected": 9.2223539352417, + "step": 11007 + }, + { + "epoch": 1.79, + "learning_rate": 2.9527570436955255e-07, + "logits/chosen": -0.8232628703117371, + "logits/rejected": -1.4131405353546143, + "logps/chosen": -14.853689193725586, + "logps/rejected": -67.62094116210938, + "loss": 0.8674, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7440240979194641, + "rewards/margins": -1.463921308517456, + "rewards/rejected": 2.2079453468322754, + "step": 11008 + }, + { + "epoch": 1.79, + "learning_rate": 2.9483091327873745e-07, + "logits/chosen": -1.4394879341125488, + "logits/rejected": -1.4115749597549438, + "logps/chosen": -59.12759017944336, + "logps/rejected": -81.61892700195312, + "loss": 0.3376, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.337045669555664, + "rewards/margins": 0.6418521404266357, + "rewards/rejected": 1.6951935291290283, + "step": 11009 + }, + { + "epoch": 1.79, + "learning_rate": 2.9438644727058484e-07, + "logits/chosen": -1.5519522428512573, + "logits/rejected": -1.5557115077972412, + "logps/chosen": -85.72559356689453, + "logps/rejected": -79.09706115722656, + "loss": 0.3757, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.879431962966919, + "rewards/margins": -0.11292493343353271, + "rewards/rejected": 1.9923568964004517, + "step": 11010 + }, + { + "epoch": 1.79, + "learning_rate": 2.939423063758035e-07, + "logits/chosen": -1.1273096799850464, + "logits/rejected": -1.1362628936767578, + "logps/chosen": -60.859947204589844, + "logps/rejected": -70.45315551757812, + "loss": 0.4867, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.811037540435791, + "rewards/margins": 1.4469338655471802, + "rewards/rejected": 1.3641036748886108, + "step": 11011 + }, + { + "epoch": 1.79, + "learning_rate": 2.9349849062508094e-07, + "logits/chosen": -1.25658118724823, + "logits/rejected": -1.181776523590088, + "logps/chosen": -207.94522094726562, + "logps/rejected": -124.48739624023438, + "loss": 0.0274, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.60353422164917, + "rewards/margins": 5.400232315063477, + "rewards/rejected": 1.203302025794983, + "step": 11012 + }, + { + "epoch": 1.79, + "learning_rate": 2.9305500004907786e-07, + "logits/chosen": -1.2446460723876953, + "logits/rejected": -1.1078238487243652, + "logps/chosen": -51.7082405090332, + "logps/rejected": -24.887067794799805, + "loss": 0.3747, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8030362129211426, + "rewards/margins": 2.3989620208740234, + "rewards/rejected": 0.4040742814540863, + "step": 11013 + }, + { + "epoch": 1.79, + "learning_rate": 2.926118346784379e-07, + "logits/chosen": -1.1529754400253296, + "logits/rejected": -1.1250025033950806, + "logps/chosen": -32.06932830810547, + "logps/rejected": -23.57817840576172, + "loss": 0.7564, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.832183837890625, + "rewards/margins": 0.06074059009552002, + "rewards/rejected": 1.771443247795105, + "step": 11014 + }, + { + "epoch": 1.79, + "learning_rate": 2.921689945437778e-07, + "logits/chosen": -1.576816201210022, + "logits/rejected": -1.5076969861984253, + "logps/chosen": -135.0721435546875, + "logps/rejected": -81.25239562988281, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.177430629730225, + "rewards/margins": 4.338533401489258, + "rewards/rejected": 2.8388969898223877, + "step": 11015 + }, + { + "epoch": 1.79, + "learning_rate": 2.9172647967569525e-07, + "logits/chosen": -0.9815253019332886, + "logits/rejected": -1.0332634449005127, + "logps/chosen": -35.59994888305664, + "logps/rejected": -42.001590728759766, + "loss": 1.0766, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.224226713180542, + "rewards/margins": -0.06418919563293457, + "rewards/rejected": 3.2884159088134766, + "step": 11016 + }, + { + "epoch": 1.79, + "learning_rate": 2.912842901047619e-07, + "logits/chosen": -0.9803473949432373, + "logits/rejected": -0.9803473949432373, + "logps/chosen": -1.834148645401001, + "logps/rejected": -1.834148645401001, + "loss": 0.5609, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.25745290517807007, + "rewards/margins": 0.0, + "rewards/rejected": 0.25745290517807007, + "step": 11017 + }, + { + "epoch": 1.79, + "learning_rate": 2.9084242586153e-07, + "logits/chosen": -1.321738362312317, + "logits/rejected": -1.321738362312317, + "logps/chosen": -84.36074829101562, + "logps/rejected": -84.36074829101562, + "loss": 0.3474, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.535137891769409, + "rewards/margins": 0.0, + "rewards/rejected": 3.535137891769409, + "step": 11018 + }, + { + "epoch": 1.79, + "learning_rate": 2.904008869765279e-07, + "logits/chosen": -0.6970956921577454, + "logits/rejected": -0.7188027501106262, + "logps/chosen": -60.04823303222656, + "logps/rejected": -61.80532455444336, + "loss": 0.4281, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5389763116836548, + "rewards/margins": -0.1443309783935547, + "rewards/rejected": 1.6833072900772095, + "step": 11019 + }, + { + "epoch": 1.79, + "learning_rate": 2.8995967348026235e-07, + "logits/chosen": -1.5297552347183228, + "logits/rejected": -1.4816781282424927, + "logps/chosen": -70.64125061035156, + "logps/rejected": -11.207843780517578, + "loss": 0.4859, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9253631830215454, + "rewards/margins": 1.8996862173080444, + "rewards/rejected": 0.025676919147372246, + "step": 11020 + }, + { + "epoch": 1.79, + "learning_rate": 2.895187854032161e-07, + "logits/chosen": -1.466119408607483, + "logits/rejected": -1.4119794368743896, + "logps/chosen": -42.49945068359375, + "logps/rejected": -72.86836242675781, + "loss": 1.3785, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6535584926605225, + "rewards/margins": 0.84820556640625, + "rewards/rejected": 2.8053529262542725, + "step": 11021 + }, + { + "epoch": 1.79, + "learning_rate": 2.8907822277585153e-07, + "logits/chosen": -1.389649748802185, + "logits/rejected": -1.2917994260787964, + "logps/chosen": -74.47181701660156, + "logps/rejected": -75.20834350585938, + "loss": 0.4659, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.850395202636719, + "rewards/margins": 2.1906142234802246, + "rewards/rejected": 2.659780979156494, + "step": 11022 + }, + { + "epoch": 1.79, + "learning_rate": 2.8863798562860535e-07, + "logits/chosen": -1.0966267585754395, + "logits/rejected": -1.0995349884033203, + "logps/chosen": -23.636768341064453, + "logps/rejected": -63.72167205810547, + "loss": 0.4028, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3981692790985107, + "rewards/margins": 0.6034601926803589, + "rewards/rejected": 1.7947090864181519, + "step": 11023 + }, + { + "epoch": 1.79, + "learning_rate": 2.88198073991896e-07, + "logits/chosen": -1.2737454175949097, + "logits/rejected": -1.1737703084945679, + "logps/chosen": -69.31326293945312, + "logps/rejected": -23.522064208984375, + "loss": 0.2222, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.08884596824646, + "rewards/margins": 1.6341603994369507, + "rewards/rejected": 1.4546855688095093, + "step": 11024 + }, + { + "epoch": 1.79, + "learning_rate": 2.8775848789611596e-07, + "logits/chosen": -1.07652747631073, + "logits/rejected": -1.0340856313705444, + "logps/chosen": -89.37734985351562, + "logps/rejected": -48.26704025268555, + "loss": 0.9283, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.163598656654358, + "rewards/margins": -0.557956337928772, + "rewards/rejected": 1.7215549945831299, + "step": 11025 + }, + { + "epoch": 1.79, + "learning_rate": 2.873192273716369e-07, + "logits/chosen": -1.6173464059829712, + "logits/rejected": -1.6055210828781128, + "logps/chosen": -56.26474380493164, + "logps/rejected": -60.864933013916016, + "loss": 1.2918, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.002418279647827, + "rewards/margins": -0.9999330043792725, + "rewards/rejected": 4.0023512840271, + "step": 11026 + }, + { + "epoch": 1.79, + "learning_rate": 2.868802924488068e-07, + "logits/chosen": -1.1048496961593628, + "logits/rejected": -1.1302313804626465, + "logps/chosen": -35.70783615112305, + "logps/rejected": -102.86058044433594, + "loss": 0.2035, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8526833057403564, + "rewards/margins": 0.7416130304336548, + "rewards/rejected": 1.1110702753067017, + "step": 11027 + }, + { + "epoch": 1.79, + "learning_rate": 2.864416831579536e-07, + "logits/chosen": -1.8204634189605713, + "logits/rejected": -1.8539681434631348, + "logps/chosen": -40.51982116699219, + "logps/rejected": -91.85950469970703, + "loss": 0.3744, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.572129011154175, + "rewards/margins": 0.4307982921600342, + "rewards/rejected": 2.1413307189941406, + "step": 11028 + }, + { + "epoch": 1.79, + "learning_rate": 2.860033995293793e-07, + "logits/chosen": -1.1568470001220703, + "logits/rejected": -1.1568470001220703, + "logps/chosen": -43.26459503173828, + "logps/rejected": -43.26459503173828, + "loss": 0.6901, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.414626359939575, + "rewards/margins": 0.0, + "rewards/rejected": 3.414626359939575, + "step": 11029 + }, + { + "epoch": 1.79, + "learning_rate": 2.855654415933662e-07, + "logits/chosen": -1.2415847778320312, + "logits/rejected": -1.236719012260437, + "logps/chosen": -92.88825988769531, + "logps/rejected": -58.41654586791992, + "loss": 0.2441, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6439177989959717, + "rewards/margins": 0.4865856170654297, + "rewards/rejected": 2.157332181930542, + "step": 11030 + }, + { + "epoch": 1.79, + "learning_rate": 2.851278093801718e-07, + "logits/chosen": -1.176645040512085, + "logits/rejected": -1.1297016143798828, + "logps/chosen": -57.860347747802734, + "logps/rejected": -66.4001693725586, + "loss": 0.5535, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.094015121459961, + "rewards/margins": 0.198286771774292, + "rewards/rejected": 2.895728349685669, + "step": 11031 + }, + { + "epoch": 1.79, + "learning_rate": 2.8469050292003476e-07, + "logits/chosen": -1.4806283712387085, + "logits/rejected": -1.4461363554000854, + "logps/chosen": -208.92245483398438, + "logps/rejected": -78.580078125, + "loss": 0.0991, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.939364910125732, + "rewards/margins": 3.492547035217285, + "rewards/rejected": 1.4468177556991577, + "step": 11032 + }, + { + "epoch": 1.79, + "learning_rate": 2.8425352224316583e-07, + "logits/chosen": -1.200360655784607, + "logits/rejected": -1.2080233097076416, + "logps/chosen": -125.69371032714844, + "logps/rejected": -116.79558563232422, + "loss": 4.2497, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6728012561798096, + "rewards/margins": -3.776648759841919, + "rewards/rejected": 6.4494500160217285, + "step": 11033 + }, + { + "epoch": 1.79, + "learning_rate": 2.8381686737975867e-07, + "logits/chosen": -1.0982102155685425, + "logits/rejected": -1.1158570051193237, + "logps/chosen": -9.304302215576172, + "logps/rejected": -1.0715813636779785, + "loss": 0.6881, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.18283243477344513, + "rewards/margins": -0.19666112959384918, + "rewards/rejected": 0.3794935643672943, + "step": 11034 + }, + { + "epoch": 1.79, + "learning_rate": 2.8338053835998025e-07, + "logits/chosen": -1.1058154106140137, + "logits/rejected": -1.0635077953338623, + "logps/chosen": -54.633323669433594, + "logps/rejected": -71.79637145996094, + "loss": 0.9635, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3595054149627686, + "rewards/margins": -1.318100929260254, + "rewards/rejected": 3.6776063442230225, + "step": 11035 + }, + { + "epoch": 1.79, + "learning_rate": 2.829445352139787e-07, + "logits/chosen": -1.1892778873443604, + "logits/rejected": -1.213233470916748, + "logps/chosen": -188.763671875, + "logps/rejected": -139.05953979492188, + "loss": 0.1792, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.1567230224609375, + "rewards/margins": 0.851191520690918, + "rewards/rejected": 6.3055315017700195, + "step": 11036 + }, + { + "epoch": 1.79, + "learning_rate": 2.8250885797187543e-07, + "logits/chosen": -1.2964915037155151, + "logits/rejected": -1.3417233228683472, + "logps/chosen": -145.5272979736328, + "logps/rejected": -76.38775634765625, + "loss": 0.0276, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.297431945800781, + "rewards/margins": 2.8796591758728027, + "rewards/rejected": 1.417772650718689, + "step": 11037 + }, + { + "epoch": 1.79, + "learning_rate": 2.8207350666377366e-07, + "logits/chosen": -1.3052479028701782, + "logits/rejected": -1.0582926273345947, + "logps/chosen": -136.014892578125, + "logps/rejected": -29.320167541503906, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.4855055809021, + "rewards/margins": 5.513855457305908, + "rewards/rejected": 1.9716500043869019, + "step": 11038 + }, + { + "epoch": 1.79, + "learning_rate": 2.816384813197498e-07, + "logits/chosen": -1.175583004951477, + "logits/rejected": -1.2789931297302246, + "logps/chosen": -51.74332046508789, + "logps/rejected": -59.42817306518555, + "loss": 1.2566, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6976398229599, + "rewards/margins": -1.6376785039901733, + "rewards/rejected": 3.3353183269500732, + "step": 11039 + }, + { + "epoch": 1.79, + "learning_rate": 2.8120378196986263e-07, + "logits/chosen": -1.6299772262573242, + "logits/rejected": -1.4664556980133057, + "logps/chosen": -155.3947296142578, + "logps/rejected": -53.73161315917969, + "loss": 0.5784, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.6420793533325195, + "rewards/margins": 1.6396639347076416, + "rewards/rejected": 3.002415418624878, + "step": 11040 + }, + { + "epoch": 1.79, + "learning_rate": 2.807694086441437e-07, + "logits/chosen": -0.7450892329216003, + "logits/rejected": -0.7111308574676514, + "logps/chosen": -24.48529815673828, + "logps/rejected": -20.127216339111328, + "loss": 0.2262, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7059955596923828, + "rewards/margins": 1.970668077468872, + "rewards/rejected": -0.26467248797416687, + "step": 11041 + }, + { + "epoch": 1.79, + "learning_rate": 2.8033536137260565e-07, + "logits/chosen": -0.8691930174827576, + "logits/rejected": -0.8628098964691162, + "logps/chosen": -3.6273033618927, + "logps/rejected": -2.630927562713623, + "loss": 0.498, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.36838382482528687, + "rewards/margins": -0.11970549821853638, + "rewards/rejected": 0.48808932304382324, + "step": 11042 + }, + { + "epoch": 1.79, + "learning_rate": 2.799016401852356e-07, + "logits/chosen": -1.140968680381775, + "logits/rejected": -1.0373506546020508, + "logps/chosen": -78.92231750488281, + "logps/rejected": -59.84425354003906, + "loss": 0.2506, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.893321514129639, + "rewards/margins": 2.8065638542175293, + "rewards/rejected": 3.0867576599121094, + "step": 11043 + }, + { + "epoch": 1.79, + "learning_rate": 2.7946824511200067e-07, + "logits/chosen": -1.2266467809677124, + "logits/rejected": -1.2148841619491577, + "logps/chosen": -137.69158935546875, + "logps/rejected": -97.3612289428711, + "loss": 0.865, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.612781047821045, + "rewards/margins": -0.6906013488769531, + "rewards/rejected": 7.303382396697998, + "step": 11044 + }, + { + "epoch": 1.79, + "learning_rate": 2.790351761828436e-07, + "logits/chosen": -1.2017289400100708, + "logits/rejected": -1.3301472663879395, + "logps/chosen": -80.27371215820312, + "logps/rejected": -85.6506118774414, + "loss": 2.5211, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.753248691558838, + "rewards/margins": -4.8417534828186035, + "rewards/rejected": 8.595002174377441, + "step": 11045 + }, + { + "epoch": 1.79, + "learning_rate": 2.78602433427686e-07, + "logits/chosen": -1.588598608970642, + "logits/rejected": -1.3543673753738403, + "logps/chosen": -107.32569885253906, + "logps/rejected": -14.851202964782715, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.590775966644287, + "rewards/margins": 5.757900238037109, + "rewards/rejected": 0.8328759074211121, + "step": 11046 + }, + { + "epoch": 1.79, + "learning_rate": 2.781700168764251e-07, + "logits/chosen": -1.1514312028884888, + "logits/rejected": -1.2117412090301514, + "logps/chosen": -59.61347198486328, + "logps/rejected": -47.58563995361328, + "loss": 0.568, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0996506214141846, + "rewards/margins": 1.0388588905334473, + "rewards/rejected": 2.0607917308807373, + "step": 11047 + }, + { + "epoch": 1.79, + "learning_rate": 2.7773792655893803e-07, + "logits/chosen": -1.3350621461868286, + "logits/rejected": -1.498435378074646, + "logps/chosen": -128.29879760742188, + "logps/rejected": -95.94515228271484, + "loss": 0.1115, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.92030668258667, + "rewards/margins": 3.249842405319214, + "rewards/rejected": 3.670464277267456, + "step": 11048 + }, + { + "epoch": 1.79, + "learning_rate": 2.7730616250507827e-07, + "logits/chosen": -1.4082056283950806, + "logits/rejected": -1.3513665199279785, + "logps/chosen": -59.31074523925781, + "logps/rejected": -72.72913360595703, + "loss": 0.1724, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7523727416992188, + "rewards/margins": 1.3086936473846436, + "rewards/rejected": 1.4436790943145752, + "step": 11049 + }, + { + "epoch": 1.79, + "learning_rate": 2.768747247446757e-07, + "logits/chosen": -1.0429915189743042, + "logits/rejected": -1.0726447105407715, + "logps/chosen": -60.94379425048828, + "logps/rejected": -100.49589538574219, + "loss": 0.2504, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9670968055725098, + "rewards/margins": 1.273553490638733, + "rewards/rejected": 1.6935433149337769, + "step": 11050 + }, + { + "epoch": 1.79, + "learning_rate": 2.7644361330753933e-07, + "logits/chosen": -1.4717994928359985, + "logits/rejected": -1.4598098993301392, + "logps/chosen": -71.27587890625, + "logps/rejected": -63.980506896972656, + "loss": 0.6582, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3537368774414062, + "rewards/margins": -1.0010933876037598, + "rewards/rejected": 4.354830265045166, + "step": 11051 + }, + { + "epoch": 1.79, + "learning_rate": 2.760128282234542e-07, + "logits/chosen": -1.6058602333068848, + "logits/rejected": -1.4470491409301758, + "logps/chosen": -133.59304809570312, + "logps/rejected": -54.026432037353516, + "loss": 0.226, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.87678861618042, + "rewards/margins": 2.527062177658081, + "rewards/rejected": 3.349726438522339, + "step": 11052 + }, + { + "epoch": 1.79, + "learning_rate": 2.7558236952218485e-07, + "logits/chosen": -1.405228853225708, + "logits/rejected": -1.3441131114959717, + "logps/chosen": -101.54865264892578, + "logps/rejected": -80.09577178955078, + "loss": 0.1244, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.216689348220825, + "rewards/margins": 1.2895249128341675, + "rewards/rejected": 1.9271644353866577, + "step": 11053 + }, + { + "epoch": 1.79, + "learning_rate": 2.7515223723346974e-07, + "logits/chosen": -1.1603460311889648, + "logits/rejected": -1.1603460311889648, + "logps/chosen": -8.235213279724121, + "logps/rejected": -8.235213279724121, + "loss": 0.5808, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0530561208724976, + "rewards/margins": 0.0, + "rewards/rejected": 1.0530561208724976, + "step": 11054 + }, + { + "epoch": 1.79, + "learning_rate": 2.7472243138702893e-07, + "logits/chosen": -1.4179788827896118, + "logits/rejected": -1.1528429985046387, + "logps/chosen": -108.25102233886719, + "logps/rejected": -27.40422821044922, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.553910732269287, + "rewards/margins": 7.180490493774414, + "rewards/rejected": -0.6265797019004822, + "step": 11055 + }, + { + "epoch": 1.79, + "learning_rate": 2.7429295201255643e-07, + "logits/chosen": -0.8193327784538269, + "logits/rejected": -0.9224820733070374, + "logps/chosen": -42.679447174072266, + "logps/rejected": -66.29764556884766, + "loss": 0.9155, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7300552129745483, + "rewards/margins": -0.18269729614257812, + "rewards/rejected": 1.9127525091171265, + "step": 11056 + }, + { + "epoch": 1.79, + "learning_rate": 2.738637991397264e-07, + "logits/chosen": -1.3737841844558716, + "logits/rejected": -1.4266424179077148, + "logps/chosen": -83.28841400146484, + "logps/rejected": -139.0508575439453, + "loss": 0.5355, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.976169586181641, + "rewards/margins": -0.5211315155029297, + "rewards/rejected": 8.49730110168457, + "step": 11057 + }, + { + "epoch": 1.79, + "learning_rate": 2.7343497279818833e-07, + "logits/chosen": -1.2234700918197632, + "logits/rejected": -1.2268747091293335, + "logps/chosen": -49.778934478759766, + "logps/rejected": -86.75251770019531, + "loss": 0.4956, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.465604782104492, + "rewards/margins": 1.1790698766708374, + "rewards/rejected": 1.2865349054336548, + "step": 11058 + }, + { + "epoch": 1.8, + "learning_rate": 2.730064730175708e-07, + "logits/chosen": -1.4249389171600342, + "logits/rejected": -1.363174319267273, + "logps/chosen": -130.7549591064453, + "logps/rejected": -68.80731201171875, + "loss": 0.0509, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.281691074371338, + "rewards/margins": 2.736663818359375, + "rewards/rejected": 3.545027256011963, + "step": 11059 + }, + { + "epoch": 1.8, + "learning_rate": 2.725782998274784e-07, + "logits/chosen": -1.081349492073059, + "logits/rejected": -1.081349492073059, + "logps/chosen": -27.595808029174805, + "logps/rejected": -27.595808029174805, + "loss": 0.8323, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3550150394439697, + "rewards/margins": 0.0, + "rewards/rejected": 1.3550150394439697, + "step": 11060 + }, + { + "epoch": 1.8, + "learning_rate": 2.721504532574948e-07, + "logits/chosen": -1.0691118240356445, + "logits/rejected": -1.0588394403457642, + "logps/chosen": -42.24609375, + "logps/rejected": -42.61138153076172, + "loss": 0.2783, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9245785474777222, + "rewards/margins": 0.3104095458984375, + "rewards/rejected": 1.6141690015792847, + "step": 11061 + }, + { + "epoch": 1.8, + "learning_rate": 2.7172293333717846e-07, + "logits/chosen": -1.6303201913833618, + "logits/rejected": -1.4202228784561157, + "logps/chosen": -138.0637664794922, + "logps/rejected": -78.66853332519531, + "loss": 0.4587, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.26193380355835, + "rewards/margins": 1.3514175415039062, + "rewards/rejected": 4.910516262054443, + "step": 11062 + }, + { + "epoch": 1.8, + "learning_rate": 2.7129574009606916e-07, + "logits/chosen": -1.2412201166152954, + "logits/rejected": -1.2412201166152954, + "logps/chosen": -61.793148040771484, + "logps/rejected": -61.793148040771484, + "loss": 0.5324, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.148007869720459, + "rewards/margins": 0.0, + "rewards/rejected": 5.148007869720459, + "step": 11063 + }, + { + "epoch": 1.8, + "learning_rate": 2.708688735636805e-07, + "logits/chosen": -0.8186553120613098, + "logits/rejected": -0.8206488490104675, + "logps/chosen": -102.49929809570312, + "logps/rejected": -45.014556884765625, + "loss": 0.5544, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2449378967285156, + "rewards/margins": -0.7042770385742188, + "rewards/rejected": 2.9492149353027344, + "step": 11064 + }, + { + "epoch": 1.8, + "learning_rate": 2.70442333769505e-07, + "logits/chosen": -1.52328622341156, + "logits/rejected": -1.4745278358459473, + "logps/chosen": -118.42595672607422, + "logps/rejected": -119.10868072509766, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.339066505432129, + "rewards/margins": 5.080221652984619, + "rewards/rejected": 1.2588447332382202, + "step": 11065 + }, + { + "epoch": 1.8, + "learning_rate": 2.70016120743013e-07, + "logits/chosen": -1.0497487783432007, + "logits/rejected": -1.0497487783432007, + "logps/chosen": -4.029816627502441, + "logps/rejected": -4.029816627502441, + "loss": 0.3469, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.27592459321022034, + "rewards/margins": 0.0, + "rewards/rejected": 0.27592459321022034, + "step": 11066 + }, + { + "epoch": 1.8, + "learning_rate": 2.695902345136514e-07, + "logits/chosen": -1.4518603086471558, + "logits/rejected": -1.3071057796478271, + "logps/chosen": -88.05520629882812, + "logps/rejected": -19.57685089111328, + "loss": 0.0844, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.738501787185669, + "rewards/margins": 1.9086713790893555, + "rewards/rejected": 0.8298303484916687, + "step": 11067 + }, + { + "epoch": 1.8, + "learning_rate": 2.691646751108451e-07, + "logits/chosen": -1.6477450132369995, + "logits/rejected": -1.6068459749221802, + "logps/chosen": -93.37982177734375, + "logps/rejected": -34.732460021972656, + "loss": 0.2248, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0510590076446533, + "rewards/margins": 3.9173941612243652, + "rewards/rejected": -0.8663350939750671, + "step": 11068 + }, + { + "epoch": 1.8, + "learning_rate": 2.687394425639961e-07, + "logits/chosen": -1.4006245136260986, + "logits/rejected": -1.4006245136260986, + "logps/chosen": -132.8536376953125, + "logps/rejected": -132.8536376953125, + "loss": 1.0392, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.04705810546875, + "rewards/margins": 0.0, + "rewards/rejected": 6.04705810546875, + "step": 11069 + }, + { + "epoch": 1.8, + "learning_rate": 2.683145369024837e-07, + "logits/chosen": -1.0940111875534058, + "logits/rejected": -1.436903953552246, + "logps/chosen": -59.77593994140625, + "logps/rejected": -34.3223876953125, + "loss": 0.0322, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0123703479766846, + "rewards/margins": 2.763861894607544, + "rewards/rejected": 0.24850845336914062, + "step": 11070 + }, + { + "epoch": 1.8, + "learning_rate": 2.6788995815566597e-07, + "logits/chosen": -1.3758862018585205, + "logits/rejected": -1.3758862018585205, + "logps/chosen": -40.96711730957031, + "logps/rejected": -40.96711730957031, + "loss": 0.7408, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.403996229171753, + "rewards/margins": 0.0, + "rewards/rejected": 2.403996229171753, + "step": 11071 + }, + { + "epoch": 1.8, + "learning_rate": 2.674657063528757e-07, + "logits/chosen": -1.4175769090652466, + "logits/rejected": -1.3792014122009277, + "logps/chosen": -88.04708862304688, + "logps/rejected": -56.460323333740234, + "loss": 0.4532, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2497872114181519, + "rewards/margins": 0.6427853107452393, + "rewards/rejected": 0.6070019006729126, + "step": 11072 + }, + { + "epoch": 1.8, + "learning_rate": 2.6704178152342606e-07, + "logits/chosen": -1.0476560592651367, + "logits/rejected": -1.0476560592651367, + "logps/chosen": -27.121435165405273, + "logps/rejected": -27.121435165405273, + "loss": 0.3566, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7267881631851196, + "rewards/margins": 0.0, + "rewards/rejected": 1.7267881631851196, + "step": 11073 + }, + { + "epoch": 1.8, + "learning_rate": 2.666181836966053e-07, + "logits/chosen": -1.3546918630599976, + "logits/rejected": -1.3167005777359009, + "logps/chosen": -93.03538513183594, + "logps/rejected": -77.81964874267578, + "loss": 0.297, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.789759874343872, + "rewards/margins": 0.4411590099334717, + "rewards/rejected": 3.3486008644104004, + "step": 11074 + }, + { + "epoch": 1.8, + "learning_rate": 2.6619491290168056e-07, + "logits/chosen": -1.197641134262085, + "logits/rejected": -1.2154464721679688, + "logps/chosen": -140.56466674804688, + "logps/rejected": -115.29793548583984, + "loss": 0.2031, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.708169460296631, + "rewards/margins": 0.7539482116699219, + "rewards/rejected": 5.954221248626709, + "step": 11075 + }, + { + "epoch": 1.8, + "learning_rate": 2.657719691678956e-07, + "logits/chosen": -1.08017897605896, + "logits/rejected": -1.0333731174468994, + "logps/chosen": -92.18258666992188, + "logps/rejected": -59.93267822265625, + "loss": 0.5438, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.022863745689392, + "rewards/margins": -0.6477768421173096, + "rewards/rejected": 1.6706405878067017, + "step": 11076 + }, + { + "epoch": 1.8, + "learning_rate": 2.653493525244721e-07, + "logits/chosen": -1.5358607769012451, + "logits/rejected": -1.584386944770813, + "logps/chosen": -151.578369140625, + "logps/rejected": -110.50010681152344, + "loss": 0.8773, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.9709930419921875, + "rewards/margins": -0.9608383178710938, + "rewards/rejected": 8.931831359863281, + "step": 11077 + }, + { + "epoch": 1.8, + "learning_rate": 2.6492706300060787e-07, + "logits/chosen": -1.5201791524887085, + "logits/rejected": -1.5821115970611572, + "logps/chosen": -79.92211151123047, + "logps/rejected": -186.72540283203125, + "loss": 1.2413, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.653934478759766, + "rewards/margins": -2.111691951751709, + "rewards/rejected": 6.765626430511475, + "step": 11078 + }, + { + "epoch": 1.8, + "learning_rate": 2.645051006254806e-07, + "logits/chosen": -1.4643771648406982, + "logits/rejected": -1.346221923828125, + "logps/chosen": -235.565185546875, + "logps/rejected": -60.691932678222656, + "loss": 1.0688, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2134857177734375, + "rewards/margins": 0.7512259483337402, + "rewards/rejected": 2.4622597694396973, + "step": 11079 + }, + { + "epoch": 1.8, + "learning_rate": 2.6408346542824317e-07, + "logits/chosen": -1.614051342010498, + "logits/rejected": -1.6296156644821167, + "logps/chosen": -80.88084411621094, + "logps/rejected": -67.16059875488281, + "loss": 0.3909, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.125593662261963, + "rewards/margins": 0.803238034248352, + "rewards/rejected": 1.3223556280136108, + "step": 11080 + }, + { + "epoch": 1.8, + "learning_rate": 2.636621574380266e-07, + "logits/chosen": -1.310024380683899, + "logits/rejected": -1.3298622369766235, + "logps/chosen": -94.77642822265625, + "logps/rejected": -91.1204833984375, + "loss": 0.2612, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6818283796310425, + "rewards/margins": 1.0529358386993408, + "rewards/rejected": 0.6288925409317017, + "step": 11081 + }, + { + "epoch": 1.8, + "learning_rate": 2.6324117668393876e-07, + "logits/chosen": -1.1771854162216187, + "logits/rejected": -1.240781307220459, + "logps/chosen": -65.81591796875, + "logps/rejected": -138.76113891601562, + "loss": 1.1467, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.279658794403076, + "rewards/margins": -2.1706128120422363, + "rewards/rejected": 6.4502716064453125, + "step": 11082 + }, + { + "epoch": 1.8, + "learning_rate": 2.6282052319506646e-07, + "logits/chosen": -1.2943509817123413, + "logits/rejected": -1.2951831817626953, + "logps/chosen": -132.18218994140625, + "logps/rejected": -90.44120025634766, + "loss": 0.3367, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7354583740234375, + "rewards/margins": 0.10547327995300293, + "rewards/rejected": 3.6299850940704346, + "step": 11083 + }, + { + "epoch": 1.8, + "learning_rate": 2.624001970004725e-07, + "logits/chosen": -1.8097351789474487, + "logits/rejected": -1.7543631792068481, + "logps/chosen": -57.06940460205078, + "logps/rejected": -25.37270736694336, + "loss": 0.5174, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.693692922592163, + "rewards/margins": 2.4443795680999756, + "rewards/rejected": 0.2493133544921875, + "step": 11084 + }, + { + "epoch": 1.8, + "learning_rate": 2.6198019812919693e-07, + "logits/chosen": -0.9595987796783447, + "logits/rejected": -1.0175931453704834, + "logps/chosen": -20.81827163696289, + "logps/rejected": -32.39297866821289, + "loss": 0.4379, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8886783123016357, + "rewards/margins": -0.277346134185791, + "rewards/rejected": 3.1660244464874268, + "step": 11085 + }, + { + "epoch": 1.8, + "learning_rate": 2.615605266102589e-07, + "logits/chosen": -1.0779820680618286, + "logits/rejected": -1.0864006280899048, + "logps/chosen": -61.6826171875, + "logps/rejected": -43.95696258544922, + "loss": 0.8566, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1442253589630127, + "rewards/margins": -0.3170825242996216, + "rewards/rejected": 1.4613078832626343, + "step": 11086 + }, + { + "epoch": 1.8, + "learning_rate": 2.6114118247265176e-07, + "logits/chosen": -1.4294661283493042, + "logits/rejected": -1.2335774898529053, + "logps/chosen": -86.48432922363281, + "logps/rejected": -100.71642303466797, + "loss": 0.603, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.068168640136719, + "rewards/margins": -0.09824752807617188, + "rewards/rejected": 5.166416168212891, + "step": 11087 + }, + { + "epoch": 1.8, + "learning_rate": 2.607221657453507e-07, + "logits/chosen": -1.1090519428253174, + "logits/rejected": -1.0782750844955444, + "logps/chosen": -110.39598083496094, + "logps/rejected": -75.21351623535156, + "loss": 0.3992, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5358726978302002, + "rewards/margins": 0.6053459644317627, + "rewards/rejected": 0.9305267333984375, + "step": 11088 + }, + { + "epoch": 1.8, + "learning_rate": 2.603034764573037e-07, + "logits/chosen": -1.2849924564361572, + "logits/rejected": -1.263384461402893, + "logps/chosen": -35.815086364746094, + "logps/rejected": -46.637908935546875, + "loss": 0.6673, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.606337785720825, + "rewards/margins": -0.6472358703613281, + "rewards/rejected": 3.2535736560821533, + "step": 11089 + }, + { + "epoch": 1.8, + "learning_rate": 2.5988511463743924e-07, + "logits/chosen": -1.434970736503601, + "logits/rejected": -1.3004508018493652, + "logps/chosen": -162.04421997070312, + "logps/rejected": -63.036949157714844, + "loss": 0.022, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.8261260986328125, + "rewards/margins": 4.198650360107422, + "rewards/rejected": 2.6274757385253906, + "step": 11090 + }, + { + "epoch": 1.8, + "learning_rate": 2.594670803146621e-07, + "logits/chosen": -1.1608625650405884, + "logits/rejected": -1.2372678518295288, + "logps/chosen": -31.438522338867188, + "logps/rejected": -66.4439697265625, + "loss": 0.752, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2780468463897705, + "rewards/margins": -1.2480525970458984, + "rewards/rejected": 3.526099443435669, + "step": 11091 + }, + { + "epoch": 1.8, + "learning_rate": 2.5904937351785454e-07, + "logits/chosen": -0.9534885883331299, + "logits/rejected": -1.082082748413086, + "logps/chosen": -69.78057098388672, + "logps/rejected": -89.32897186279297, + "loss": 1.5965, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0097482204437256, + "rewards/margins": -2.992173910140991, + "rewards/rejected": 5.001922130584717, + "step": 11092 + }, + { + "epoch": 1.8, + "learning_rate": 2.5863199427587526e-07, + "logits/chosen": -1.447524070739746, + "logits/rejected": -1.4887866973876953, + "logps/chosen": -112.33927917480469, + "logps/rejected": -229.29598999023438, + "loss": 0.6278, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.327925205230713, + "rewards/margins": -0.9140582084655762, + "rewards/rejected": 8.241983413696289, + "step": 11093 + }, + { + "epoch": 1.8, + "learning_rate": 2.5821494261756284e-07, + "logits/chosen": -0.5718659162521362, + "logits/rejected": -0.5552747845649719, + "logps/chosen": -2.9778380393981934, + "logps/rejected": -8.203886032104492, + "loss": 0.3224, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.292358934879303, + "rewards/margins": 0.1572452187538147, + "rewards/rejected": 0.13511371612548828, + "step": 11094 + }, + { + "epoch": 1.8, + "learning_rate": 2.577982185717304e-07, + "logits/chosen": -1.5111862421035767, + "logits/rejected": -1.497209906578064, + "logps/chosen": -163.06094360351562, + "logps/rejected": -93.77271270751953, + "loss": 2.3438, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.059818983078003, + "rewards/margins": 0.6875159740447998, + "rewards/rejected": 2.372303009033203, + "step": 11095 + }, + { + "epoch": 1.8, + "learning_rate": 2.5738182216717035e-07, + "logits/chosen": -1.2008614540100098, + "logits/rejected": -1.2008614540100098, + "logps/chosen": -59.919036865234375, + "logps/rejected": -59.919036865234375, + "loss": 0.4043, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.7720866203308105, + "rewards/margins": 0.0, + "rewards/rejected": 4.7720866203308105, + "step": 11096 + }, + { + "epoch": 1.8, + "learning_rate": 2.569657534326503e-07, + "logits/chosen": -1.2462108135223389, + "logits/rejected": -1.216710090637207, + "logps/chosen": -71.9542007446289, + "logps/rejected": -99.40596008300781, + "loss": 0.6139, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.7937188148498535, + "rewards/margins": -0.6415243148803711, + "rewards/rejected": 5.435243129730225, + "step": 11097 + }, + { + "epoch": 1.8, + "learning_rate": 2.5655001239691836e-07, + "logits/chosen": -1.2208566665649414, + "logits/rejected": -1.0027666091918945, + "logps/chosen": -55.91361999511719, + "logps/rejected": -46.522560119628906, + "loss": 0.6736, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.0183563232421875, + "rewards/margins": 1.8708839416503906, + "rewards/rejected": 2.147472381591797, + "step": 11098 + }, + { + "epoch": 1.8, + "learning_rate": 2.5613459908869707e-07, + "logits/chosen": -1.2320626974105835, + "logits/rejected": -1.2497731447219849, + "logps/chosen": -36.24739074707031, + "logps/rejected": -32.83561706542969, + "loss": 0.3178, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0871918201446533, + "rewards/margins": 0.13320469856262207, + "rewards/rejected": 0.9539871215820312, + "step": 11099 + }, + { + "epoch": 1.8, + "learning_rate": 2.557195135366891e-07, + "logits/chosen": -0.9175806045532227, + "logits/rejected": -0.8698822259902954, + "logps/chosen": -32.89401626586914, + "logps/rejected": -33.421363830566406, + "loss": 0.1308, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7447292804718018, + "rewards/margins": 1.4375015497207642, + "rewards/rejected": 1.3072277307510376, + "step": 11100 + }, + { + "epoch": 1.8, + "learning_rate": 2.5530475576957094e-07, + "logits/chosen": -1.3087249994277954, + "logits/rejected": -1.2417818307876587, + "logps/chosen": -46.371971130371094, + "logps/rejected": -33.096248626708984, + "loss": 1.4762, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8539435863494873, + "rewards/margins": 0.7379207611083984, + "rewards/rejected": 2.116022825241089, + "step": 11101 + }, + { + "epoch": 1.8, + "learning_rate": 2.5489032581600014e-07, + "logits/chosen": -1.5743237733840942, + "logits/rejected": -1.635780930519104, + "logps/chosen": -158.61549377441406, + "logps/rejected": -110.83941650390625, + "loss": 0.5423, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3833985328674316, + "rewards/margins": -0.5645630359649658, + "rewards/rejected": 3.9479615688323975, + "step": 11102 + }, + { + "epoch": 1.8, + "learning_rate": 2.544762237046083e-07, + "logits/chosen": -1.294732689857483, + "logits/rejected": -1.3358670473098755, + "logps/chosen": -70.8409652709961, + "logps/rejected": -95.2138671875, + "loss": 1.5399, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.411815166473389, + "rewards/margins": -1.1404004096984863, + "rewards/rejected": 6.552215576171875, + "step": 11103 + }, + { + "epoch": 1.8, + "learning_rate": 2.5406244946400706e-07, + "logits/chosen": -1.36640465259552, + "logits/rejected": -1.2170109748840332, + "logps/chosen": -67.51110076904297, + "logps/rejected": -44.722023010253906, + "loss": 0.0897, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.821096181869507, + "rewards/margins": 2.0855557918548584, + "rewards/rejected": 0.7355403900146484, + "step": 11104 + }, + { + "epoch": 1.8, + "learning_rate": 2.536490031227845e-07, + "logits/chosen": -1.3463451862335205, + "logits/rejected": -1.35421884059906, + "logps/chosen": -82.29783630371094, + "logps/rejected": -83.242431640625, + "loss": 0.0297, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.580860137939453, + "rewards/margins": 2.843074083328247, + "rewards/rejected": 0.7377861142158508, + "step": 11105 + }, + { + "epoch": 1.8, + "learning_rate": 2.532358847095051e-07, + "logits/chosen": -1.564587116241455, + "logits/rejected": -1.533469557762146, + "logps/chosen": -31.928203582763672, + "logps/rejected": -42.903282165527344, + "loss": 0.43, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3167388439178467, + "rewards/margins": 0.1767127513885498, + "rewards/rejected": 2.140026092529297, + "step": 11106 + }, + { + "epoch": 1.8, + "learning_rate": 2.5282309425271213e-07, + "logits/chosen": -1.3091100454330444, + "logits/rejected": -1.2597377300262451, + "logps/chosen": -129.69882202148438, + "logps/rejected": -129.9102783203125, + "loss": 0.4343, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.212475776672363, + "rewards/margins": -0.2934384346008301, + "rewards/rejected": 5.505914211273193, + "step": 11107 + }, + { + "epoch": 1.8, + "learning_rate": 2.5241063178092497e-07, + "logits/chosen": -1.143866777420044, + "logits/rejected": -1.121124267578125, + "logps/chosen": -75.97196960449219, + "logps/rejected": -45.89447784423828, + "loss": 0.0906, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.537222385406494, + "rewards/margins": 1.6436104774475098, + "rewards/rejected": 1.8936119079589844, + "step": 11108 + }, + { + "epoch": 1.8, + "learning_rate": 2.5199849732264134e-07, + "logits/chosen": -0.9465543627738953, + "logits/rejected": -0.9348503351211548, + "logps/chosen": -4.268847465515137, + "logps/rejected": -2.513087749481201, + "loss": 0.391, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.21880102157592773, + "rewards/margins": -0.12453791499137878, + "rewards/rejected": 0.3433389365673065, + "step": 11109 + }, + { + "epoch": 1.8, + "learning_rate": 2.515866909063347e-07, + "logits/chosen": -1.2210607528686523, + "logits/rejected": -1.26276695728302, + "logps/chosen": -64.15795135498047, + "logps/rejected": -82.60295867919922, + "loss": 1.4797, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8197517395019531, + "rewards/margins": -2.8934402465820312, + "rewards/rejected": 4.713191986083984, + "step": 11110 + }, + { + "epoch": 1.8, + "learning_rate": 2.5117521256045883e-07, + "logits/chosen": -1.3815058469772339, + "logits/rejected": -1.3741172552108765, + "logps/chosen": -44.692623138427734, + "logps/rejected": -57.43339538574219, + "loss": 0.6834, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3623920679092407, + "rewards/margins": 0.16840553283691406, + "rewards/rejected": 1.1939865350723267, + "step": 11111 + }, + { + "epoch": 1.8, + "learning_rate": 2.5076406231344107e-07, + "logits/chosen": -1.2200673818588257, + "logits/rejected": -1.2009787559509277, + "logps/chosen": -51.38947296142578, + "logps/rejected": -64.0937728881836, + "loss": 0.4688, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3950852155685425, + "rewards/margins": -0.40271830558776855, + "rewards/rejected": 1.797803521156311, + "step": 11112 + }, + { + "epoch": 1.8, + "learning_rate": 2.5035324019368977e-07, + "logits/chosen": -1.157417893409729, + "logits/rejected": -1.157417893409729, + "logps/chosen": -35.653316497802734, + "logps/rejected": -35.653316497802734, + "loss": 0.3582, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.649566173553467, + "rewards/margins": 0.0, + "rewards/rejected": 4.649566173553467, + "step": 11113 + }, + { + "epoch": 1.8, + "learning_rate": 2.4994274622958726e-07, + "logits/chosen": -1.3596851825714111, + "logits/rejected": -1.3477078676223755, + "logps/chosen": -56.8693733215332, + "logps/rejected": -46.608123779296875, + "loss": 0.5387, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.450331449508667, + "rewards/margins": 0.2917659282684326, + "rewards/rejected": 2.1585655212402344, + "step": 11114 + }, + { + "epoch": 1.8, + "learning_rate": 2.495325804494964e-07, + "logits/chosen": -0.670185923576355, + "logits/rejected": -0.6909637451171875, + "logps/chosen": -4.488031387329102, + "logps/rejected": -29.3367919921875, + "loss": 0.412, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.21900497376918793, + "rewards/margins": -0.20349641144275665, + "rewards/rejected": 0.4225013852119446, + "step": 11115 + }, + { + "epoch": 1.8, + "learning_rate": 2.491227428817539e-07, + "logits/chosen": -1.1902658939361572, + "logits/rejected": -1.1916087865829468, + "logps/chosen": -4.013585567474365, + "logps/rejected": -14.377485275268555, + "loss": 1.1232, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3013502061367035, + "rewards/margins": -0.7044632434844971, + "rewards/rejected": 1.005813479423523, + "step": 11116 + }, + { + "epoch": 1.8, + "learning_rate": 2.487132335546777e-07, + "logits/chosen": -1.0904093980789185, + "logits/rejected": -1.0824536085128784, + "logps/chosen": -37.253414154052734, + "logps/rejected": -85.4669418334961, + "loss": 0.2991, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6379474401474, + "rewards/margins": 0.5143977403640747, + "rewards/rejected": 1.1235496997833252, + "step": 11117 + }, + { + "epoch": 1.8, + "learning_rate": 2.483040524965591e-07, + "logits/chosen": -1.4467300176620483, + "logits/rejected": -1.436306357383728, + "logps/chosen": -126.79771423339844, + "logps/rejected": -155.83090209960938, + "loss": 1.4992, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.526561260223389, + "rewards/margins": -2.538712978363037, + "rewards/rejected": 10.065274238586426, + "step": 11118 + }, + { + "epoch": 1.8, + "learning_rate": 2.4789519973566986e-07, + "logits/chosen": -1.2603687047958374, + "logits/rejected": -1.158947229385376, + "logps/chosen": -49.10541534423828, + "logps/rejected": -67.34318542480469, + "loss": 0.2291, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.453855276107788, + "rewards/margins": 0.5975556373596191, + "rewards/rejected": 2.856299638748169, + "step": 11119 + }, + { + "epoch": 1.8, + "learning_rate": 2.4748667530025694e-07, + "logits/chosen": -1.3246761560440063, + "logits/rejected": -1.4184439182281494, + "logps/chosen": -74.61441802978516, + "logps/rejected": -80.54335021972656, + "loss": 1.1152, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.949974775314331, + "rewards/margins": -1.3616936206817627, + "rewards/rejected": 4.311668395996094, + "step": 11120 + }, + { + "epoch": 1.81, + "learning_rate": 2.470784792185471e-07, + "logits/chosen": -1.067824363708496, + "logits/rejected": -1.067824363708496, + "logps/chosen": -0.6070218086242676, + "logps/rejected": -0.6070218086242676, + "loss": 0.3596, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.16028976440429688, + "rewards/margins": 0.0, + "rewards/rejected": 0.16028976440429688, + "step": 11121 + }, + { + "epoch": 1.81, + "learning_rate": 2.466706115187406e-07, + "logits/chosen": -1.4138598442077637, + "logits/rejected": -1.4188833236694336, + "logps/chosen": -69.42713165283203, + "logps/rejected": -91.96638488769531, + "loss": 0.2303, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4242303371429443, + "rewards/margins": 2.0098609924316406, + "rewards/rejected": 1.4143692255020142, + "step": 11122 + }, + { + "epoch": 1.81, + "learning_rate": 2.4626307222901935e-07, + "logits/chosen": -1.622830867767334, + "logits/rejected": -1.629036545753479, + "logps/chosen": -86.01354217529297, + "logps/rejected": -81.00250244140625, + "loss": 0.4637, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.220595598220825, + "rewards/margins": -0.20826196670532227, + "rewards/rejected": 2.4288575649261475, + "step": 11123 + }, + { + "epoch": 1.81, + "learning_rate": 2.4585586137753913e-07, + "logits/chosen": -1.6484752893447876, + "logits/rejected": -1.6433136463165283, + "logps/chosen": -132.23745727539062, + "logps/rejected": -152.63632202148438, + "loss": 0.414, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.170388698577881, + "rewards/margins": 0.9289183616638184, + "rewards/rejected": 6.2414703369140625, + "step": 11124 + }, + { + "epoch": 1.81, + "learning_rate": 2.454489789924353e-07, + "logits/chosen": -1.1473617553710938, + "logits/rejected": -1.2369771003723145, + "logps/chosen": -112.65924072265625, + "logps/rejected": -117.47492980957031, + "loss": 0.8065, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.587005615234375, + "rewards/margins": 2.632788896560669, + "rewards/rejected": 0.9542167782783508, + "step": 11125 + }, + { + "epoch": 1.81, + "learning_rate": 2.4504242510181795e-07, + "logits/chosen": -1.5911372900009155, + "logits/rejected": -1.6268552541732788, + "logps/chosen": -139.264404296875, + "logps/rejected": -53.506813049316406, + "loss": 0.0249, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.482693672180176, + "rewards/margins": 3.030625343322754, + "rewards/rejected": 1.4520683288574219, + "step": 11126 + }, + { + "epoch": 1.81, + "learning_rate": 2.4463619973377816e-07, + "logits/chosen": -1.2813694477081299, + "logits/rejected": -1.2821507453918457, + "logps/chosen": -93.53472900390625, + "logps/rejected": -115.34341430664062, + "loss": 0.8308, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.473138332366943, + "rewards/margins": -1.0757508277893066, + "rewards/rejected": 5.54888916015625, + "step": 11127 + }, + { + "epoch": 1.81, + "learning_rate": 2.442303029163806e-07, + "logits/chosen": -1.2781178951263428, + "logits/rejected": -1.2305439710617065, + "logps/chosen": -65.90150451660156, + "logps/rejected": -67.24114990234375, + "loss": 2.1643, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.495948314666748, + "rewards/margins": -2.4977774620056152, + "rewards/rejected": 6.993725776672363, + "step": 11128 + }, + { + "epoch": 1.81, + "learning_rate": 2.4382473467767066e-07, + "logits/chosen": -1.3720736503601074, + "logits/rejected": -1.4042952060699463, + "logps/chosen": -62.485389709472656, + "logps/rejected": -128.18972778320312, + "loss": 2.0887, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.66355299949646, + "rewards/margins": -4.048495292663574, + "rewards/rejected": 7.712048530578613, + "step": 11129 + }, + { + "epoch": 1.81, + "learning_rate": 2.43419495045667e-07, + "logits/chosen": -1.0392972230911255, + "logits/rejected": -1.069344162940979, + "logps/chosen": -5.076859474182129, + "logps/rejected": -22.13079261779785, + "loss": 0.8087, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8543336987495422, + "rewards/margins": -0.2784042954444885, + "rewards/rejected": 1.1327379941940308, + "step": 11130 + }, + { + "epoch": 1.81, + "learning_rate": 2.4301458404837e-07, + "logits/chosen": -1.1466470956802368, + "logits/rejected": -1.161752462387085, + "logps/chosen": -2.4495959281921387, + "logps/rejected": -40.4951057434082, + "loss": 0.5843, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.46779391169548035, + "rewards/margins": -0.26129886507987976, + "rewards/rejected": 0.7290927767753601, + "step": 11131 + }, + { + "epoch": 1.81, + "learning_rate": 2.4261000171375335e-07, + "logits/chosen": -1.2806296348571777, + "logits/rejected": -1.2491475343704224, + "logps/chosen": -47.997291564941406, + "logps/rejected": -69.12443542480469, + "loss": 0.6129, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.638953447341919, + "rewards/margins": 0.5412087440490723, + "rewards/rejected": 2.0977447032928467, + "step": 11132 + }, + { + "epoch": 1.81, + "learning_rate": 2.422057480697715e-07, + "logits/chosen": -1.0503990650177002, + "logits/rejected": -1.0552425384521484, + "logps/chosen": -6.727727890014648, + "logps/rejected": -11.078981399536133, + "loss": 0.7239, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2603681683540344, + "rewards/margins": -0.6852840185165405, + "rewards/rejected": 0.945652186870575, + "step": 11133 + }, + { + "epoch": 1.81, + "learning_rate": 2.4180182314435305e-07, + "logits/chosen": -1.4641921520233154, + "logits/rejected": -1.4551101922988892, + "logps/chosen": -109.06071472167969, + "logps/rejected": -63.75617218017578, + "loss": 0.8432, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1896896362304688, + "rewards/margins": -0.5231025218963623, + "rewards/rejected": 2.712792158126831, + "step": 11134 + }, + { + "epoch": 1.81, + "learning_rate": 2.413982269654069e-07, + "logits/chosen": -1.5972546339035034, + "logits/rejected": -1.5963044166564941, + "logps/chosen": -245.11859130859375, + "logps/rejected": -52.32973861694336, + "loss": 0.0382, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.113873481750488, + "rewards/margins": 2.551525592803955, + "rewards/rejected": 2.562347888946533, + "step": 11135 + }, + { + "epoch": 1.81, + "learning_rate": 2.409949595608163e-07, + "logits/chosen": -1.43869948387146, + "logits/rejected": -1.439509391784668, + "logps/chosen": -57.45189666748047, + "logps/rejected": -90.19383239746094, + "loss": 0.4829, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3977959156036377, + "rewards/margins": -0.46325159072875977, + "rewards/rejected": 2.8610475063323975, + "step": 11136 + }, + { + "epoch": 1.81, + "learning_rate": 2.405920209584445e-07, + "logits/chosen": -1.3987228870391846, + "logits/rejected": -1.3507074117660522, + "logps/chosen": -51.46855163574219, + "logps/rejected": -58.51158905029297, + "loss": 3.8664, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9815231561660767, + "rewards/margins": -0.4704498052597046, + "rewards/rejected": 2.4519729614257812, + "step": 11137 + }, + { + "epoch": 1.81, + "learning_rate": 2.401894111861297e-07, + "logits/chosen": -1.109382152557373, + "logits/rejected": -1.1153194904327393, + "logps/chosen": -88.78756713867188, + "logps/rejected": -84.06395721435547, + "loss": 0.6224, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9249191284179688, + "rewards/margins": -0.9020211696624756, + "rewards/rejected": 3.8269402980804443, + "step": 11138 + }, + { + "epoch": 1.81, + "learning_rate": 2.3978713027168865e-07, + "logits/chosen": -1.2485098838806152, + "logits/rejected": -1.298356056213379, + "logps/chosen": -98.23307037353516, + "logps/rejected": -82.30083465576172, + "loss": 1.3132, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.619255781173706, + "rewards/margins": -2.5500991344451904, + "rewards/rejected": 6.1693549156188965, + "step": 11139 + }, + { + "epoch": 1.81, + "learning_rate": 2.393851782429157e-07, + "logits/chosen": -1.3170785903930664, + "logits/rejected": -1.337121844291687, + "logps/chosen": -35.829681396484375, + "logps/rejected": -47.839500427246094, + "loss": 2.9278, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3060059547424316, + "rewards/margins": -1.6591157913208008, + "rewards/rejected": 4.965121746063232, + "step": 11140 + }, + { + "epoch": 1.81, + "learning_rate": 2.3898355512758097e-07, + "logits/chosen": -1.0596572160720825, + "logits/rejected": -1.0676753520965576, + "logps/chosen": -67.32792663574219, + "logps/rejected": -48.05351257324219, + "loss": 0.4284, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0546157360076904, + "rewards/margins": -0.2822403907775879, + "rewards/rejected": 2.3368561267852783, + "step": 11141 + }, + { + "epoch": 1.81, + "learning_rate": 2.385822609534344e-07, + "logits/chosen": -1.3406988382339478, + "logits/rejected": -1.2393035888671875, + "logps/chosen": -64.34408569335938, + "logps/rejected": -68.08329010009766, + "loss": 0.3416, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6117141246795654, + "rewards/margins": 0.23215866088867188, + "rewards/rejected": 2.3795554637908936, + "step": 11142 + }, + { + "epoch": 1.81, + "learning_rate": 2.3818129574819992e-07, + "logits/chosen": -1.3423497676849365, + "logits/rejected": -1.3055737018585205, + "logps/chosen": -54.789642333984375, + "logps/rejected": -71.45381927490234, + "loss": 0.2477, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2487990856170654, + "rewards/margins": 0.45566022396087646, + "rewards/rejected": 1.793138861656189, + "step": 11143 + }, + { + "epoch": 1.81, + "learning_rate": 2.3778065953958207e-07, + "logits/chosen": -1.2973486185073853, + "logits/rejected": -1.2750566005706787, + "logps/chosen": -85.60277557373047, + "logps/rejected": -107.36573028564453, + "loss": 0.0985, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.1784491539001465, + "rewards/margins": 1.9044065475463867, + "rewards/rejected": 3.2740426063537598, + "step": 11144 + }, + { + "epoch": 1.81, + "learning_rate": 2.373803523552587e-07, + "logits/chosen": -0.8004600405693054, + "logits/rejected": -0.8008641004562378, + "logps/chosen": -2.3766942024230957, + "logps/rejected": -5.20703649520874, + "loss": 0.331, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3618951737880707, + "rewards/margins": 0.07652220129966736, + "rewards/rejected": 0.2853729724884033, + "step": 11145 + }, + { + "epoch": 1.81, + "learning_rate": 2.369803742228899e-07, + "logits/chosen": -1.2411950826644897, + "logits/rejected": -1.2625755071640015, + "logps/chosen": -78.98757934570312, + "logps/rejected": -49.555301666259766, + "loss": 0.4011, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.421618700027466, + "rewards/margins": 0.783128023147583, + "rewards/rejected": 2.638490676879883, + "step": 11146 + }, + { + "epoch": 1.81, + "learning_rate": 2.3658072517010854e-07, + "logits/chosen": -1.352044701576233, + "logits/rejected": -1.3093098402023315, + "logps/chosen": -40.43324661254883, + "logps/rejected": -41.016082763671875, + "loss": 0.5581, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7553261518478394, + "rewards/margins": -0.6099799871444702, + "rewards/rejected": 2.3653061389923096, + "step": 11147 + }, + { + "epoch": 1.81, + "learning_rate": 2.361814052245276e-07, + "logits/chosen": -1.3674259185791016, + "logits/rejected": -1.2272160053253174, + "logps/chosen": -83.8507308959961, + "logps/rejected": -49.988243103027344, + "loss": 0.044, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.66344690322876, + "rewards/margins": 2.5212104320526123, + "rewards/rejected": 3.1422364711761475, + "step": 11148 + }, + { + "epoch": 1.81, + "learning_rate": 2.357824144137355e-07, + "logits/chosen": -1.256811499595642, + "logits/rejected": -1.1702960729599, + "logps/chosen": -65.1071548461914, + "logps/rejected": -56.35456085205078, + "loss": 0.9409, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.363598585128784, + "rewards/margins": -1.1790351867675781, + "rewards/rejected": 3.5426337718963623, + "step": 11149 + }, + { + "epoch": 1.81, + "learning_rate": 2.3538375276529912e-07, + "logits/chosen": -1.6985814571380615, + "logits/rejected": -1.606571078300476, + "logps/chosen": -101.08860778808594, + "logps/rejected": -55.30009460449219, + "loss": 0.1559, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.33143949508667, + "rewards/margins": 3.0653810501098633, + "rewards/rejected": 2.2660584449768066, + "step": 11150 + }, + { + "epoch": 1.81, + "learning_rate": 2.3498542030676198e-07, + "logits/chosen": -1.1399587392807007, + "logits/rejected": -1.0807384252548218, + "logps/chosen": -52.73536682128906, + "logps/rejected": -40.053985595703125, + "loss": 0.3598, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3553178310394287, + "rewards/margins": 0.30861592292785645, + "rewards/rejected": 2.0467019081115723, + "step": 11151 + }, + { + "epoch": 1.81, + "learning_rate": 2.345874170656459e-07, + "logits/chosen": -1.2701332569122314, + "logits/rejected": -1.3055483102798462, + "logps/chosen": -177.25767517089844, + "logps/rejected": -70.20306396484375, + "loss": 0.0509, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.6992082595825195, + "rewards/margins": 2.298630714416504, + "rewards/rejected": 2.4005775451660156, + "step": 11152 + }, + { + "epoch": 1.81, + "learning_rate": 2.3418974306944842e-07, + "logits/chosen": -1.122595191001892, + "logits/rejected": -1.1806315183639526, + "logps/chosen": -42.713680267333984, + "logps/rejected": -95.2616958618164, + "loss": 0.623, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7896095514297485, + "rewards/margins": 0.30339357256889343, + "rewards/rejected": 0.4862159788608551, + "step": 11153 + }, + { + "epoch": 1.81, + "learning_rate": 2.3379239834564526e-07, + "logits/chosen": -1.2808024883270264, + "logits/rejected": -1.255566120147705, + "logps/chosen": -105.39981842041016, + "logps/rejected": -84.8895492553711, + "loss": 0.3787, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.4264960289001465, + "rewards/margins": -0.04047870635986328, + "rewards/rejected": 7.46697473526001, + "step": 11154 + }, + { + "epoch": 1.81, + "learning_rate": 2.333953829216884e-07, + "logits/chosen": -1.0993150472640991, + "logits/rejected": -1.086222767829895, + "logps/chosen": -60.94060516357422, + "logps/rejected": -74.60393524169922, + "loss": 0.2331, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2836296558380127, + "rewards/margins": 0.718451738357544, + "rewards/rejected": 1.5651779174804688, + "step": 11155 + }, + { + "epoch": 1.81, + "learning_rate": 2.329986968250092e-07, + "logits/chosen": -1.4008866548538208, + "logits/rejected": -1.256662368774414, + "logps/chosen": -48.40596008300781, + "logps/rejected": -66.97113037109375, + "loss": 0.6466, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8220956325531006, + "rewards/margins": -0.9723029136657715, + "rewards/rejected": 3.794398546218872, + "step": 11156 + }, + { + "epoch": 1.81, + "learning_rate": 2.3260234008301407e-07, + "logits/chosen": -1.3287402391433716, + "logits/rejected": -1.1574426889419556, + "logps/chosen": -162.6861114501953, + "logps/rejected": -30.856361389160156, + "loss": 0.7815, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7230422496795654, + "rewards/margins": 3.0341169834136963, + "rewards/rejected": 0.6889252066612244, + "step": 11157 + }, + { + "epoch": 1.81, + "learning_rate": 2.322063127230878e-07, + "logits/chosen": -1.3758565187454224, + "logits/rejected": -1.3953255414962769, + "logps/chosen": -43.576271057128906, + "logps/rejected": -60.5531005859375, + "loss": 0.6561, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0457305908203125, + "rewards/margins": 1.0131042003631592, + "rewards/rejected": 2.0326263904571533, + "step": 11158 + }, + { + "epoch": 1.81, + "learning_rate": 2.3181061477259181e-07, + "logits/chosen": -1.5398372411727905, + "logits/rejected": -1.5302435159683228, + "logps/chosen": -96.89640045166016, + "logps/rejected": -41.45212936401367, + "loss": 0.6527, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3369407653808594, + "rewards/margins": -0.8869137763977051, + "rewards/rejected": 3.2238545417785645, + "step": 11159 + }, + { + "epoch": 1.81, + "learning_rate": 2.314152462588659e-07, + "logits/chosen": -1.285056233406067, + "logits/rejected": -1.2940843105316162, + "logps/chosen": -50.2192268371582, + "logps/rejected": -101.397705078125, + "loss": 0.6738, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.069283962249756, + "rewards/margins": -0.4017181396484375, + "rewards/rejected": 5.471002101898193, + "step": 11160 + }, + { + "epoch": 1.81, + "learning_rate": 2.3102020720922492e-07, + "logits/chosen": -1.5688079595565796, + "logits/rejected": -1.5830196142196655, + "logps/chosen": -187.49220275878906, + "logps/rejected": -112.9631118774414, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.26242995262146, + "rewards/margins": 3.4084055423736572, + "rewards/rejected": -0.1459755003452301, + "step": 11161 + }, + { + "epoch": 1.81, + "learning_rate": 2.3062549765096366e-07, + "logits/chosen": -1.2321642637252808, + "logits/rejected": -1.226336121559143, + "logps/chosen": -26.353944778442383, + "logps/rejected": -27.565275192260742, + "loss": 0.2868, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2235914468765259, + "rewards/margins": 0.32816123962402344, + "rewards/rejected": 0.8954302072525024, + "step": 11162 + }, + { + "epoch": 1.81, + "learning_rate": 2.3023111761135198e-07, + "logits/chosen": -1.1852331161499023, + "logits/rejected": -1.0966699123382568, + "logps/chosen": -70.70381164550781, + "logps/rejected": -15.178205490112305, + "loss": 2.4982, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9572868347167969, + "rewards/margins": 0.45101815462112427, + "rewards/rejected": 0.5062686800956726, + "step": 11163 + }, + { + "epoch": 1.81, + "learning_rate": 2.2983706711763808e-07, + "logits/chosen": -1.1429513692855835, + "logits/rejected": -1.0669755935668945, + "logps/chosen": -58.28944396972656, + "logps/rejected": -51.04715347290039, + "loss": 0.4361, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8347702026367188, + "rewards/margins": -0.28561830520629883, + "rewards/rejected": 2.1203885078430176, + "step": 11164 + }, + { + "epoch": 1.81, + "learning_rate": 2.294433461970469e-07, + "logits/chosen": -1.5486425161361694, + "logits/rejected": -1.5667508840560913, + "logps/chosen": -152.90664672851562, + "logps/rejected": -83.52137756347656, + "loss": 0.6365, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.333978176116943, + "rewards/margins": -0.9311599731445312, + "rewards/rejected": 5.265138149261475, + "step": 11165 + }, + { + "epoch": 1.81, + "learning_rate": 2.2904995487678217e-07, + "logits/chosen": -1.256881833076477, + "logits/rejected": -1.2774500846862793, + "logps/chosen": -70.11766052246094, + "logps/rejected": -37.33310317993164, + "loss": 0.5674, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.629392385482788, + "rewards/margins": 1.9461976289749146, + "rewards/rejected": 0.6831947565078735, + "step": 11166 + }, + { + "epoch": 1.81, + "learning_rate": 2.2865689318402107e-07, + "logits/chosen": -1.590846061706543, + "logits/rejected": -1.5290684700012207, + "logps/chosen": -121.7618637084961, + "logps/rejected": -70.80540466308594, + "loss": 0.0357, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.977784156799316, + "rewards/margins": 3.0970118045806885, + "rewards/rejected": 2.880772352218628, + "step": 11167 + }, + { + "epoch": 1.81, + "learning_rate": 2.2826416114592298e-07, + "logits/chosen": -1.1949182748794556, + "logits/rejected": -1.2102954387664795, + "logps/chosen": -0.649996817111969, + "logps/rejected": -16.359169006347656, + "loss": 0.318, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19670383632183075, + "rewards/margins": 0.1318265199661255, + "rewards/rejected": 0.06487732380628586, + "step": 11168 + }, + { + "epoch": 1.81, + "learning_rate": 2.2787175878961953e-07, + "logits/chosen": -1.3715999126434326, + "logits/rejected": -1.255842924118042, + "logps/chosen": -85.03166198730469, + "logps/rejected": -87.01609802246094, + "loss": 0.0583, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.836930751800537, + "rewards/margins": 2.9421966075897217, + "rewards/rejected": 2.8947341442108154, + "step": 11169 + }, + { + "epoch": 1.81, + "learning_rate": 2.274796861422246e-07, + "logits/chosen": -1.3835766315460205, + "logits/rejected": -1.3826513290405273, + "logps/chosen": -55.11888122558594, + "logps/rejected": -45.998008728027344, + "loss": 0.265, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7656677961349487, + "rewards/margins": 0.3917893171310425, + "rewards/rejected": 1.3738784790039062, + "step": 11170 + }, + { + "epoch": 1.81, + "learning_rate": 2.270879432308243e-07, + "logits/chosen": -1.2480655908584595, + "logits/rejected": -1.1674631834030151, + "logps/chosen": -118.05358123779297, + "logps/rejected": -73.42952728271484, + "loss": 0.1459, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.169471263885498, + "rewards/margins": 1.098921537399292, + "rewards/rejected": 3.070549726486206, + "step": 11171 + }, + { + "epoch": 1.81, + "learning_rate": 2.266965300824858e-07, + "logits/chosen": -0.8740144968032837, + "logits/rejected": -0.8771423697471619, + "logps/chosen": -12.993572235107422, + "logps/rejected": -5.389892578125, + "loss": 1.327, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.06134748458862305, + "rewards/margins": -0.2202349156141281, + "rewards/rejected": 0.15888743102550507, + "step": 11172 + }, + { + "epoch": 1.81, + "learning_rate": 2.263054467242515e-07, + "logits/chosen": -1.0990407466888428, + "logits/rejected": -1.1244525909423828, + "logps/chosen": -143.97605895996094, + "logps/rejected": -142.99134826660156, + "loss": 0.0589, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.764138698577881, + "rewards/margins": 2.5279555320739746, + "rewards/rejected": 2.2361831665039062, + "step": 11173 + }, + { + "epoch": 1.81, + "learning_rate": 2.259146931831413e-07, + "logits/chosen": -1.2378281354904175, + "logits/rejected": -1.2609866857528687, + "logps/chosen": -54.302799224853516, + "logps/rejected": -64.36398315429688, + "loss": 0.2775, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.371403932571411, + "rewards/margins": 0.45388221740722656, + "rewards/rejected": 1.9175217151641846, + "step": 11174 + }, + { + "epoch": 1.81, + "learning_rate": 2.2552426948615368e-07, + "logits/chosen": -1.4968990087509155, + "logits/rejected": -1.2946293354034424, + "logps/chosen": -109.50723266601562, + "logps/rejected": -135.01345825195312, + "loss": 0.1963, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.122954368591309, + "rewards/margins": 0.7441730499267578, + "rewards/rejected": 7.378781318664551, + "step": 11175 + }, + { + "epoch": 1.81, + "learning_rate": 2.2513417566026209e-07, + "logits/chosen": -1.1698646545410156, + "logits/rejected": -1.197354793548584, + "logps/chosen": -68.39777374267578, + "logps/rejected": -119.7840576171875, + "loss": 2.0026, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.780287265777588, + "rewards/margins": -3.2749223709106445, + "rewards/rejected": 6.055209636688232, + "step": 11176 + }, + { + "epoch": 1.81, + "learning_rate": 2.2474441173241933e-07, + "logits/chosen": -1.2340542078018188, + "logits/rejected": -1.2430691719055176, + "logps/chosen": -62.35459899902344, + "logps/rejected": -44.83163070678711, + "loss": 1.8035, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4096076488494873, + "rewards/margins": -3.036616086959839, + "rewards/rejected": 5.446223735809326, + "step": 11177 + }, + { + "epoch": 1.81, + "learning_rate": 2.243549777295534e-07, + "logits/chosen": -1.4937831163406372, + "logits/rejected": -1.4863272905349731, + "logps/chosen": -82.28010559082031, + "logps/rejected": -77.91219329833984, + "loss": 2.297, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6777398586273193, + "rewards/margins": 1.693769931793213, + "rewards/rejected": 0.9839698672294617, + "step": 11178 + }, + { + "epoch": 1.81, + "learning_rate": 2.239658736785716e-07, + "logits/chosen": -1.5314970016479492, + "logits/rejected": -1.506995439529419, + "logps/chosen": -27.081462860107422, + "logps/rejected": -16.097442626953125, + "loss": 0.2598, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.226836919784546, + "rewards/margins": 1.3865132331848145, + "rewards/rejected": 0.8403236269950867, + "step": 11179 + }, + { + "epoch": 1.81, + "learning_rate": 2.2357709960635577e-07, + "logits/chosen": -1.3452321290969849, + "logits/rejected": -1.4169707298278809, + "logps/chosen": -56.04102325439453, + "logps/rejected": -109.53915405273438, + "loss": 1.3974, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.551429033279419, + "rewards/margins": -1.9943945407867432, + "rewards/rejected": 4.545823574066162, + "step": 11180 + }, + { + "epoch": 1.81, + "learning_rate": 2.2318865553976777e-07, + "logits/chosen": -1.1151865720748901, + "logits/rejected": -1.1008211374282837, + "logps/chosen": -68.86355590820312, + "logps/rejected": -66.32630920410156, + "loss": 0.9706, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2209632396698, + "rewards/margins": 0.7241507768630981, + "rewards/rejected": 1.4968124628067017, + "step": 11181 + }, + { + "epoch": 1.81, + "learning_rate": 2.2280054150564501e-07, + "logits/chosen": -1.0979028940200806, + "logits/rejected": -1.0293676853179932, + "logps/chosen": -46.53193664550781, + "logps/rejected": -44.236419677734375, + "loss": 0.4819, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8219841718673706, + "rewards/margins": -0.09498131275177002, + "rewards/rejected": 1.9169654846191406, + "step": 11182 + }, + { + "epoch": 1.82, + "learning_rate": 2.224127575308027e-07, + "logits/chosen": -1.1870757341384888, + "logits/rejected": -1.1802340745925903, + "logps/chosen": -34.126319885253906, + "logps/rejected": -55.28569030761719, + "loss": 1.1423, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.338402509689331, + "rewards/margins": -0.1132669448852539, + "rewards/rejected": 2.451669454574585, + "step": 11183 + }, + { + "epoch": 1.82, + "learning_rate": 2.2202530364203278e-07, + "logits/chosen": -1.5890419483184814, + "logits/rejected": -1.5787583589553833, + "logps/chosen": -107.41203308105469, + "logps/rejected": -177.23004150390625, + "loss": 1.0772, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.497564792633057, + "rewards/margins": -1.7845916748046875, + "rewards/rejected": 6.282156467437744, + "step": 11184 + }, + { + "epoch": 1.82, + "learning_rate": 2.2163817986610493e-07, + "logits/chosen": -1.4211241006851196, + "logits/rejected": -1.4152429103851318, + "logps/chosen": -69.82366943359375, + "logps/rejected": -99.1878433227539, + "loss": 0.0538, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.508087158203125, + "rewards/margins": 2.1813087463378906, + "rewards/rejected": 1.3267784118652344, + "step": 11185 + }, + { + "epoch": 1.82, + "learning_rate": 2.2125138622976494e-07, + "logits/chosen": -1.2365896701812744, + "logits/rejected": -1.174314260482788, + "logps/chosen": -110.49307250976562, + "logps/rejected": -25.98497200012207, + "loss": 0.0641, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.285897970199585, + "rewards/margins": 2.1762967109680176, + "rewards/rejected": 0.1096012145280838, + "step": 11186 + }, + { + "epoch": 1.82, + "learning_rate": 2.2086492275973815e-07, + "logits/chosen": -1.253633737564087, + "logits/rejected": -1.3743939399719238, + "logps/chosen": -60.97595977783203, + "logps/rejected": -82.22531127929688, + "loss": 1.8488, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8867385387420654, + "rewards/margins": -2.6077258586883545, + "rewards/rejected": 5.49446439743042, + "step": 11187 + }, + { + "epoch": 1.82, + "learning_rate": 2.2047878948272373e-07, + "logits/chosen": -1.1601085662841797, + "logits/rejected": -1.201873779296875, + "logps/chosen": -36.717262268066406, + "logps/rejected": -92.51996612548828, + "loss": 0.9062, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7975234985351562, + "rewards/margins": -0.49694371223449707, + "rewards/rejected": 2.2944672107696533, + "step": 11188 + }, + { + "epoch": 1.82, + "learning_rate": 2.20092986425402e-07, + "logits/chosen": -1.2453583478927612, + "logits/rejected": -1.198937177658081, + "logps/chosen": -88.42791748046875, + "logps/rejected": -39.55351638793945, + "loss": 0.3703, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6805694103240967, + "rewards/margins": 0.993649959564209, + "rewards/rejected": 1.6869194507598877, + "step": 11189 + }, + { + "epoch": 1.82, + "learning_rate": 2.1970751361442555e-07, + "logits/chosen": -1.1774992942810059, + "logits/rejected": -1.2134790420532227, + "logps/chosen": -62.95396423339844, + "logps/rejected": -90.62744903564453, + "loss": 1.2961, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3211121559143066, + "rewards/margins": -0.7119400501251221, + "rewards/rejected": 3.0330522060394287, + "step": 11190 + }, + { + "epoch": 1.82, + "learning_rate": 2.1932237107642917e-07, + "logits/chosen": -1.3043062686920166, + "logits/rejected": -1.3238744735717773, + "logps/chosen": -64.31995391845703, + "logps/rejected": -89.03848266601562, + "loss": 0.1741, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5655494928359985, + "rewards/margins": 0.9460930228233337, + "rewards/rejected": 0.6194564700126648, + "step": 11191 + }, + { + "epoch": 1.82, + "learning_rate": 2.1893755883802158e-07, + "logits/chosen": -1.3945811986923218, + "logits/rejected": -1.4117456674575806, + "logps/chosen": -97.25604248046875, + "logps/rejected": -105.46773529052734, + "loss": 1.4388, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.779173374176025, + "rewards/margins": -1.0218377113342285, + "rewards/rejected": 6.801011085510254, + "step": 11192 + }, + { + "epoch": 1.82, + "learning_rate": 2.1855307692578986e-07, + "logits/chosen": -1.2961996793746948, + "logits/rejected": -0.9642552733421326, + "logps/chosen": -129.0672149658203, + "logps/rejected": -27.447784423828125, + "loss": 0.569, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.349870204925537, + "rewards/margins": 1.7667837142944336, + "rewards/rejected": 3.5830864906311035, + "step": 11193 + }, + { + "epoch": 1.82, + "learning_rate": 2.1816892536629775e-07, + "logits/chosen": -1.157216191291809, + "logits/rejected": -1.151107907295227, + "logps/chosen": -48.03791427612305, + "logps/rejected": -95.02439880371094, + "loss": 0.1788, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4739291667938232, + "rewards/margins": 0.8496111631393433, + "rewards/rejected": 1.62431800365448, + "step": 11194 + }, + { + "epoch": 1.82, + "learning_rate": 2.1778510418608734e-07, + "logits/chosen": -1.3079819679260254, + "logits/rejected": -1.3079819679260254, + "logps/chosen": -52.55853271484375, + "logps/rejected": -52.55853271484375, + "loss": 1.1726, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2818870544433594, + "rewards/margins": 0.0, + "rewards/rejected": 1.2818870544433594, + "step": 11195 + }, + { + "epoch": 1.82, + "learning_rate": 2.1740161341167575e-07, + "logits/chosen": -1.1199084520339966, + "logits/rejected": -1.1129051446914673, + "logps/chosen": -54.10520935058594, + "logps/rejected": -63.212093353271484, + "loss": 0.4486, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.27812659740448, + "rewards/margins": -0.04113650321960449, + "rewards/rejected": 1.3192631006240845, + "step": 11196 + }, + { + "epoch": 1.82, + "learning_rate": 2.1701845306956017e-07, + "logits/chosen": -1.2371433973312378, + "logits/rejected": -1.462318778038025, + "logps/chosen": -89.57846069335938, + "logps/rejected": -37.25426483154297, + "loss": 0.0782, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5058982372283936, + "rewards/margins": 2.3408615589141846, + "rewards/rejected": 0.16503678262233734, + "step": 11197 + }, + { + "epoch": 1.82, + "learning_rate": 2.166356231862121e-07, + "logits/chosen": -1.6695888042449951, + "logits/rejected": -1.6698132753372192, + "logps/chosen": -99.95099639892578, + "logps/rejected": -61.486480712890625, + "loss": 0.5011, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.273366689682007, + "rewards/margins": 1.7671464681625366, + "rewards/rejected": 1.5062202215194702, + "step": 11198 + }, + { + "epoch": 1.82, + "learning_rate": 2.1625312378808217e-07, + "logits/chosen": -1.306215763092041, + "logits/rejected": -1.1794891357421875, + "logps/chosen": -58.490638732910156, + "logps/rejected": -55.53547668457031, + "loss": 0.2477, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.593408107757568, + "rewards/margins": 1.8558440208435059, + "rewards/rejected": 2.7375640869140625, + "step": 11199 + }, + { + "epoch": 1.82, + "learning_rate": 2.1587095490159638e-07, + "logits/chosen": -0.8197837471961975, + "logits/rejected": -0.8491640090942383, + "logps/chosen": -14.498929023742676, + "logps/rejected": -16.920841217041016, + "loss": 0.6174, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.25355634093284607, + "rewards/margins": -0.6336444616317749, + "rewards/rejected": 0.8872007727622986, + "step": 11200 + }, + { + "epoch": 1.82, + "learning_rate": 2.154891165531603e-07, + "logits/chosen": -1.219657063484192, + "logits/rejected": -1.1041443347930908, + "logps/chosen": -73.97814178466797, + "logps/rejected": -97.89497375488281, + "loss": 0.2038, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.045017242431641, + "rewards/margins": 0.7084312438964844, + "rewards/rejected": 3.3365859985351562, + "step": 11201 + }, + { + "epoch": 1.82, + "learning_rate": 2.1510760876915505e-07, + "logits/chosen": -1.412888765335083, + "logits/rejected": -1.416776418685913, + "logps/chosen": -74.24195861816406, + "logps/rejected": -142.97146606445312, + "loss": 0.9942, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.1573076248168945, + "rewards/margins": -0.7728147506713867, + "rewards/rejected": 4.930122375488281, + "step": 11202 + }, + { + "epoch": 1.82, + "learning_rate": 2.1472643157593843e-07, + "logits/chosen": -1.4316906929016113, + "logits/rejected": -1.3159624338150024, + "logps/chosen": -101.72775268554688, + "logps/rejected": -64.91178894042969, + "loss": 0.1044, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.86171293258667, + "rewards/margins": 3.3951380252838135, + "rewards/rejected": 2.4665749073028564, + "step": 11203 + }, + { + "epoch": 1.82, + "learning_rate": 2.1434558499984715e-07, + "logits/chosen": -1.0191519260406494, + "logits/rejected": -1.0045313835144043, + "logps/chosen": -72.2940673828125, + "logps/rejected": -33.370277404785156, + "loss": 1.6584, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2432808876037598, + "rewards/margins": -0.00900411605834961, + "rewards/rejected": 2.2522850036621094, + "step": 11204 + }, + { + "epoch": 1.82, + "learning_rate": 2.1396506906719294e-07, + "logits/chosen": -1.3769570589065552, + "logits/rejected": -1.3670501708984375, + "logps/chosen": -48.50387954711914, + "logps/rejected": -76.76244354248047, + "loss": 0.3633, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9174175262451172, + "rewards/margins": -0.03435409069061279, + "rewards/rejected": 1.95177161693573, + "step": 11205 + }, + { + "epoch": 1.82, + "learning_rate": 2.1358488380426757e-07, + "logits/chosen": -1.1474672555923462, + "logits/rejected": -1.1651904582977295, + "logps/chosen": -62.87944030761719, + "logps/rejected": -43.50503158569336, + "loss": 0.6103, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8697067499160767, + "rewards/margins": 0.3155975341796875, + "rewards/rejected": 1.5541092157363892, + "step": 11206 + }, + { + "epoch": 1.82, + "learning_rate": 2.132050292373361e-07, + "logits/chosen": -1.2607344388961792, + "logits/rejected": -1.1664255857467651, + "logps/chosen": -67.19273376464844, + "logps/rejected": -49.01780700683594, + "loss": 0.3154, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8775925636291504, + "rewards/margins": 0.287905216217041, + "rewards/rejected": 2.5896873474121094, + "step": 11207 + }, + { + "epoch": 1.82, + "learning_rate": 2.1282550539264536e-07, + "logits/chosen": -1.2535752058029175, + "logits/rejected": -1.2244935035705566, + "logps/chosen": -74.10650634765625, + "logps/rejected": -85.99609375, + "loss": 0.6525, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2544753551483154, + "rewards/margins": 0.6122016906738281, + "rewards/rejected": 2.6422736644744873, + "step": 11208 + }, + { + "epoch": 1.82, + "learning_rate": 2.1244631229641432e-07, + "logits/chosen": -1.4463945627212524, + "logits/rejected": -1.3646233081817627, + "logps/chosen": -48.96233367919922, + "logps/rejected": -29.770172119140625, + "loss": 0.5232, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.169584035873413, + "rewards/margins": 0.1319742202758789, + "rewards/rejected": 3.037609815597534, + "step": 11209 + }, + { + "epoch": 1.82, + "learning_rate": 2.1206744997484374e-07, + "logits/chosen": -2.1185498237609863, + "logits/rejected": -2.0459182262420654, + "logps/chosen": -62.59514617919922, + "logps/rejected": -162.9815673828125, + "loss": 1.8729, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.831634044647217, + "rewards/margins": -3.206545352935791, + "rewards/rejected": 8.038179397583008, + "step": 11210 + }, + { + "epoch": 1.82, + "learning_rate": 2.1168891845410766e-07, + "logits/chosen": -1.2330031394958496, + "logits/rejected": -1.1751112937927246, + "logps/chosen": -39.644859313964844, + "logps/rejected": -44.84001541137695, + "loss": 0.4186, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7384461164474487, + "rewards/margins": 0.21027803421020508, + "rewards/rejected": 1.5281680822372437, + "step": 11211 + }, + { + "epoch": 1.82, + "learning_rate": 2.113107177603607e-07, + "logits/chosen": -1.3342770338058472, + "logits/rejected": -1.1218241453170776, + "logps/chosen": -200.0167236328125, + "logps/rejected": -44.16851043701172, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.165463447570801, + "rewards/margins": 5.167640209197998, + "rewards/rejected": 1.9978233575820923, + "step": 11212 + }, + { + "epoch": 1.82, + "learning_rate": 2.1093284791973146e-07, + "logits/chosen": -1.0007095336914062, + "logits/rejected": -0.9301703572273254, + "logps/chosen": -34.576438903808594, + "logps/rejected": -48.282470703125, + "loss": 0.1823, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.044250965118408, + "rewards/margins": 1.4766510725021362, + "rewards/rejected": 0.567599892616272, + "step": 11213 + }, + { + "epoch": 1.82, + "learning_rate": 2.1055530895832897e-07, + "logits/chosen": -1.3622608184814453, + "logits/rejected": -1.5211517810821533, + "logps/chosen": -42.84978103637695, + "logps/rejected": -90.08518981933594, + "loss": 2.6248, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8309032917022705, + "rewards/margins": -5.010619163513184, + "rewards/rejected": 7.841522216796875, + "step": 11214 + }, + { + "epoch": 1.82, + "learning_rate": 2.1017810090223523e-07, + "logits/chosen": -1.1466082334518433, + "logits/rejected": -1.0397861003875732, + "logps/chosen": -47.186424255371094, + "logps/rejected": -59.84523010253906, + "loss": 0.6511, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.964943766593933, + "rewards/margins": -0.5516296625137329, + "rewards/rejected": 2.516573429107666, + "step": 11215 + }, + { + "epoch": 1.82, + "learning_rate": 2.0980122377751378e-07, + "logits/chosen": -1.5620085000991821, + "logits/rejected": -1.560540795326233, + "logps/chosen": -91.36936950683594, + "logps/rejected": -75.03773498535156, + "loss": 0.4358, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9638307094573975, + "rewards/margins": 2.251373291015625, + "rewards/rejected": 0.7124572992324829, + "step": 11216 + }, + { + "epoch": 1.82, + "learning_rate": 2.0942467761020159e-07, + "logits/chosen": -1.2710458040237427, + "logits/rejected": -1.2710458040237427, + "logps/chosen": -141.87106323242188, + "logps/rejected": -141.87106323242188, + "loss": 0.3537, + "rewards/accuracies": 0.0, + "rewards/chosen": 9.01172161102295, + "rewards/margins": 0.0, + "rewards/rejected": 9.01172161102295, + "step": 11217 + }, + { + "epoch": 1.82, + "learning_rate": 2.090484624263167e-07, + "logits/chosen": -1.1575576066970825, + "logits/rejected": -1.1148768663406372, + "logps/chosen": -65.48103332519531, + "logps/rejected": -93.06248474121094, + "loss": 0.0965, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.662184238433838, + "rewards/margins": 1.5915284156799316, + "rewards/rejected": 2.0706558227539062, + "step": 11218 + }, + { + "epoch": 1.82, + "learning_rate": 2.086725782518495e-07, + "logits/chosen": -1.0217050313949585, + "logits/rejected": -0.9145445227622986, + "logps/chosen": -95.05091857910156, + "logps/rejected": -32.19164276123047, + "loss": 0.0947, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9799225330352783, + "rewards/margins": 3.7882204055786133, + "rewards/rejected": -0.8082977533340454, + "step": 11219 + }, + { + "epoch": 1.82, + "learning_rate": 2.0829702511277194e-07, + "logits/chosen": -1.2340043783187866, + "logits/rejected": -1.260194182395935, + "logps/chosen": -62.54001998901367, + "logps/rejected": -78.62748718261719, + "loss": 0.1863, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3431828022003174, + "rewards/margins": 1.5563130378723145, + "rewards/rejected": 0.7868698239326477, + "step": 11220 + }, + { + "epoch": 1.82, + "learning_rate": 2.079218030350294e-07, + "logits/chosen": -1.301268458366394, + "logits/rejected": -1.334338903427124, + "logps/chosen": -164.14881896972656, + "logps/rejected": -118.23187255859375, + "loss": 0.1704, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.838566780090332, + "rewards/margins": 2.478315830230713, + "rewards/rejected": 2.360250949859619, + "step": 11221 + }, + { + "epoch": 1.82, + "learning_rate": 2.0754691204454835e-07, + "logits/chosen": -1.3163294792175293, + "logits/rejected": -1.2755430936813354, + "logps/chosen": -91.01324462890625, + "logps/rejected": -50.74601364135742, + "loss": 0.6578, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.153564453125, + "rewards/margins": 0.860446572303772, + "rewards/rejected": 1.293117880821228, + "step": 11222 + }, + { + "epoch": 1.82, + "learning_rate": 2.0717235216722808e-07, + "logits/chosen": -1.0362833738327026, + "logits/rejected": -0.975436806678772, + "logps/chosen": -54.18794250488281, + "logps/rejected": -40.58710479736328, + "loss": 0.8338, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1030404567718506, + "rewards/margins": 0.08503055572509766, + "rewards/rejected": 2.018009901046753, + "step": 11223 + }, + { + "epoch": 1.82, + "learning_rate": 2.06798123428949e-07, + "logits/chosen": -1.179975986480713, + "logits/rejected": -1.1542580127716064, + "logps/chosen": -92.25839233398438, + "logps/rejected": -69.95585632324219, + "loss": 0.5749, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7072465419769287, + "rewards/margins": 1.1560014486312866, + "rewards/rejected": 1.551245093345642, + "step": 11224 + }, + { + "epoch": 1.82, + "learning_rate": 2.0642422585556542e-07, + "logits/chosen": -1.3037022352218628, + "logits/rejected": -1.3189496994018555, + "logps/chosen": -64.98130798339844, + "logps/rejected": -75.69999694824219, + "loss": 0.2965, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6205756664276123, + "rewards/margins": 0.7811002731323242, + "rewards/rejected": 2.839475393295288, + "step": 11225 + }, + { + "epoch": 1.82, + "learning_rate": 2.0605065947291115e-07, + "logits/chosen": -1.6517267227172852, + "logits/rejected": -1.5960488319396973, + "logps/chosen": -65.77253723144531, + "logps/rejected": -32.242156982421875, + "loss": 0.3695, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.356776475906372, + "rewards/margins": -0.06409025192260742, + "rewards/rejected": 2.4208667278289795, + "step": 11226 + }, + { + "epoch": 1.82, + "learning_rate": 2.0567742430679438e-07, + "logits/chosen": -1.0632350444793701, + "logits/rejected": -1.1590287685394287, + "logps/chosen": -65.81022644042969, + "logps/rejected": -84.4676513671875, + "loss": 0.7113, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.600437879562378, + "rewards/margins": -0.9324405193328857, + "rewards/rejected": 4.532878398895264, + "step": 11227 + }, + { + "epoch": 1.82, + "learning_rate": 2.0530452038300453e-07, + "logits/chosen": -1.4954538345336914, + "logits/rejected": -1.4812594652175903, + "logps/chosen": -201.34361267089844, + "logps/rejected": -64.33023071289062, + "loss": 0.0489, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.5398149490356445, + "rewards/margins": 4.075760841369629, + "rewards/rejected": 3.4640541076660156, + "step": 11228 + }, + { + "epoch": 1.82, + "learning_rate": 2.0493194772730375e-07, + "logits/chosen": -1.0939747095108032, + "logits/rejected": -1.0939747095108032, + "logps/chosen": -91.83247375488281, + "logps/rejected": -91.83247375488281, + "loss": 0.5377, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.48491370677948, + "rewards/margins": 0.0, + "rewards/rejected": 1.48491370677948, + "step": 11229 + }, + { + "epoch": 1.82, + "learning_rate": 2.0455970636543364e-07, + "logits/chosen": -1.5121431350708008, + "logits/rejected": -1.3209397792816162, + "logps/chosen": -165.3402557373047, + "logps/rejected": -55.63306427001953, + "loss": 0.0446, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.127530097961426, + "rewards/margins": 5.861800670623779, + "rewards/rejected": 1.265729546546936, + "step": 11230 + }, + { + "epoch": 1.82, + "learning_rate": 2.0418779632311425e-07, + "logits/chosen": -1.4644659757614136, + "logits/rejected": -1.3716604709625244, + "logps/chosen": -101.06078338623047, + "logps/rejected": -68.35643005371094, + "loss": 0.9068, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.330333709716797, + "rewards/margins": 5.242661476135254, + "rewards/rejected": 2.087672472000122, + "step": 11231 + }, + { + "epoch": 1.82, + "learning_rate": 2.0381621762603832e-07, + "logits/chosen": -1.5434656143188477, + "logits/rejected": -1.3935573101043701, + "logps/chosen": -104.21916198730469, + "logps/rejected": -106.58146667480469, + "loss": 0.0776, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.801011562347412, + "rewards/margins": 3.439486503601074, + "rewards/rejected": 3.361525058746338, + "step": 11232 + }, + { + "epoch": 1.82, + "learning_rate": 2.0344497029988087e-07, + "logits/chosen": -1.4222711324691772, + "logits/rejected": -1.2817659378051758, + "logps/chosen": -54.45801544189453, + "logps/rejected": -57.50334930419922, + "loss": 0.4046, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.363852024078369, + "rewards/margins": -0.15192341804504395, + "rewards/rejected": 2.515775442123413, + "step": 11233 + }, + { + "epoch": 1.82, + "learning_rate": 2.0307405437029027e-07, + "logits/chosen": -1.060044765472412, + "logits/rejected": -1.0685399770736694, + "logps/chosen": -70.59783935546875, + "logps/rejected": -43.123626708984375, + "loss": 0.18, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2024574279785156, + "rewards/margins": 0.8555450439453125, + "rewards/rejected": 1.3469123840332031, + "step": 11234 + }, + { + "epoch": 1.82, + "learning_rate": 2.027034698628938e-07, + "logits/chosen": -1.320054531097412, + "logits/rejected": -1.3329805135726929, + "logps/chosen": -34.93938064575195, + "logps/rejected": -41.3978271484375, + "loss": 0.2728, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.478520154953003, + "rewards/margins": 1.5601661205291748, + "rewards/rejected": 0.9183540344238281, + "step": 11235 + }, + { + "epoch": 1.82, + "learning_rate": 2.0233321680329432e-07, + "logits/chosen": -0.9990962147712708, + "logits/rejected": -0.9123276472091675, + "logps/chosen": -62.2602653503418, + "logps/rejected": -41.09104537963867, + "loss": 0.1923, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5346333980560303, + "rewards/margins": 1.8798706531524658, + "rewards/rejected": 0.6547626852989197, + "step": 11236 + }, + { + "epoch": 1.82, + "learning_rate": 2.019632952170747e-07, + "logits/chosen": -1.2674508094787598, + "logits/rejected": -1.1602989435195923, + "logps/chosen": -52.842384338378906, + "logps/rejected": -15.744216918945312, + "loss": 0.4458, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4011597633361816, + "rewards/margins": 3.1060791015625, + "rewards/rejected": 0.2950807511806488, + "step": 11237 + }, + { + "epoch": 1.82, + "learning_rate": 2.0159370512979116e-07, + "logits/chosen": -1.074328064918518, + "logits/rejected": -1.061310887336731, + "logps/chosen": -94.4949722290039, + "logps/rejected": -53.937232971191406, + "loss": 0.774, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.259633779525757, + "rewards/margins": -1.1073622703552246, + "rewards/rejected": 3.3669960498809814, + "step": 11238 + }, + { + "epoch": 1.82, + "learning_rate": 2.0122444656697993e-07, + "logits/chosen": -1.171437382698059, + "logits/rejected": -1.1208839416503906, + "logps/chosen": -55.049659729003906, + "logps/rejected": -54.39179229736328, + "loss": 0.1934, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8467705249786377, + "rewards/margins": 0.7636489868164062, + "rewards/rejected": 2.0831215381622314, + "step": 11239 + }, + { + "epoch": 1.82, + "learning_rate": 2.0085551955415284e-07, + "logits/chosen": -1.6343498229980469, + "logits/rejected": -1.3677517175674438, + "logps/chosen": -60.151187896728516, + "logps/rejected": -28.627260208129883, + "loss": 1.5, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.14754056930542, + "rewards/margins": 2.412370204925537, + "rewards/rejected": 1.7351702451705933, + "step": 11240 + }, + { + "epoch": 1.82, + "learning_rate": 2.004869241167995e-07, + "logits/chosen": -1.3013455867767334, + "logits/rejected": -1.2260940074920654, + "logps/chosen": -30.88390350341797, + "logps/rejected": -51.3571662902832, + "loss": 0.5311, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8392670154571533, + "rewards/margins": 0.05307281017303467, + "rewards/rejected": 1.7861942052841187, + "step": 11241 + }, + { + "epoch": 1.82, + "learning_rate": 2.0011866028038617e-07, + "logits/chosen": -1.432621955871582, + "logits/rejected": -1.3754571676254272, + "logps/chosen": -73.63787841796875, + "logps/rejected": -88.5677490234375, + "loss": 0.2842, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0988152027130127, + "rewards/margins": 0.3278083801269531, + "rewards/rejected": 2.7710068225860596, + "step": 11242 + }, + { + "epoch": 1.82, + "learning_rate": 1.99750728070357e-07, + "logits/chosen": -1.2042365074157715, + "logits/rejected": -1.2042365074157715, + "logps/chosen": -19.540523529052734, + "logps/rejected": -19.540523529052734, + "loss": 0.3482, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.92144775390625, + "rewards/margins": 0.0, + "rewards/rejected": 1.92144775390625, + "step": 11243 + }, + { + "epoch": 1.83, + "learning_rate": 1.9938312751213162e-07, + "logits/chosen": -1.4478809833526611, + "logits/rejected": -1.3182212114334106, + "logps/chosen": -88.4627914428711, + "logps/rejected": -12.762109756469727, + "loss": 0.0501, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0922324657440186, + "rewards/margins": 2.5503623485565186, + "rewards/rejected": 0.5418701171875, + "step": 11244 + }, + { + "epoch": 1.83, + "learning_rate": 1.990158586311086e-07, + "logits/chosen": -0.8355827331542969, + "logits/rejected": -0.8355827331542969, + "logps/chosen": -60.88386154174805, + "logps/rejected": -60.88386154174805, + "loss": 0.3641, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.088837146759033, + "rewards/margins": 0.0, + "rewards/rejected": 3.088837146759033, + "step": 11245 + }, + { + "epoch": 1.83, + "learning_rate": 1.9864892145266214e-07, + "logits/chosen": -1.2938576936721802, + "logits/rejected": -1.327346682548523, + "logps/chosen": -53.0578727722168, + "logps/rejected": -48.95158767700195, + "loss": 0.9798, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6934545040130615, + "rewards/margins": -0.9729361534118652, + "rewards/rejected": 3.6663906574249268, + "step": 11246 + }, + { + "epoch": 1.83, + "learning_rate": 1.9828231600214464e-07, + "logits/chosen": -1.4503436088562012, + "logits/rejected": -1.277396321296692, + "logps/chosen": -110.70816040039062, + "logps/rejected": -66.48978424072266, + "loss": 0.0606, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.836772441864014, + "rewards/margins": 2.198211193084717, + "rewards/rejected": 3.638561248779297, + "step": 11247 + }, + { + "epoch": 1.83, + "learning_rate": 1.9791604230488427e-07, + "logits/chosen": -1.2611645460128784, + "logits/rejected": -1.2048767805099487, + "logps/chosen": -37.98497772216797, + "logps/rejected": -50.424137115478516, + "loss": 0.7694, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6293258666992188, + "rewards/margins": -0.571526050567627, + "rewards/rejected": 2.2008519172668457, + "step": 11248 + }, + { + "epoch": 1.83, + "learning_rate": 1.9755010038618848e-07, + "logits/chosen": -1.070900321006775, + "logits/rejected": -1.1438322067260742, + "logps/chosen": -53.616455078125, + "logps/rejected": -71.74982452392578, + "loss": 0.6972, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3534225225448608, + "rewards/margins": -0.8245292901992798, + "rewards/rejected": 2.1779518127441406, + "step": 11249 + }, + { + "epoch": 1.83, + "learning_rate": 1.971844902713388e-07, + "logits/chosen": -1.1879481077194214, + "logits/rejected": -1.2094029188156128, + "logps/chosen": -106.41525268554688, + "logps/rejected": -83.2054443359375, + "loss": 0.5205, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4727630615234375, + "rewards/margins": -0.6011865139007568, + "rewards/rejected": 3.0739495754241943, + "step": 11250 + }, + { + "epoch": 1.83, + "learning_rate": 1.9681921198559717e-07, + "logits/chosen": -1.4135485887527466, + "logits/rejected": -1.4061717987060547, + "logps/chosen": -61.953590393066406, + "logps/rejected": -41.89237594604492, + "loss": 0.4076, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2195184230804443, + "rewards/margins": 0.4591672420501709, + "rewards/rejected": 2.7603511810302734, + "step": 11251 + }, + { + "epoch": 1.83, + "learning_rate": 1.96454265554199e-07, + "logits/chosen": -1.2914215326309204, + "logits/rejected": -1.2636570930480957, + "logps/chosen": -67.84823608398438, + "logps/rejected": -116.95001220703125, + "loss": 0.0352, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.820385694503784, + "rewards/margins": 2.671826124191284, + "rewards/rejected": 1.1485595703125, + "step": 11252 + }, + { + "epoch": 1.83, + "learning_rate": 1.9608965100235966e-07, + "logits/chosen": -1.4595495462417603, + "logits/rejected": -1.45730721950531, + "logps/chosen": -51.30108642578125, + "logps/rejected": -70.78910064697266, + "loss": 0.8125, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2719132900238037, + "rewards/margins": 0.8537248373031616, + "rewards/rejected": 1.418188452720642, + "step": 11253 + }, + { + "epoch": 1.83, + "learning_rate": 1.9572536835527013e-07, + "logits/chosen": -1.528114676475525, + "logits/rejected": -1.4968746900558472, + "logps/chosen": -37.690887451171875, + "logps/rejected": -88.67813110351562, + "loss": 0.8073, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.475743055343628, + "rewards/margins": -0.36746692657470703, + "rewards/rejected": 2.843209981918335, + "step": 11254 + }, + { + "epoch": 1.83, + "learning_rate": 1.9536141763810023e-07, + "logits/chosen": -0.8437018990516663, + "logits/rejected": -0.8233986496925354, + "logps/chosen": -12.719562530517578, + "logps/rejected": -2.2379586696624756, + "loss": 0.2816, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6532427072525024, + "rewards/margins": 0.2833099365234375, + "rewards/rejected": 0.36993277072906494, + "step": 11255 + }, + { + "epoch": 1.83, + "learning_rate": 1.9499779887599323e-07, + "logits/chosen": -1.4190435409545898, + "logits/rejected": -1.400004267692566, + "logps/chosen": -60.52690124511719, + "logps/rejected": -62.85913848876953, + "loss": 0.7169, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0513687133789062, + "rewards/margins": -1.1034431457519531, + "rewards/rejected": 3.1548118591308594, + "step": 11256 + }, + { + "epoch": 1.83, + "learning_rate": 1.9463451209407402e-07, + "logits/chosen": -1.2779600620269775, + "logits/rejected": -1.1921029090881348, + "logps/chosen": -77.9640121459961, + "logps/rejected": -70.64454650878906, + "loss": 0.1863, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.867075443267822, + "rewards/margins": 2.6106557846069336, + "rewards/rejected": 4.256419658660889, + "step": 11257 + }, + { + "epoch": 1.83, + "learning_rate": 1.942715573174403e-07, + "logits/chosen": -1.4318615198135376, + "logits/rejected": -1.4360244274139404, + "logps/chosen": -74.12628173828125, + "logps/rejected": -56.85362243652344, + "loss": 0.0618, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.876263618469238, + "rewards/margins": 2.121904134750366, + "rewards/rejected": 2.754359483718872, + "step": 11258 + }, + { + "epoch": 1.83, + "learning_rate": 1.9390893457117032e-07, + "logits/chosen": -1.8490127325057983, + "logits/rejected": -1.7543575763702393, + "logps/chosen": -145.46530151367188, + "logps/rejected": -22.010257720947266, + "loss": 0.1857, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7984619140625, + "rewards/margins": 0.9235172271728516, + "rewards/rejected": 1.8749446868896484, + "step": 11259 + }, + { + "epoch": 1.83, + "learning_rate": 1.9354664388031685e-07, + "logits/chosen": -1.3323043584823608, + "logits/rejected": -1.2472805976867676, + "logps/chosen": -168.55325317382812, + "logps/rejected": -31.224456787109375, + "loss": 0.1138, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.463201999664307, + "rewards/margins": 6.137418270111084, + "rewards/rejected": 0.32578393816947937, + "step": 11260 + }, + { + "epoch": 1.83, + "learning_rate": 1.9318468526991207e-07, + "logits/chosen": -1.0882095098495483, + "logits/rejected": -1.204904556274414, + "logps/chosen": -40.78962707519531, + "logps/rejected": -59.44834899902344, + "loss": 0.966, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7443809509277344, + "rewards/margins": -1.7530527114868164, + "rewards/rejected": 4.497433662414551, + "step": 11261 + }, + { + "epoch": 1.83, + "learning_rate": 1.928230587649621e-07, + "logits/chosen": -1.512271523475647, + "logits/rejected": -1.5115845203399658, + "logps/chosen": -37.133262634277344, + "logps/rejected": -65.72552490234375, + "loss": 0.2453, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.334307074546814, + "rewards/margins": 0.4725760817527771, + "rewards/rejected": 0.8617309927940369, + "step": 11262 + }, + { + "epoch": 1.83, + "learning_rate": 1.924617643904536e-07, + "logits/chosen": -1.3709596395492554, + "logits/rejected": -1.424420714378357, + "logps/chosen": -182.54930114746094, + "logps/rejected": -104.13130187988281, + "loss": 0.1629, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.015944480895996, + "rewards/margins": 1.491147518157959, + "rewards/rejected": 6.524796962738037, + "step": 11263 + }, + { + "epoch": 1.83, + "learning_rate": 1.921008021713472e-07, + "logits/chosen": -0.8934454321861267, + "logits/rejected": -0.8887712955474854, + "logps/chosen": -38.87767028808594, + "logps/rejected": -65.22103118896484, + "loss": 0.6723, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7237838506698608, + "rewards/margins": -1.0251084566116333, + "rewards/rejected": 2.748892307281494, + "step": 11264 + }, + { + "epoch": 1.83, + "learning_rate": 1.917401721325829e-07, + "logits/chosen": -1.126121997833252, + "logits/rejected": -1.0350698232650757, + "logps/chosen": -42.48017883300781, + "logps/rejected": -68.72445678710938, + "loss": 1.2864, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4774062633514404, + "rewards/margins": -1.3432579040527344, + "rewards/rejected": 3.820664167404175, + "step": 11265 + }, + { + "epoch": 1.83, + "learning_rate": 1.9137987429907635e-07, + "logits/chosen": -1.6730362176895142, + "logits/rejected": -1.6730362176895142, + "logps/chosen": -122.77890014648438, + "logps/rejected": -122.77890014648438, + "loss": 0.3474, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.041943550109863, + "rewards/margins": 0.0, + "rewards/rejected": 7.041943550109863, + "step": 11266 + }, + { + "epoch": 1.83, + "learning_rate": 1.9101990869572095e-07, + "logits/chosen": -1.1519315242767334, + "logits/rejected": -1.0947610139846802, + "logps/chosen": -24.379261016845703, + "logps/rejected": -41.16461944580078, + "loss": 0.7311, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.815542221069336, + "rewards/margins": 0.14126849174499512, + "rewards/rejected": 1.6742737293243408, + "step": 11267 + }, + { + "epoch": 1.83, + "learning_rate": 1.9066027534738685e-07, + "logits/chosen": -1.1883069276809692, + "logits/rejected": -1.1610848903656006, + "logps/chosen": -110.88223266601562, + "logps/rejected": -94.6988754272461, + "loss": 0.2705, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7069519758224487, + "rewards/margins": 0.5530083179473877, + "rewards/rejected": 1.153943657875061, + "step": 11268 + }, + { + "epoch": 1.83, + "learning_rate": 1.9030097427892135e-07, + "logits/chosen": -1.5793107748031616, + "logits/rejected": -1.6845380067825317, + "logps/chosen": -86.97047424316406, + "logps/rejected": -35.44646072387695, + "loss": 0.0452, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7867798805236816, + "rewards/margins": 2.377779960632324, + "rewards/rejected": 0.4090000092983246, + "step": 11269 + }, + { + "epoch": 1.83, + "learning_rate": 1.8994200551514907e-07, + "logits/chosen": -1.1483744382858276, + "logits/rejected": -1.1793168783187866, + "logps/chosen": -50.06194305419922, + "logps/rejected": -133.99818420410156, + "loss": 0.3492, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9069740772247314, + "rewards/margins": 1.3023369312286377, + "rewards/rejected": 1.6046371459960938, + "step": 11270 + }, + { + "epoch": 1.83, + "learning_rate": 1.8958336908087015e-07, + "logits/chosen": -0.7758303880691528, + "logits/rejected": -0.7758303880691528, + "logps/chosen": -15.77455997467041, + "logps/rejected": -15.77455997467041, + "loss": 0.68, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.7601349949836731, + "rewards/margins": 0.0, + "rewards/rejected": 0.7601349949836731, + "step": 11271 + }, + { + "epoch": 1.83, + "learning_rate": 1.892250650008648e-07, + "logits/chosen": -1.4027022123336792, + "logits/rejected": -1.4463684558868408, + "logps/chosen": -58.567588806152344, + "logps/rejected": -97.33914947509766, + "loss": 0.4869, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2461297512054443, + "rewards/margins": -0.4080810546875, + "rewards/rejected": 3.6542108058929443, + "step": 11272 + }, + { + "epoch": 1.83, + "learning_rate": 1.8886709329988705e-07, + "logits/chosen": -1.2980941534042358, + "logits/rejected": -1.2972652912139893, + "logps/chosen": -52.021728515625, + "logps/rejected": -63.29884719848633, + "loss": 0.9963, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.037733554840088, + "rewards/margins": -0.6283571720123291, + "rewards/rejected": 2.666090726852417, + "step": 11273 + }, + { + "epoch": 1.83, + "learning_rate": 1.8850945400266994e-07, + "logits/chosen": -1.103004813194275, + "logits/rejected": -1.1280649900436401, + "logps/chosen": -75.6615982055664, + "logps/rejected": -81.5311279296875, + "loss": 1.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.157952070236206, + "rewards/margins": 0.28284525871276855, + "rewards/rejected": 2.8751068115234375, + "step": 11274 + }, + { + "epoch": 1.83, + "learning_rate": 1.881521471339226e-07, + "logits/chosen": -1.3530853986740112, + "logits/rejected": -1.3882296085357666, + "logps/chosen": -44.82893371582031, + "logps/rejected": -69.71018981933594, + "loss": 0.4685, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4026718139648438, + "rewards/margins": 1.0696128606796265, + "rewards/rejected": 1.3330589532852173, + "step": 11275 + }, + { + "epoch": 1.83, + "learning_rate": 1.8779517271833247e-07, + "logits/chosen": -1.1965221166610718, + "logits/rejected": -1.2366693019866943, + "logps/chosen": -54.87509536743164, + "logps/rejected": -73.72321319580078, + "loss": 0.6664, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.298008441925049, + "rewards/margins": -0.5540995597839355, + "rewards/rejected": 2.8521080017089844, + "step": 11276 + }, + { + "epoch": 1.83, + "learning_rate": 1.874385307805615e-07, + "logits/chosen": -1.2422657012939453, + "logits/rejected": -1.2899693250656128, + "logps/chosen": -86.01748657226562, + "logps/rejected": -96.39173889160156, + "loss": 0.6067, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7876732349395752, + "rewards/margins": -0.858605146408081, + "rewards/rejected": 2.6462783813476562, + "step": 11277 + }, + { + "epoch": 1.83, + "learning_rate": 1.8708222134525168e-07, + "logits/chosen": -1.2244857549667358, + "logits/rejected": -1.2244857549667358, + "logps/chosen": -32.41608810424805, + "logps/rejected": -32.41608810424805, + "loss": 0.374, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3072056770324707, + "rewards/margins": 0.0, + "rewards/rejected": 3.3072056770324707, + "step": 11278 + }, + { + "epoch": 1.83, + "learning_rate": 1.8672624443701936e-07, + "logits/chosen": -1.2764496803283691, + "logits/rejected": -1.2354564666748047, + "logps/chosen": -63.226470947265625, + "logps/rejected": -71.80551147460938, + "loss": 0.4177, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2671563625335693, + "rewards/margins": -0.20129013061523438, + "rewards/rejected": 2.4684464931488037, + "step": 11279 + }, + { + "epoch": 1.83, + "learning_rate": 1.8637060008046104e-07, + "logits/chosen": -1.519095540046692, + "logits/rejected": -1.478902816772461, + "logps/chosen": -95.64306640625, + "logps/rejected": -74.83341979980469, + "loss": 1.7354, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.335803270339966, + "rewards/margins": -1.8203437328338623, + "rewards/rejected": 5.156147003173828, + "step": 11280 + }, + { + "epoch": 1.83, + "learning_rate": 1.8601528830014647e-07, + "logits/chosen": -0.6961720585823059, + "logits/rejected": -0.7671166658401489, + "logps/chosen": -55.68777084350586, + "logps/rejected": -42.16673278808594, + "loss": 0.4954, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1342365741729736, + "rewards/margins": -0.29142332077026367, + "rewards/rejected": 2.4256598949432373, + "step": 11281 + }, + { + "epoch": 1.83, + "learning_rate": 1.856603091206255e-07, + "logits/chosen": -1.5311859846115112, + "logits/rejected": -1.441432237625122, + "logps/chosen": -68.8237075805664, + "logps/rejected": -29.776823043823242, + "loss": 0.0949, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.972486138343811, + "rewards/margins": 1.6924993991851807, + "rewards/rejected": 0.27998676896095276, + "step": 11282 + }, + { + "epoch": 1.83, + "learning_rate": 1.8530566256642345e-07, + "logits/chosen": -1.4614241123199463, + "logits/rejected": -1.5519063472747803, + "logps/chosen": -64.87689208984375, + "logps/rejected": -131.43948364257812, + "loss": 2.0845, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0291764736175537, + "rewards/margins": -3.8740861415863037, + "rewards/rejected": 6.903262615203857, + "step": 11283 + }, + { + "epoch": 1.83, + "learning_rate": 1.84951348662043e-07, + "logits/chosen": -1.3725199699401855, + "logits/rejected": -1.2225017547607422, + "logps/chosen": -78.82845306396484, + "logps/rejected": -26.20013999938965, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.70258092880249, + "rewards/margins": 3.9426636695861816, + "rewards/rejected": 0.7599172592163086, + "step": 11284 + }, + { + "epoch": 1.83, + "learning_rate": 1.84597367431964e-07, + "logits/chosen": -1.030063509941101, + "logits/rejected": -1.0756967067718506, + "logps/chosen": -71.56192016601562, + "logps/rejected": -112.5922622680664, + "loss": 0.3042, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.16912841796875, + "rewards/margins": 0.5705360174179077, + "rewards/rejected": 1.5985924005508423, + "step": 11285 + }, + { + "epoch": 1.83, + "learning_rate": 1.8424371890064296e-07, + "logits/chosen": -1.318419337272644, + "logits/rejected": -1.3057317733764648, + "logps/chosen": -59.62782669067383, + "logps/rejected": -33.814414978027344, + "loss": 0.5162, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8511264324188232, + "rewards/margins": -0.11124992370605469, + "rewards/rejected": 2.962376356124878, + "step": 11286 + }, + { + "epoch": 1.83, + "learning_rate": 1.8389040309251373e-07, + "logits/chosen": -1.207666277885437, + "logits/rejected": -1.159503698348999, + "logps/chosen": -73.03032684326172, + "logps/rejected": -50.33154296875, + "loss": 0.2733, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7709710597991943, + "rewards/margins": 0.3392372131347656, + "rewards/rejected": 2.4317338466644287, + "step": 11287 + }, + { + "epoch": 1.83, + "learning_rate": 1.8353742003198783e-07, + "logits/chosen": -1.2718758583068848, + "logits/rejected": -1.2165802717208862, + "logps/chosen": -75.91136932373047, + "logps/rejected": -26.4512882232666, + "loss": 0.1386, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8101296424865723, + "rewards/margins": 1.1451002359390259, + "rewards/rejected": 1.6650294065475464, + "step": 11288 + }, + { + "epoch": 1.83, + "learning_rate": 1.831847697434519e-07, + "logits/chosen": -0.7676711678504944, + "logits/rejected": -0.7641549706459045, + "logps/chosen": -8.062024116516113, + "logps/rejected": -17.97128677368164, + "loss": 0.305, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5681089758872986, + "rewards/margins": 0.1888096034526825, + "rewards/rejected": 0.3792993724346161, + "step": 11289 + }, + { + "epoch": 1.83, + "learning_rate": 1.8283245225127145e-07, + "logits/chosen": -1.2751667499542236, + "logits/rejected": -1.2505311965942383, + "logps/chosen": -48.650638580322266, + "logps/rejected": -37.281524658203125, + "loss": 0.3711, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1891353130340576, + "rewards/margins": 0.04912829399108887, + "rewards/rejected": 2.1400070190429688, + "step": 11290 + }, + { + "epoch": 1.83, + "learning_rate": 1.8248046757978754e-07, + "logits/chosen": -1.242162823677063, + "logits/rejected": -1.201589584350586, + "logps/chosen": -68.12872314453125, + "logps/rejected": -65.74937438964844, + "loss": 3.7395, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1744384765625, + "rewards/margins": -1.1961069107055664, + "rewards/rejected": 4.370545387268066, + "step": 11291 + }, + { + "epoch": 1.83, + "learning_rate": 1.821288157533202e-07, + "logits/chosen": -1.017531156539917, + "logits/rejected": -1.017531156539917, + "logps/chosen": -44.145206451416016, + "logps/rejected": -44.145206451416016, + "loss": 0.3657, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5140732526779175, + "rewards/margins": 0.0, + "rewards/rejected": 1.5140732526779175, + "step": 11292 + }, + { + "epoch": 1.83, + "learning_rate": 1.817774967961644e-07, + "logits/chosen": -1.1836775541305542, + "logits/rejected": -1.074639916419983, + "logps/chosen": -94.78028869628906, + "logps/rejected": -69.873291015625, + "loss": 0.631, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.630729675292969, + "rewards/margins": 0.46120452880859375, + "rewards/rejected": 5.169525146484375, + "step": 11293 + }, + { + "epoch": 1.83, + "learning_rate": 1.81426510732593e-07, + "logits/chosen": -1.5474761724472046, + "logits/rejected": -1.554262399673462, + "logps/chosen": -116.76432800292969, + "logps/rejected": -71.6365966796875, + "loss": 0.1656, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0780091285705566, + "rewards/margins": 1.0322768688201904, + "rewards/rejected": 2.045732259750366, + "step": 11294 + }, + { + "epoch": 1.83, + "learning_rate": 1.8107585758685596e-07, + "logits/chosen": -1.0851134061813354, + "logits/rejected": -1.0639183521270752, + "logps/chosen": -84.30855560302734, + "logps/rejected": -78.36146545410156, + "loss": 0.3377, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1718711853027344, + "rewards/margins": 0.7026350498199463, + "rewards/rejected": 2.469236135482788, + "step": 11295 + }, + { + "epoch": 1.83, + "learning_rate": 1.8072553738318012e-07, + "logits/chosen": -0.9861071705818176, + "logits/rejected": -0.9861071705818176, + "logps/chosen": -48.81110382080078, + "logps/rejected": -48.81110382080078, + "loss": 1.1516, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.056359052658081, + "rewards/margins": 0.0, + "rewards/rejected": 2.056359052658081, + "step": 11296 + }, + { + "epoch": 1.83, + "learning_rate": 1.8037555014576935e-07, + "logits/chosen": -1.0092099905014038, + "logits/rejected": -1.0184444189071655, + "logps/chosen": -9.879314422607422, + "logps/rejected": -2.2096543312072754, + "loss": 0.5024, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.06248750910162926, + "rewards/margins": -0.3406935930252075, + "rewards/rejected": 0.27820608019828796, + "step": 11297 + }, + { + "epoch": 1.83, + "learning_rate": 1.8002589589880436e-07, + "logits/chosen": -1.150458574295044, + "logits/rejected": -1.1359144449234009, + "logps/chosen": -94.7895736694336, + "logps/rejected": -58.097694396972656, + "loss": 0.9125, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.24436569213867188, + "rewards/margins": -0.9109283685684204, + "rewards/rejected": 1.1552940607070923, + "step": 11298 + }, + { + "epoch": 1.83, + "learning_rate": 1.796765746664425e-07, + "logits/chosen": -0.7952221632003784, + "logits/rejected": -0.7896835803985596, + "logps/chosen": -2.6449692249298096, + "logps/rejected": -12.1349515914917, + "loss": 0.4217, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.48195067048072815, + "rewards/margins": -0.2677862346172333, + "rewards/rejected": 0.7497369050979614, + "step": 11299 + }, + { + "epoch": 1.83, + "learning_rate": 1.7932758647281944e-07, + "logits/chosen": -1.2593578100204468, + "logits/rejected": -1.2593578100204468, + "logps/chosen": -52.319244384765625, + "logps/rejected": -52.319244384765625, + "loss": 0.6073, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3540725708007812, + "rewards/margins": 0.0, + "rewards/rejected": 2.3540725708007812, + "step": 11300 + }, + { + "epoch": 1.83, + "learning_rate": 1.7897893134204648e-07, + "logits/chosen": -1.0387258529663086, + "logits/rejected": -1.1258474588394165, + "logps/chosen": -98.86874389648438, + "logps/rejected": -127.19338989257812, + "loss": 2.1914, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.13157057762146, + "rewards/margins": -1.3390729427337646, + "rewards/rejected": 4.470643520355225, + "step": 11301 + }, + { + "epoch": 1.83, + "learning_rate": 1.7863060929821208e-07, + "logits/chosen": -1.4780513048171997, + "logits/rejected": -1.2237796783447266, + "logps/chosen": -132.41607666015625, + "logps/rejected": -57.03462219238281, + "loss": 0.9825, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.414987087249756, + "rewards/margins": 2.286259174346924, + "rewards/rejected": 4.128727912902832, + "step": 11302 + }, + { + "epoch": 1.83, + "learning_rate": 1.7828262036538257e-07, + "logits/chosen": -0.9749998450279236, + "logits/rejected": -1.0528091192245483, + "logps/chosen": -53.682762145996094, + "logps/rejected": -63.969703674316406, + "loss": 0.4502, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7780144214630127, + "rewards/margins": 0.8423118591308594, + "rewards/rejected": 1.9357025623321533, + "step": 11303 + }, + { + "epoch": 1.83, + "learning_rate": 1.779349645676004e-07, + "logits/chosen": -1.2378549575805664, + "logits/rejected": -1.260127305984497, + "logps/chosen": -203.05938720703125, + "logps/rejected": -239.12115478515625, + "loss": 0.4387, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.142875671386719, + "rewards/margins": 2.3088912963867188, + "rewards/rejected": 2.833984375, + "step": 11304 + }, + { + "epoch": 1.83, + "learning_rate": 1.7758764192888577e-07, + "logits/chosen": -1.2492057085037231, + "logits/rejected": -1.1270486116409302, + "logps/chosen": -121.41934204101562, + "logps/rejected": -62.26657485961914, + "loss": 0.1021, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.063397407531738, + "rewards/margins": 4.8275346755981445, + "rewards/rejected": 2.2358624935150146, + "step": 11305 + }, + { + "epoch": 1.84, + "learning_rate": 1.7724065247323453e-07, + "logits/chosen": -1.2227327823638916, + "logits/rejected": -1.2557097673416138, + "logps/chosen": -50.94282531738281, + "logps/rejected": -42.74955368041992, + "loss": 0.879, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0597450733184814, + "rewards/margins": -0.8453166484832764, + "rewards/rejected": 3.905061721801758, + "step": 11306 + }, + { + "epoch": 1.84, + "learning_rate": 1.7689399622462134e-07, + "logits/chosen": -1.140449047088623, + "logits/rejected": -1.1436249017715454, + "logps/chosen": -168.3313751220703, + "logps/rejected": -79.39641571044922, + "loss": 0.2724, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.6722564697265625, + "rewards/margins": 0.4766874313354492, + "rewards/rejected": 4.195569038391113, + "step": 11307 + }, + { + "epoch": 1.84, + "learning_rate": 1.7654767320699596e-07, + "logits/chosen": -1.1869298219680786, + "logits/rejected": -1.2872843742370605, + "logps/chosen": -230.3559112548828, + "logps/rejected": -77.94161987304688, + "loss": 0.335, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.349174499511719, + "rewards/margins": 3.5530853271484375, + "rewards/rejected": 3.7960891723632812, + "step": 11308 + }, + { + "epoch": 1.84, + "learning_rate": 1.7620168344428757e-07, + "logits/chosen": -1.6044225692749023, + "logits/rejected": -1.5763808488845825, + "logps/chosen": -83.25054931640625, + "logps/rejected": -58.88178253173828, + "loss": 0.1846, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4627296924591064, + "rewards/margins": 0.8089470863342285, + "rewards/rejected": 2.653782606124878, + "step": 11309 + }, + { + "epoch": 1.84, + "learning_rate": 1.7585602696039817e-07, + "logits/chosen": -0.9461817145347595, + "logits/rejected": -0.9461817145347595, + "logps/chosen": -20.052780151367188, + "logps/rejected": -20.052780151367188, + "loss": 0.3542, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.432752251625061, + "rewards/margins": 0.0, + "rewards/rejected": 1.432752251625061, + "step": 11310 + }, + { + "epoch": 1.84, + "learning_rate": 1.7551070377921197e-07, + "logits/chosen": -1.3468793630599976, + "logits/rejected": -1.4503490924835205, + "logps/chosen": -61.09014892578125, + "logps/rejected": -92.50244140625, + "loss": 0.5501, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2779786586761475, + "rewards/margins": 0.07074832916259766, + "rewards/rejected": 2.20723032951355, + "step": 11311 + }, + { + "epoch": 1.84, + "learning_rate": 1.7516571392458547e-07, + "logits/chosen": -1.5105587244033813, + "logits/rejected": -1.5233417749404907, + "logps/chosen": -39.506744384765625, + "logps/rejected": -69.03170776367188, + "loss": 1.4782, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.078542470932007, + "rewards/margins": -2.8963658809661865, + "rewards/rejected": 4.974908351898193, + "step": 11312 + }, + { + "epoch": 1.84, + "learning_rate": 1.7482105742035516e-07, + "logits/chosen": -1.1501351594924927, + "logits/rejected": -1.1501351594924927, + "logps/chosen": -15.202139854431152, + "logps/rejected": -15.202139854431152, + "loss": 0.461, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5323641896247864, + "rewards/margins": 0.0, + "rewards/rejected": 0.5323641896247864, + "step": 11313 + }, + { + "epoch": 1.84, + "learning_rate": 1.7447673429033361e-07, + "logits/chosen": -1.0860657691955566, + "logits/rejected": -1.080265998840332, + "logps/chosen": -113.33673858642578, + "logps/rejected": -107.72791290283203, + "loss": 0.4461, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.13358998298645, + "rewards/margins": 0.6987556219100952, + "rewards/rejected": 1.434834361076355, + "step": 11314 + }, + { + "epoch": 1.84, + "learning_rate": 1.7413274455831074e-07, + "logits/chosen": -1.5066018104553223, + "logits/rejected": -1.465229868888855, + "logps/chosen": -60.62601089477539, + "logps/rejected": -74.48304748535156, + "loss": 1.0854, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1055562496185303, + "rewards/margins": -0.5264842510223389, + "rewards/rejected": 3.632040500640869, + "step": 11315 + }, + { + "epoch": 1.84, + "learning_rate": 1.7378908824805195e-07, + "logits/chosen": -0.9979085326194763, + "logits/rejected": -0.9504037499427795, + "logps/chosen": -35.32079315185547, + "logps/rejected": -41.30314636230469, + "loss": 0.5891, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.238075613975525, + "rewards/margins": 0.47134625911712646, + "rewards/rejected": 0.7667293548583984, + "step": 11316 + }, + { + "epoch": 1.84, + "learning_rate": 1.7344576538330104e-07, + "logits/chosen": -1.0279982089996338, + "logits/rejected": -1.020219326019287, + "logps/chosen": -41.109519958496094, + "logps/rejected": -77.72468566894531, + "loss": 0.6425, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6004737615585327, + "rewards/margins": 0.21401596069335938, + "rewards/rejected": 1.3864578008651733, + "step": 11317 + }, + { + "epoch": 1.84, + "learning_rate": 1.7310277598777847e-07, + "logits/chosen": -0.9435778856277466, + "logits/rejected": -0.9853855967521667, + "logps/chosen": -93.86492919921875, + "logps/rejected": -47.0739860534668, + "loss": 1.5161, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5717269778251648, + "rewards/margins": -2.311767101287842, + "rewards/rejected": 2.8834941387176514, + "step": 11318 + }, + { + "epoch": 1.84, + "learning_rate": 1.7276012008518195e-07, + "logits/chosen": -1.3401634693145752, + "logits/rejected": -1.2326531410217285, + "logps/chosen": -66.30406188964844, + "logps/rejected": -52.775657653808594, + "loss": 0.5913, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5306243896484375, + "rewards/margins": 0.09522628784179688, + "rewards/rejected": 3.4353981018066406, + "step": 11319 + }, + { + "epoch": 1.84, + "learning_rate": 1.724177976991853e-07, + "logits/chosen": -1.0658257007598877, + "logits/rejected": -1.1048121452331543, + "logps/chosen": -62.81592559814453, + "logps/rejected": -63.279876708984375, + "loss": 0.2258, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.942792534828186, + "rewards/margins": 1.1829383373260498, + "rewards/rejected": 0.7598541378974915, + "step": 11320 + }, + { + "epoch": 1.84, + "learning_rate": 1.7207580885343911e-07, + "logits/chosen": -1.2913172245025635, + "logits/rejected": -1.0771008729934692, + "logps/chosen": -64.74927520751953, + "logps/rejected": -33.13792037963867, + "loss": 0.1528, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0667877197265625, + "rewards/margins": 3.9895505905151367, + "rewards/rejected": -0.9227628707885742, + "step": 11321 + }, + { + "epoch": 1.84, + "learning_rate": 1.717341535715733e-07, + "logits/chosen": -1.5720490217208862, + "logits/rejected": -1.519622802734375, + "logps/chosen": -58.73674774169922, + "logps/rejected": -76.96391296386719, + "loss": 0.3511, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8443024158477783, + "rewards/margins": 0.6563887596130371, + "rewards/rejected": 2.187913656234741, + "step": 11322 + }, + { + "epoch": 1.84, + "learning_rate": 1.7139283187719124e-07, + "logits/chosen": -1.0459097623825073, + "logits/rejected": -1.0459097623825073, + "logps/chosen": -48.32136917114258, + "logps/rejected": -48.32136917114258, + "loss": 0.3546, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6457271575927734, + "rewards/margins": 0.0, + "rewards/rejected": 2.6457271575927734, + "step": 11323 + }, + { + "epoch": 1.84, + "learning_rate": 1.7105184379387628e-07, + "logits/chosen": -1.4549353122711182, + "logits/rejected": -1.4727411270141602, + "logps/chosen": -64.89669799804688, + "logps/rejected": -133.95211791992188, + "loss": 0.9396, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9884393215179443, + "rewards/margins": -1.6082208156585693, + "rewards/rejected": 4.596660137176514, + "step": 11324 + }, + { + "epoch": 1.84, + "learning_rate": 1.7071118934518628e-07, + "logits/chosen": -1.299882173538208, + "logits/rejected": -1.3536983728408813, + "logps/chosen": -151.50518798828125, + "logps/rejected": -84.17454528808594, + "loss": 0.7308, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.1561431884765625, + "rewards/margins": 0.6047101020812988, + "rewards/rejected": 5.551433086395264, + "step": 11325 + }, + { + "epoch": 1.84, + "learning_rate": 1.7037086855465902e-07, + "logits/chosen": -1.4793704748153687, + "logits/rejected": -1.36172616481781, + "logps/chosen": -157.36312866210938, + "logps/rejected": -22.013856887817383, + "loss": 0.1843, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2516205310821533, + "rewards/margins": 0.9036164283752441, + "rewards/rejected": 2.348004102706909, + "step": 11326 + }, + { + "epoch": 1.84, + "learning_rate": 1.7003088144580515e-07, + "logits/chosen": -1.7002922296524048, + "logits/rejected": -1.637284755706787, + "logps/chosen": -63.13814163208008, + "logps/rejected": -40.638092041015625, + "loss": 0.2619, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.330648422241211, + "rewards/margins": 1.0522994995117188, + "rewards/rejected": 2.278348922729492, + "step": 11327 + }, + { + "epoch": 1.84, + "learning_rate": 1.69691228042117e-07, + "logits/chosen": -1.229877233505249, + "logits/rejected": -1.199173927307129, + "logps/chosen": -54.98560333251953, + "logps/rejected": -37.36900329589844, + "loss": 2.0318, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7067322731018066, + "rewards/margins": 0.12757182121276855, + "rewards/rejected": 2.579160451889038, + "step": 11328 + }, + { + "epoch": 1.84, + "learning_rate": 1.6935190836705916e-07, + "logits/chosen": -1.2917426824569702, + "logits/rejected": -1.121048927307129, + "logps/chosen": -152.12631225585938, + "logps/rejected": -72.3109130859375, + "loss": 0.4012, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.961963176727295, + "rewards/margins": 3.996307611465454, + "rewards/rejected": 2.965655565261841, + "step": 11329 + }, + { + "epoch": 1.84, + "learning_rate": 1.6901292244407729e-07, + "logits/chosen": -1.0178275108337402, + "logits/rejected": -0.9959423542022705, + "logps/chosen": -23.773433685302734, + "logps/rejected": -1.9909464120864868, + "loss": 0.5533, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5542106628417969, + "rewards/margins": 0.1506497859954834, + "rewards/rejected": 0.4035608768463135, + "step": 11330 + }, + { + "epoch": 1.84, + "learning_rate": 1.6867427029659046e-07, + "logits/chosen": -1.014995813369751, + "logits/rejected": -1.0005955696105957, + "logps/chosen": -12.305326461791992, + "logps/rejected": -2.482142210006714, + "loss": 0.7072, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3897272050380707, + "rewards/margins": -0.31030693650245667, + "rewards/rejected": 0.7000341415405273, + "step": 11331 + }, + { + "epoch": 1.84, + "learning_rate": 1.6833595194799767e-07, + "logits/chosen": -1.1740785837173462, + "logits/rejected": -1.182733178138733, + "logps/chosen": -66.27098083496094, + "logps/rejected": -114.71832275390625, + "loss": 1.1384, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8769256472587585, + "rewards/margins": -2.113384962081909, + "rewards/rejected": 2.9903106689453125, + "step": 11332 + }, + { + "epoch": 1.84, + "learning_rate": 1.6799796742167307e-07, + "logits/chosen": -1.4101322889328003, + "logits/rejected": -1.0669969320297241, + "logps/chosen": -276.89105224609375, + "logps/rejected": -106.58486938476562, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.220477104187012, + "rewards/margins": 4.155721664428711, + "rewards/rejected": 5.064755439758301, + "step": 11333 + }, + { + "epoch": 1.84, + "learning_rate": 1.6766031674096795e-07, + "logits/chosen": -1.1051172018051147, + "logits/rejected": -1.122585415840149, + "logps/chosen": -48.27484893798828, + "logps/rejected": -57.13275146484375, + "loss": 0.6253, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5398948192596436, + "rewards/margins": 1.9597601890563965, + "rewards/rejected": 1.580134630203247, + "step": 11334 + }, + { + "epoch": 1.84, + "learning_rate": 1.673229999292103e-07, + "logits/chosen": -1.039945125579834, + "logits/rejected": -1.0073294639587402, + "logps/chosen": -52.650733947753906, + "logps/rejected": -33.89474868774414, + "loss": 0.4202, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1658852100372314, + "rewards/margins": 0.9140307903289795, + "rewards/rejected": 2.251854419708252, + "step": 11335 + }, + { + "epoch": 1.84, + "learning_rate": 1.6698601700970707e-07, + "logits/chosen": -1.1926933526992798, + "logits/rejected": -0.9540087580680847, + "logps/chosen": -139.1023406982422, + "logps/rejected": -41.7652473449707, + "loss": 0.8751, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.545797824859619, + "rewards/margins": 3.3423211574554443, + "rewards/rejected": 2.203476667404175, + "step": 11336 + }, + { + "epoch": 1.84, + "learning_rate": 1.6664936800573906e-07, + "logits/chosen": -1.1806703805923462, + "logits/rejected": -1.1506563425064087, + "logps/chosen": -150.92303466796875, + "logps/rejected": -79.17156219482422, + "loss": 0.1336, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.566915988922119, + "rewards/margins": 1.2222687005996704, + "rewards/rejected": 1.3446472883224487, + "step": 11337 + }, + { + "epoch": 1.84, + "learning_rate": 1.6631305294056655e-07, + "logits/chosen": -1.3270436525344849, + "logits/rejected": -1.2935824394226074, + "logps/chosen": -47.01348114013672, + "logps/rejected": -53.522438049316406, + "loss": 0.061, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.086881160736084, + "rewards/margins": 2.600440740585327, + "rewards/rejected": 2.486440420150757, + "step": 11338 + }, + { + "epoch": 1.84, + "learning_rate": 1.6597707183742484e-07, + "logits/chosen": -1.1070650815963745, + "logits/rejected": -0.8567415475845337, + "logps/chosen": -70.995361328125, + "logps/rejected": -32.601287841796875, + "loss": 0.3622, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8043854236602783, + "rewards/margins": 2.8998115062713623, + "rewards/rejected": -0.09542598575353622, + "step": 11339 + }, + { + "epoch": 1.84, + "learning_rate": 1.6564142471952815e-07, + "logits/chosen": -1.3772221803665161, + "logits/rejected": -1.3301575183868408, + "logps/chosen": -43.19977569580078, + "logps/rejected": -66.18818664550781, + "loss": 0.4389, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9784996509552, + "rewards/margins": -0.19770431518554688, + "rewards/rejected": 3.176203966140747, + "step": 11340 + }, + { + "epoch": 1.84, + "learning_rate": 1.6530611161006515e-07, + "logits/chosen": -1.1762088537216187, + "logits/rejected": -1.1062414646148682, + "logps/chosen": -96.36582946777344, + "logps/rejected": -50.97824478149414, + "loss": 0.8795, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0301284790039062, + "rewards/margins": 0.7752711772918701, + "rewards/rejected": 2.254857301712036, + "step": 11341 + }, + { + "epoch": 1.84, + "learning_rate": 1.64971132532204e-07, + "logits/chosen": -1.1908719539642334, + "logits/rejected": -1.248273253440857, + "logps/chosen": -41.550559997558594, + "logps/rejected": -66.37033081054688, + "loss": 0.992, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7920173406600952, + "rewards/margins": -1.833152174949646, + "rewards/rejected": 3.625169515609741, + "step": 11342 + }, + { + "epoch": 1.84, + "learning_rate": 1.6463648750908778e-07, + "logits/chosen": -0.8409909009933472, + "logits/rejected": -0.8172035813331604, + "logps/chosen": -77.37434387207031, + "logps/rejected": -45.330562591552734, + "loss": 0.2657, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.845149278640747, + "rewards/margins": 0.360215425491333, + "rewards/rejected": 1.484933853149414, + "step": 11343 + }, + { + "epoch": 1.84, + "learning_rate": 1.6430217656383806e-07, + "logits/chosen": -1.0786702632904053, + "logits/rejected": -1.1030926704406738, + "logps/chosen": -64.73593139648438, + "logps/rejected": -83.17498016357422, + "loss": 0.2175, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0098037719726562, + "rewards/margins": 0.7569320201873779, + "rewards/rejected": 1.2528717517852783, + "step": 11344 + }, + { + "epoch": 1.84, + "learning_rate": 1.639681997195519e-07, + "logits/chosen": -1.057188630104065, + "logits/rejected": -1.0556362867355347, + "logps/chosen": -2.291628837585449, + "logps/rejected": -4.198273658752441, + "loss": 0.4633, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11384570598602295, + "rewards/margins": 0.028667666018009186, + "rewards/rejected": 0.08517803996801376, + "step": 11345 + }, + { + "epoch": 1.84, + "learning_rate": 1.636345569993042e-07, + "logits/chosen": -1.5831968784332275, + "logits/rejected": -1.7632516622543335, + "logps/chosen": -160.88262939453125, + "logps/rejected": -137.65586853027344, + "loss": 0.3146, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.310943603515625, + "rewards/margins": 0.2820248603820801, + "rewards/rejected": 6.028918743133545, + "step": 11346 + }, + { + "epoch": 1.84, + "learning_rate": 1.6330124842614647e-07, + "logits/chosen": -1.477064847946167, + "logits/rejected": -1.353344202041626, + "logps/chosen": -132.83770751953125, + "logps/rejected": -69.06521606445312, + "loss": 0.7827, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.482064723968506, + "rewards/margins": 1.590137243270874, + "rewards/rejected": 3.891927480697632, + "step": 11347 + }, + { + "epoch": 1.84, + "learning_rate": 1.6296827402310756e-07, + "logits/chosen": -0.9127027988433838, + "logits/rejected": -0.9127027988433838, + "logps/chosen": -37.065086364746094, + "logps/rejected": -37.065086364746094, + "loss": 0.6894, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2172771692276, + "rewards/margins": 0.0, + "rewards/rejected": 1.2172771692276, + "step": 11348 + }, + { + "epoch": 1.84, + "learning_rate": 1.626356338131918e-07, + "logits/chosen": -1.4427640438079834, + "logits/rejected": -1.3583468198776245, + "logps/chosen": -84.84357452392578, + "logps/rejected": -61.795501708984375, + "loss": 0.0927, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.802834510803223, + "rewards/margins": 1.6186714172363281, + "rewards/rejected": 4.1841630935668945, + "step": 11349 + }, + { + "epoch": 1.84, + "learning_rate": 1.6230332781938253e-07, + "logits/chosen": -1.3680362701416016, + "logits/rejected": -1.4185408353805542, + "logps/chosen": -60.812225341796875, + "logps/rejected": -51.7034797668457, + "loss": 0.9084, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8220932483673096, + "rewards/margins": -1.5731594562530518, + "rewards/rejected": 4.395252704620361, + "step": 11350 + }, + { + "epoch": 1.84, + "learning_rate": 1.6197135606463855e-07, + "logits/chosen": -1.1923270225524902, + "logits/rejected": -1.1665900945663452, + "logps/chosen": -29.40624237060547, + "logps/rejected": -12.11042308807373, + "loss": 0.5506, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9576007723808289, + "rewards/margins": 0.1393030881881714, + "rewards/rejected": 0.8182976841926575, + "step": 11351 + }, + { + "epoch": 1.84, + "learning_rate": 1.6163971857189652e-07, + "logits/chosen": -1.329030156135559, + "logits/rejected": -1.2683497667312622, + "logps/chosen": -55.808494567871094, + "logps/rejected": -8.622010231018066, + "loss": 0.6288, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4037163257598877, + "rewards/margins": 1.9122965335845947, + "rewards/rejected": 0.49141979217529297, + "step": 11352 + }, + { + "epoch": 1.84, + "learning_rate": 1.6130841536406816e-07, + "logits/chosen": -1.2703757286071777, + "logits/rejected": -1.1873576641082764, + "logps/chosen": -29.87320899963379, + "logps/rejected": -19.021806716918945, + "loss": 0.1908, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.913344621658325, + "rewards/margins": 2.912424087524414, + "rewards/rejected": 1.0009205341339111, + "step": 11353 + }, + { + "epoch": 1.84, + "learning_rate": 1.6097744646404457e-07, + "logits/chosen": -1.3840699195861816, + "logits/rejected": -1.1939637660980225, + "logps/chosen": -100.50067138671875, + "logps/rejected": -44.703125, + "loss": 0.0271, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.9297194480896, + "rewards/margins": 3.3834967613220215, + "rewards/rejected": 2.546222686767578, + "step": 11354 + }, + { + "epoch": 1.84, + "learning_rate": 1.606468118946919e-07, + "logits/chosen": -1.2545757293701172, + "logits/rejected": -1.2438502311706543, + "logps/chosen": -59.64251708984375, + "logps/rejected": -60.52176284790039, + "loss": 0.9452, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6475982666015625, + "rewards/margins": -0.5822408199310303, + "rewards/rejected": 3.2298390865325928, + "step": 11355 + }, + { + "epoch": 1.84, + "learning_rate": 1.6031651167885465e-07, + "logits/chosen": -1.2499992847442627, + "logits/rejected": -1.2243045568466187, + "logps/chosen": -56.89980697631836, + "logps/rejected": -80.27943420410156, + "loss": 0.4231, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2941761016845703, + "rewards/margins": 0.4099079370498657, + "rewards/rejected": 1.8842681646347046, + "step": 11356 + }, + { + "epoch": 1.84, + "learning_rate": 1.5998654583935235e-07, + "logits/chosen": -1.2035363912582397, + "logits/rejected": -1.2446248531341553, + "logps/chosen": -61.257469177246094, + "logps/rejected": -50.35626983642578, + "loss": 0.4089, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.38639760017395, + "rewards/margins": 0.0670928955078125, + "rewards/rejected": 3.3193047046661377, + "step": 11357 + }, + { + "epoch": 1.84, + "learning_rate": 1.5965691439898345e-07, + "logits/chosen": -1.2690390348434448, + "logits/rejected": -1.3036315441131592, + "logps/chosen": -108.0273208618164, + "logps/rejected": -146.5162811279297, + "loss": 1.7933, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.515326738357544, + "rewards/margins": -1.6353905200958252, + "rewards/rejected": 4.150717258453369, + "step": 11358 + }, + { + "epoch": 1.84, + "learning_rate": 1.5932761738052192e-07, + "logits/chosen": -1.3793587684631348, + "logits/rejected": -1.3897656202316284, + "logps/chosen": -90.92086791992188, + "logps/rejected": -66.57920837402344, + "loss": 0.7997, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4556825160980225, + "rewards/margins": -0.6373350620269775, + "rewards/rejected": 3.093017578125, + "step": 11359 + }, + { + "epoch": 1.84, + "learning_rate": 1.5899865480671904e-07, + "logits/chosen": -0.9798622727394104, + "logits/rejected": -0.9854996204376221, + "logps/chosen": -64.47441864013672, + "logps/rejected": -66.5284652709961, + "loss": 0.2366, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0412437915802, + "rewards/margins": 0.6429153680801392, + "rewards/rejected": 1.398328423500061, + "step": 11360 + }, + { + "epoch": 1.84, + "learning_rate": 1.5867002670030386e-07, + "logits/chosen": -1.4032816886901855, + "logits/rejected": -1.3957067728042603, + "logps/chosen": -45.41210174560547, + "logps/rejected": -68.64840698242188, + "loss": 0.4723, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.779521942138672, + "rewards/margins": 0.5543053150177002, + "rewards/rejected": 2.2252166271209717, + "step": 11361 + }, + { + "epoch": 1.84, + "learning_rate": 1.583417330839798e-07, + "logits/chosen": -1.541123867034912, + "logits/rejected": -1.5614302158355713, + "logps/chosen": -118.03939819335938, + "logps/rejected": -118.526123046875, + "loss": 0.0735, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.61794900894165, + "rewards/margins": 2.117784023284912, + "rewards/rejected": 5.500164985656738, + "step": 11362 + }, + { + "epoch": 1.84, + "learning_rate": 1.580137739804305e-07, + "logits/chosen": -1.2354551553726196, + "logits/rejected": -1.2446402311325073, + "logps/chosen": -79.91596221923828, + "logps/rejected": -95.14681243896484, + "loss": 0.3524, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3608345985412598, + "rewards/margins": 1.004591464996338, + "rewards/rejected": 1.3562431335449219, + "step": 11363 + }, + { + "epoch": 1.84, + "learning_rate": 1.576861494123133e-07, + "logits/chosen": -1.1063257455825806, + "logits/rejected": -1.1328128576278687, + "logps/chosen": -41.97710418701172, + "logps/rejected": -116.13115692138672, + "loss": 2.7702, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.742391347885132, + "rewards/margins": -5.474123954772949, + "rewards/rejected": 8.21651554107666, + "step": 11364 + }, + { + "epoch": 1.84, + "learning_rate": 1.573588594022657e-07, + "logits/chosen": -1.486060380935669, + "logits/rejected": -1.401911973953247, + "logps/chosen": -90.90096282958984, + "logps/rejected": -28.608245849609375, + "loss": 0.3316, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9408347606658936, + "rewards/margins": 2.2910513877868652, + "rewards/rejected": 0.6497833132743835, + "step": 11365 + }, + { + "epoch": 1.84, + "learning_rate": 1.5703190397289902e-07, + "logits/chosen": -1.7879292964935303, + "logits/rejected": -1.6970834732055664, + "logps/chosen": -105.82656860351562, + "logps/rejected": -14.40267562866211, + "loss": 1.0858, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.04733419418335, + "rewards/margins": 6.100357532501221, + "rewards/rejected": 0.9469764828681946, + "step": 11366 + }, + { + "epoch": 1.84, + "learning_rate": 1.56705283146803e-07, + "logits/chosen": -1.3237240314483643, + "logits/rejected": -1.1852689981460571, + "logps/chosen": -74.85536193847656, + "logps/rejected": -18.6600341796875, + "loss": 0.0358, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.1284685134887695, + "rewards/margins": 3.4171154499053955, + "rewards/rejected": 0.7113531231880188, + "step": 11367 + }, + { + "epoch": 1.85, + "learning_rate": 1.5637899694654456e-07, + "logits/chosen": -1.4438848495483398, + "logits/rejected": -1.45480477809906, + "logps/chosen": -192.72654724121094, + "logps/rejected": -65.02298736572266, + "loss": 0.2754, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.231004238128662, + "rewards/margins": 3.8035008907318115, + "rewards/rejected": 3.4275033473968506, + "step": 11368 + }, + { + "epoch": 1.85, + "learning_rate": 1.5605304539466625e-07, + "logits/chosen": -1.2675319910049438, + "logits/rejected": -1.3426649570465088, + "logps/chosen": -59.9739990234375, + "logps/rejected": -101.3106918334961, + "loss": 0.2276, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9902496337890625, + "rewards/margins": 0.7303626537322998, + "rewards/rejected": 1.2598869800567627, + "step": 11369 + }, + { + "epoch": 1.85, + "learning_rate": 1.5572742851368895e-07, + "logits/chosen": -1.3671550750732422, + "logits/rejected": -1.3598934412002563, + "logps/chosen": -65.1743392944336, + "logps/rejected": -120.6788558959961, + "loss": 0.1459, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3891639709472656, + "rewards/margins": 1.264836072921753, + "rewards/rejected": 1.1243278980255127, + "step": 11370 + }, + { + "epoch": 1.85, + "learning_rate": 1.5540214632610962e-07, + "logits/chosen": -1.3853894472122192, + "logits/rejected": -1.351086974143982, + "logps/chosen": -105.13433074951172, + "logps/rejected": -69.90228271484375, + "loss": 0.196, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0743415355682373, + "rewards/margins": 0.8052527904510498, + "rewards/rejected": 2.2690887451171875, + "step": 11371 + }, + { + "epoch": 1.85, + "learning_rate": 1.5507719885440143e-07, + "logits/chosen": -0.748503565788269, + "logits/rejected": -0.7420074939727783, + "logps/chosen": -0.757561206817627, + "logps/rejected": -8.213629722595215, + "loss": 0.6753, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19510427117347717, + "rewards/margins": 0.16396775841712952, + "rewards/rejected": 0.031136512756347656, + "step": 11372 + }, + { + "epoch": 1.85, + "learning_rate": 1.547525861210164e-07, + "logits/chosen": -1.4587101936340332, + "logits/rejected": -1.367746353149414, + "logps/chosen": -54.273921966552734, + "logps/rejected": -36.43693542480469, + "loss": 0.1234, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1263203620910645, + "rewards/margins": 1.525838851928711, + "rewards/rejected": 0.6004814505577087, + "step": 11373 + }, + { + "epoch": 1.85, + "learning_rate": 1.544283081483805e-07, + "logits/chosen": -1.888980746269226, + "logits/rejected": -1.8937466144561768, + "logps/chosen": -56.48221969604492, + "logps/rejected": -71.24131774902344, + "loss": 0.6492, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9557042121887207, + "rewards/margins": -0.9605205059051514, + "rewards/rejected": 3.916224718093872, + "step": 11374 + }, + { + "epoch": 1.85, + "learning_rate": 1.5410436495890026e-07, + "logits/chosen": -1.6167479753494263, + "logits/rejected": -1.5540452003479004, + "logps/chosen": -98.52539825439453, + "logps/rejected": -100.56134033203125, + "loss": 0.0321, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.88289737701416, + "rewards/margins": 2.7257211208343506, + "rewards/rejected": 3.1571762561798096, + "step": 11375 + }, + { + "epoch": 1.85, + "learning_rate": 1.5378075657495552e-07, + "logits/chosen": -1.1885606050491333, + "logits/rejected": -1.1654071807861328, + "logps/chosen": -63.26818084716797, + "logps/rejected": -99.73960876464844, + "loss": 0.4712, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.7184524536132812, + "rewards/margins": -0.3903014659881592, + "rewards/rejected": 3.1087539196014404, + "step": 11376 + }, + { + "epoch": 1.85, + "learning_rate": 1.5345748301890562e-07, + "logits/chosen": -1.2352381944656372, + "logits/rejected": -1.1981552839279175, + "logps/chosen": -76.28651428222656, + "logps/rejected": -59.69770050048828, + "loss": 0.9639, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2229385375976562, + "rewards/margins": -1.5885009765625, + "rewards/rejected": 4.811439514160156, + "step": 11377 + }, + { + "epoch": 1.85, + "learning_rate": 1.5313454431308494e-07, + "logits/chosen": -1.2340291738510132, + "logits/rejected": -1.4047727584838867, + "logps/chosen": -218.78030395507812, + "logps/rejected": -154.90142822265625, + "loss": 0.3267, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.030142307281494, + "rewards/margins": 0.10286855697631836, + "rewards/rejected": 4.927273750305176, + "step": 11378 + }, + { + "epoch": 1.85, + "learning_rate": 1.5281194047980562e-07, + "logits/chosen": -1.2994028329849243, + "logits/rejected": -0.9534527063369751, + "logps/chosen": -141.42852783203125, + "logps/rejected": -9.401538848876953, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.714005947113037, + "rewards/margins": 3.8670737743377686, + "rewards/rejected": 0.8469322323799133, + "step": 11379 + }, + { + "epoch": 1.85, + "learning_rate": 1.5248967154135708e-07, + "logits/chosen": -1.5177892446517944, + "logits/rejected": -1.3130977153778076, + "logps/chosen": -43.623085021972656, + "logps/rejected": -63.83068084716797, + "loss": 1.303, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0677452087402344, + "rewards/margins": -2.152737617492676, + "rewards/rejected": 5.22048282623291, + "step": 11380 + }, + { + "epoch": 1.85, + "learning_rate": 1.521677375200048e-07, + "logits/chosen": -1.3318164348602295, + "logits/rejected": -1.3597908020019531, + "logps/chosen": -79.2109375, + "logps/rejected": -51.3321533203125, + "loss": 0.2778, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.676713705062866, + "rewards/margins": 0.3734910488128662, + "rewards/rejected": 2.30322265625, + "step": 11381 + }, + { + "epoch": 1.85, + "learning_rate": 1.5184613843799046e-07, + "logits/chosen": -1.3530772924423218, + "logits/rejected": -1.1325143575668335, + "logps/chosen": -58.248802185058594, + "logps/rejected": -14.636432647705078, + "loss": 0.1995, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.385871171951294, + "rewards/margins": 2.685030460357666, + "rewards/rejected": 0.7008407711982727, + "step": 11382 + }, + { + "epoch": 1.85, + "learning_rate": 1.5152487431753515e-07, + "logits/chosen": -1.5622061491012573, + "logits/rejected": -1.5211900472640991, + "logps/chosen": -58.617164611816406, + "logps/rejected": -69.48072814941406, + "loss": 0.6947, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.750354766845703, + "rewards/margins": -1.0938515663146973, + "rewards/rejected": 3.8442063331604004, + "step": 11383 + }, + { + "epoch": 1.85, + "learning_rate": 1.5120394518083338e-07, + "logits/chosen": -1.2962099313735962, + "logits/rejected": -1.2586033344268799, + "logps/chosen": -69.40959930419922, + "logps/rejected": -23.000577926635742, + "loss": 0.2862, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.298919677734375, + "rewards/margins": 1.249686598777771, + "rewards/rejected": 1.049233078956604, + "step": 11384 + }, + { + "epoch": 1.85, + "learning_rate": 1.5088335105006014e-07, + "logits/chosen": -1.2868616580963135, + "logits/rejected": -1.2868616580963135, + "logps/chosen": -30.487138748168945, + "logps/rejected": -30.487138748168945, + "loss": 0.3601, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.376160621643066, + "rewards/margins": 0.0, + "rewards/rejected": 4.376160621643066, + "step": 11385 + }, + { + "epoch": 1.85, + "learning_rate": 1.5056309194736385e-07, + "logits/chosen": -1.4526288509368896, + "logits/rejected": -1.3579837083816528, + "logps/chosen": -86.25851440429688, + "logps/rejected": -81.7125244140625, + "loss": 0.1312, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.5514678955078125, + "rewards/margins": 1.3762848377227783, + "rewards/rejected": 3.175183057785034, + "step": 11386 + }, + { + "epoch": 1.85, + "learning_rate": 1.5024316789487292e-07, + "logits/chosen": -1.2280999422073364, + "logits/rejected": -1.2016761302947998, + "logps/chosen": -71.06715393066406, + "logps/rejected": -70.23795318603516, + "loss": 0.3874, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.592459201812744, + "rewards/margins": 1.4425652027130127, + "rewards/rejected": 2.1498939990997314, + "step": 11387 + }, + { + "epoch": 1.85, + "learning_rate": 1.4992357891468967e-07, + "logits/chosen": -1.3319523334503174, + "logits/rejected": -1.3679008483886719, + "logps/chosen": -54.148521423339844, + "logps/rejected": -91.6330337524414, + "loss": 1.6562, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.945258378982544, + "rewards/margins": -1.6552398204803467, + "rewards/rejected": 3.6004981994628906, + "step": 11388 + }, + { + "epoch": 1.85, + "learning_rate": 1.4960432502889588e-07, + "logits/chosen": -1.20212721824646, + "logits/rejected": -1.2120615243911743, + "logps/chosen": -84.75503540039062, + "logps/rejected": -62.38391876220703, + "loss": 0.5455, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.351344347000122, + "rewards/margins": 0.13006973266601562, + "rewards/rejected": 2.2212746143341064, + "step": 11389 + }, + { + "epoch": 1.85, + "learning_rate": 1.4928540625954725e-07, + "logits/chosen": -1.3700807094573975, + "logits/rejected": -1.2839725017547607, + "logps/chosen": -72.46090698242188, + "logps/rejected": -51.41147994995117, + "loss": 0.5463, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2834718227386475, + "rewards/margins": 0.0013256072998046875, + "rewards/rejected": 2.2821462154388428, + "step": 11390 + }, + { + "epoch": 1.85, + "learning_rate": 1.4896682262868e-07, + "logits/chosen": -1.1708018779754639, + "logits/rejected": -1.1596134901046753, + "logps/chosen": -83.51850891113281, + "logps/rejected": -90.92649841308594, + "loss": 1.2238, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1367568969726562, + "rewards/margins": -1.7517457008361816, + "rewards/rejected": 2.888502597808838, + "step": 11391 + }, + { + "epoch": 1.85, + "learning_rate": 1.4864857415830436e-07, + "logits/chosen": -1.2659847736358643, + "logits/rejected": -1.1661821603775024, + "logps/chosen": -93.13565063476562, + "logps/rejected": -39.63126754760742, + "loss": 0.2098, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8888596296310425, + "rewards/margins": 1.012040376663208, + "rewards/rejected": 0.8768192529678345, + "step": 11392 + }, + { + "epoch": 1.85, + "learning_rate": 1.483306608704077e-07, + "logits/chosen": -1.3773894309997559, + "logits/rejected": -1.2816038131713867, + "logps/chosen": -124.67286682128906, + "logps/rejected": -82.09037780761719, + "loss": 0.0761, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.3486008644104, + "rewards/margins": 1.9081754684448242, + "rewards/rejected": 4.440425395965576, + "step": 11393 + }, + { + "epoch": 1.85, + "learning_rate": 1.4801308278695636e-07, + "logits/chosen": -1.2978863716125488, + "logits/rejected": -1.2978863716125488, + "logps/chosen": -20.209869384765625, + "logps/rejected": -20.209869384765625, + "loss": 0.6977, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0304899215698242, + "rewards/margins": 0.0, + "rewards/rejected": 1.0304899215698242, + "step": 11394 + }, + { + "epoch": 1.85, + "learning_rate": 1.4769583992989056e-07, + "logits/chosen": -1.3132299184799194, + "logits/rejected": -1.2704647779464722, + "logps/chosen": -179.56338500976562, + "logps/rejected": -60.715118408203125, + "loss": 0.6172, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.627172827720642, + "rewards/margins": -0.7680672407150269, + "rewards/rejected": 2.395240068435669, + "step": 11395 + }, + { + "epoch": 1.85, + "learning_rate": 1.4737893232112944e-07, + "logits/chosen": -1.0464226007461548, + "logits/rejected": -1.105748176574707, + "logps/chosen": -77.04963684082031, + "logps/rejected": -95.31474304199219, + "loss": 0.2452, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3071656227111816, + "rewards/margins": 1.175377607345581, + "rewards/rejected": 2.1317880153656006, + "step": 11396 + }, + { + "epoch": 1.85, + "learning_rate": 1.4706235998256767e-07, + "logits/chosen": -1.150255799293518, + "logits/rejected": -1.136839509010315, + "logps/chosen": -62.663089752197266, + "logps/rejected": -65.86070251464844, + "loss": 1.519, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9383037090301514, + "rewards/margins": -2.070643186569214, + "rewards/rejected": 5.008946895599365, + "step": 11397 + }, + { + "epoch": 1.85, + "learning_rate": 1.4674612293607893e-07, + "logits/chosen": -1.346855640411377, + "logits/rejected": -1.1599528789520264, + "logps/chosen": -50.8637580871582, + "logps/rejected": -76.92369079589844, + "loss": 1.4621, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.66998553276062, + "rewards/margins": -1.7061269283294678, + "rewards/rejected": 4.376112461090088, + "step": 11398 + }, + { + "epoch": 1.85, + "learning_rate": 1.4643022120351012e-07, + "logits/chosen": -1.5196019411087036, + "logits/rejected": -1.299980640411377, + "logps/chosen": -134.71185302734375, + "logps/rejected": -26.992280960083008, + "loss": 0.3133, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.21697998046875, + "rewards/margins": 3.819079875946045, + "rewards/rejected": 1.3979002237319946, + "step": 11399 + }, + { + "epoch": 1.85, + "learning_rate": 1.4611465480668884e-07, + "logits/chosen": -1.369370937347412, + "logits/rejected": -1.2923325300216675, + "logps/chosen": -81.59053039550781, + "logps/rejected": -37.87818908691406, + "loss": 0.54, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4965667724609375, + "rewards/margins": -0.02725386619567871, + "rewards/rejected": 2.523820638656616, + "step": 11400 + }, + { + "epoch": 1.85, + "learning_rate": 1.4579942376741706e-07, + "logits/chosen": -0.9707120656967163, + "logits/rejected": -0.9682797789573669, + "logps/chosen": -38.28511047363281, + "logps/rejected": -76.73670959472656, + "loss": 1.376, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6749542951583862, + "rewards/margins": -2.642265796661377, + "rewards/rejected": 4.317220211029053, + "step": 11401 + }, + { + "epoch": 1.85, + "learning_rate": 1.4548452810747405e-07, + "logits/chosen": -1.010873794555664, + "logits/rejected": -0.6001093983650208, + "logps/chosen": -132.93377685546875, + "logps/rejected": -52.07443618774414, + "loss": 0.0816, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.2960662841796875, + "rewards/margins": 2.5819034576416016, + "rewards/rejected": 2.714162826538086, + "step": 11402 + }, + { + "epoch": 1.85, + "learning_rate": 1.451699678486157e-07, + "logits/chosen": -1.3634666204452515, + "logits/rejected": -1.261236310005188, + "logps/chosen": -90.24578857421875, + "logps/rejected": -68.59089660644531, + "loss": 0.1063, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.5667724609375, + "rewards/margins": 1.7004241943359375, + "rewards/rejected": 3.8663482666015625, + "step": 11403 + }, + { + "epoch": 1.85, + "learning_rate": 1.4485574301257688e-07, + "logits/chosen": -1.520820140838623, + "logits/rejected": -1.5013071298599243, + "logps/chosen": -67.21318817138672, + "logps/rejected": -78.26954650878906, + "loss": 0.5504, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9360771179199219, + "rewards/margins": 0.6863089799880981, + "rewards/rejected": 1.2497681379318237, + "step": 11404 + }, + { + "epoch": 1.85, + "learning_rate": 1.445418536210652e-07, + "logits/chosen": -1.1852368116378784, + "logits/rejected": -1.0257455110549927, + "logps/chosen": -43.776153564453125, + "logps/rejected": -10.555562973022461, + "loss": 1.043, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8635578155517578, + "rewards/margins": 0.9869443774223328, + "rewards/rejected": 0.876613438129425, + "step": 11405 + }, + { + "epoch": 1.85, + "learning_rate": 1.4422829969576945e-07, + "logits/chosen": -1.2941051721572876, + "logits/rejected": -1.2628140449523926, + "logps/chosen": -88.7108154296875, + "logps/rejected": -71.97846984863281, + "loss": 0.5319, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.803802490234375, + "rewards/margins": -0.633122444152832, + "rewards/rejected": 6.436924934387207, + "step": 11406 + }, + { + "epoch": 1.85, + "learning_rate": 1.439150812583523e-07, + "logits/chosen": -1.5020414590835571, + "logits/rejected": -1.4280465841293335, + "logps/chosen": -63.853294372558594, + "logps/rejected": -20.708358764648438, + "loss": 0.0434, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0974419116973877, + "rewards/margins": 2.417146921157837, + "rewards/rejected": 0.6802949905395508, + "step": 11407 + }, + { + "epoch": 1.85, + "learning_rate": 1.4360219833045476e-07, + "logits/chosen": -1.3451160192489624, + "logits/rejected": -1.501940131187439, + "logps/chosen": -36.01473617553711, + "logps/rejected": -144.32884216308594, + "loss": 3.308, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.622407913208008, + "rewards/margins": -6.585447311401367, + "rewards/rejected": 9.207855224609375, + "step": 11408 + }, + { + "epoch": 1.85, + "learning_rate": 1.4328965093369284e-07, + "logits/chosen": -1.2310737371444702, + "logits/rejected": -1.1900626420974731, + "logps/chosen": -49.58688735961914, + "logps/rejected": -52.35327911376953, + "loss": 0.9453, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.2728397846221924, + "rewards/margins": -0.674777626991272, + "rewards/rejected": 1.9476174116134644, + "step": 11409 + }, + { + "epoch": 1.85, + "learning_rate": 1.4297743908966212e-07, + "logits/chosen": -1.3697768449783325, + "logits/rejected": -1.3449971675872803, + "logps/chosen": -73.40118408203125, + "logps/rejected": -50.251792907714844, + "loss": 0.2789, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5630416870117188, + "rewards/margins": 0.7616584300994873, + "rewards/rejected": 2.8013832569122314, + "step": 11410 + }, + { + "epoch": 1.85, + "learning_rate": 1.4266556281993193e-07, + "logits/chosen": -1.1510133743286133, + "logits/rejected": -1.1492226123809814, + "logps/chosen": -6.733959197998047, + "logps/rejected": -8.133965492248535, + "loss": 0.4795, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45980367064476013, + "rewards/margins": 0.2387331873178482, + "rewards/rejected": 0.22107048332691193, + "step": 11411 + }, + { + "epoch": 1.85, + "learning_rate": 1.4235402214605175e-07, + "logits/chosen": -0.611833930015564, + "logits/rejected": -0.611833930015564, + "logps/chosen": -8.281070709228516, + "logps/rejected": -8.281070709228516, + "loss": 0.3723, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4262198507785797, + "rewards/margins": 0.0, + "rewards/rejected": 0.4262198507785797, + "step": 11412 + }, + { + "epoch": 1.85, + "learning_rate": 1.4204281708954437e-07, + "logits/chosen": -1.3524119853973389, + "logits/rejected": -1.3850606679916382, + "logps/chosen": -118.76019287109375, + "logps/rejected": -99.01565551757812, + "loss": 0.4076, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.530020236968994, + "rewards/margins": 0.5166778564453125, + "rewards/rejected": 5.013342380523682, + "step": 11413 + }, + { + "epoch": 1.85, + "learning_rate": 1.4173194767191257e-07, + "logits/chosen": -1.0376982688903809, + "logits/rejected": -1.0376982688903809, + "logps/chosen": -0.9791499972343445, + "logps/rejected": -0.9791499972343445, + "loss": 0.4525, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.27328428626060486, + "rewards/margins": 0.0, + "rewards/rejected": 0.27328428626060486, + "step": 11414 + }, + { + "epoch": 1.85, + "learning_rate": 1.414214139146336e-07, + "logits/chosen": -0.8179313540458679, + "logits/rejected": -0.8422888517379761, + "logps/chosen": -4.318436145782471, + "logps/rejected": -27.662755966186523, + "loss": 0.34, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30494123697280884, + "rewards/margins": 0.027619600296020508, + "rewards/rejected": 0.27732163667678833, + "step": 11415 + }, + { + "epoch": 1.85, + "learning_rate": 1.4111121583916254e-07, + "logits/chosen": -1.52646803855896, + "logits/rejected": -1.5437482595443726, + "logps/chosen": -32.21727752685547, + "logps/rejected": -83.41752624511719, + "loss": 0.7772, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.240856885910034, + "rewards/margins": -0.7614409923553467, + "rewards/rejected": 4.002297878265381, + "step": 11416 + }, + { + "epoch": 1.85, + "learning_rate": 1.4080135346693112e-07, + "logits/chosen": -1.1890451908111572, + "logits/rejected": -1.1890451908111572, + "logps/chosen": -14.816442489624023, + "logps/rejected": -14.816442489624023, + "loss": 0.3784, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0169016122817993, + "rewards/margins": 0.0, + "rewards/rejected": 1.0169016122817993, + "step": 11417 + }, + { + "epoch": 1.85, + "learning_rate": 1.4049182681934835e-07, + "logits/chosen": -1.4149209260940552, + "logits/rejected": -1.33890700340271, + "logps/chosen": -96.35816955566406, + "logps/rejected": -25.85855484008789, + "loss": 0.6626, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.880514621734619, + "rewards/margins": 2.119868755340576, + "rewards/rejected": 1.7606457471847534, + "step": 11418 + }, + { + "epoch": 1.85, + "learning_rate": 1.4018263591779935e-07, + "logits/chosen": -1.4800711870193481, + "logits/rejected": -1.5049912929534912, + "logps/chosen": -95.2106704711914, + "logps/rejected": -161.93145751953125, + "loss": 0.1377, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.429793357849121, + "rewards/margins": 1.3147878646850586, + "rewards/rejected": 7.1150054931640625, + "step": 11419 + }, + { + "epoch": 1.85, + "learning_rate": 1.3987378078364534e-07, + "logits/chosen": -1.3670850992202759, + "logits/rejected": -1.3448936939239502, + "logps/chosen": -83.61239624023438, + "logps/rejected": -42.39512252807617, + "loss": 1.0639, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.261953830718994, + "rewards/margins": -0.12444710731506348, + "rewards/rejected": 3.3864009380340576, + "step": 11420 + }, + { + "epoch": 1.85, + "learning_rate": 1.3956526143822701e-07, + "logits/chosen": -1.4706426858901978, + "logits/rejected": -1.4706426858901978, + "logps/chosen": -28.36431884765625, + "logps/rejected": -28.36431884765625, + "loss": 0.4386, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.040210008621216, + "rewards/margins": 0.0, + "rewards/rejected": 3.040210008621216, + "step": 11421 + }, + { + "epoch": 1.85, + "learning_rate": 1.3925707790285848e-07, + "logits/chosen": -1.4925782680511475, + "logits/rejected": -1.389044165611267, + "logps/chosen": -70.0845718383789, + "logps/rejected": -34.24521255493164, + "loss": 0.4533, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.457530975341797, + "rewards/margins": 2.1729965209960938, + "rewards/rejected": 0.2845344543457031, + "step": 11422 + }, + { + "epoch": 1.85, + "learning_rate": 1.3894923019883378e-07, + "logits/chosen": -1.6611305475234985, + "logits/rejected": -1.5530343055725098, + "logps/chosen": -81.64859008789062, + "logps/rejected": -35.69305419921875, + "loss": 0.3385, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.545794725418091, + "rewards/margins": 0.04157710075378418, + "rewards/rejected": 2.5042176246643066, + "step": 11423 + }, + { + "epoch": 1.85, + "learning_rate": 1.3864171834742146e-07, + "logits/chosen": -1.4904130697250366, + "logits/rejected": -1.2790266275405884, + "logps/chosen": -125.62379455566406, + "logps/rejected": -79.24712371826172, + "loss": 0.1071, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.327857971191406, + "rewards/margins": 2.833611249923706, + "rewards/rejected": 3.4942467212677, + "step": 11424 + }, + { + "epoch": 1.85, + "learning_rate": 1.3833454236986788e-07, + "logits/chosen": -0.8244641423225403, + "logits/rejected": -0.8316822648048401, + "logps/chosen": -5.124290466308594, + "logps/rejected": -1.4519672393798828, + "loss": 0.3644, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43414878845214844, + "rewards/margins": 0.19208760559558868, + "rewards/rejected": 0.24206118285655975, + "step": 11425 + }, + { + "epoch": 1.85, + "learning_rate": 1.3802770228739547e-07, + "logits/chosen": -1.1238067150115967, + "logits/rejected": -1.1311208009719849, + "logps/chosen": -21.95541000366211, + "logps/rejected": -13.936711311340332, + "loss": 0.8022, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4022684097290039, + "rewards/margins": -0.39720410108566284, + "rewards/rejected": 0.7994725108146667, + "step": 11426 + }, + { + "epoch": 1.85, + "learning_rate": 1.377211981212051e-07, + "logits/chosen": -1.3694579601287842, + "logits/rejected": -1.3165615797042847, + "logps/chosen": -61.831382751464844, + "logps/rejected": -59.438697814941406, + "loss": 0.2748, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8247978687286377, + "rewards/margins": 0.9132378101348877, + "rewards/rejected": 2.91156005859375, + "step": 11427 + }, + { + "epoch": 1.85, + "learning_rate": 1.3741502989247146e-07, + "logits/chosen": -1.0293384790420532, + "logits/rejected": -1.0756129026412964, + "logps/chosen": -48.212894439697266, + "logps/rejected": -75.43092346191406, + "loss": 0.8189, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9745464324951172, + "rewards/margins": -0.4207066297531128, + "rewards/rejected": 1.39525306224823, + "step": 11428 + }, + { + "epoch": 1.86, + "learning_rate": 1.3710919762235043e-07, + "logits/chosen": -1.1424716711044312, + "logits/rejected": -1.172127604484558, + "logps/chosen": -96.68189239501953, + "logps/rejected": -125.60360717773438, + "loss": 0.7721, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.846621036529541, + "rewards/margins": -1.1922111511230469, + "rewards/rejected": 6.038832187652588, + "step": 11429 + }, + { + "epoch": 1.86, + "learning_rate": 1.368037013319695e-07, + "logits/chosen": -1.3244876861572266, + "logits/rejected": -1.144522786140442, + "logps/chosen": -63.274635314941406, + "logps/rejected": -37.18279266357422, + "loss": 0.3588, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2756569385528564, + "rewards/margins": 0.8224575519561768, + "rewards/rejected": 0.4531993865966797, + "step": 11430 + }, + { + "epoch": 1.86, + "learning_rate": 1.3649854104243798e-07, + "logits/chosen": -0.6452344655990601, + "logits/rejected": -0.6452344655990601, + "logps/chosen": -5.736806392669678, + "logps/rejected": -5.736806392669678, + "loss": 0.4316, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.22671560943126678, + "rewards/margins": 0.0, + "rewards/rejected": 0.22671560943126678, + "step": 11431 + }, + { + "epoch": 1.86, + "learning_rate": 1.3619371677483727e-07, + "logits/chosen": -1.0765864849090576, + "logits/rejected": -1.1120409965515137, + "logps/chosen": -47.682762145996094, + "logps/rejected": -78.17852783203125, + "loss": 0.7364, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8131051063537598, + "rewards/margins": 1.2314308881759644, + "rewards/rejected": 1.5816742181777954, + "step": 11432 + }, + { + "epoch": 1.86, + "learning_rate": 1.3588922855023002e-07, + "logits/chosen": -1.217306137084961, + "logits/rejected": -1.217306137084961, + "logps/chosen": -46.727840423583984, + "logps/rejected": -46.727840423583984, + "loss": 0.3724, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0391788482666016, + "rewards/margins": 0.0, + "rewards/rejected": 2.0391788482666016, + "step": 11433 + }, + { + "epoch": 1.86, + "learning_rate": 1.3558507638965158e-07, + "logits/chosen": -1.6811996698379517, + "logits/rejected": -1.6024819612503052, + "logps/chosen": -65.73381042480469, + "logps/rejected": -82.43805694580078, + "loss": 0.0966, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.1079559326171875, + "rewards/margins": 1.9078407287597656, + "rewards/rejected": 4.200115203857422, + "step": 11434 + }, + { + "epoch": 1.86, + "learning_rate": 1.3528126031411737e-07, + "logits/chosen": -1.3708479404449463, + "logits/rejected": -1.400658369064331, + "logps/chosen": -81.88247680664062, + "logps/rejected": -90.318603515625, + "loss": 0.8249, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.183429002761841, + "rewards/margins": -0.47189784049987793, + "rewards/rejected": 3.6553268432617188, + "step": 11435 + }, + { + "epoch": 1.86, + "learning_rate": 1.349777803446173e-07, + "logits/chosen": -0.8582700490951538, + "logits/rejected": -0.7961395382881165, + "logps/chosen": -68.48475646972656, + "logps/rejected": -73.9834976196289, + "loss": 0.1716, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5272469520568848, + "rewards/margins": 1.0341095924377441, + "rewards/rejected": 1.4931373596191406, + "step": 11436 + }, + { + "epoch": 1.86, + "learning_rate": 1.3467463650212008e-07, + "logits/chosen": -1.0833414793014526, + "logits/rejected": -1.081275224685669, + "logps/chosen": -96.34390258789062, + "logps/rejected": -66.38024139404297, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.398379802703857, + "rewards/margins": 4.230060577392578, + "rewards/rejected": 3.1683189868927, + "step": 11437 + }, + { + "epoch": 1.86, + "learning_rate": 1.3437182880756848e-07, + "logits/chosen": -1.3201227188110352, + "logits/rejected": -1.21614408493042, + "logps/chosen": -52.016944885253906, + "logps/rejected": -62.68913269042969, + "loss": 0.5831, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1262154579162598, + "rewards/margins": -0.38306641578674316, + "rewards/rejected": 3.509281873703003, + "step": 11438 + }, + { + "epoch": 1.86, + "learning_rate": 1.3406935728188519e-07, + "logits/chosen": -1.1233464479446411, + "logits/rejected": -0.9412662982940674, + "logps/chosen": -34.9409065246582, + "logps/rejected": -14.01048469543457, + "loss": 0.7942, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2066951990127563, + "rewards/margins": 0.13497257232666016, + "rewards/rejected": 1.0717226266860962, + "step": 11439 + }, + { + "epoch": 1.86, + "learning_rate": 1.337672219459668e-07, + "logits/chosen": -1.2552473545074463, + "logits/rejected": -1.2088767290115356, + "logps/chosen": -42.149139404296875, + "logps/rejected": -61.53614807128906, + "loss": 0.2094, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.542987108230591, + "rewards/margins": 0.7393585443496704, + "rewards/rejected": 1.8036285638809204, + "step": 11440 + }, + { + "epoch": 1.86, + "learning_rate": 1.334654228206894e-07, + "logits/chosen": -1.1105667352676392, + "logits/rejected": -1.1022759675979614, + "logps/chosen": -57.30004119873047, + "logps/rejected": -39.873146057128906, + "loss": 0.3452, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.214118242263794, + "rewards/margins": 0.10011899471282959, + "rewards/rejected": 1.1139992475509644, + "step": 11441 + }, + { + "epoch": 1.86, + "learning_rate": 1.3316395992690302e-07, + "logits/chosen": -1.2157039642333984, + "logits/rejected": -1.1035690307617188, + "logps/chosen": -49.459205627441406, + "logps/rejected": -16.305744171142578, + "loss": 2.2727, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5160210132598877, + "rewards/margins": 1.2032718658447266, + "rewards/rejected": 0.31274911761283875, + "step": 11442 + }, + { + "epoch": 1.86, + "learning_rate": 1.3286283328543702e-07, + "logits/chosen": -1.3543548583984375, + "logits/rejected": -1.2823100090026855, + "logps/chosen": -186.78660583496094, + "logps/rejected": -153.00302124023438, + "loss": 0.1379, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.4257097244262695, + "rewards/margins": 1.1522140502929688, + "rewards/rejected": 4.273495674133301, + "step": 11443 + }, + { + "epoch": 1.86, + "learning_rate": 1.325620429170954e-07, + "logits/chosen": -0.9295991659164429, + "logits/rejected": -0.8671115040779114, + "logps/chosen": -42.14581298828125, + "logps/rejected": -38.14509963989258, + "loss": 1.7092, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.029404401779175, + "rewards/margins": 0.40832364559173584, + "rewards/rejected": 1.621080756187439, + "step": 11444 + }, + { + "epoch": 1.86, + "learning_rate": 1.3226158884266094e-07, + "logits/chosen": -1.4323536157608032, + "logits/rejected": -1.364580750465393, + "logps/chosen": -56.957130432128906, + "logps/rejected": -17.80582046508789, + "loss": 2.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8711150884628296, + "rewards/margins": 1.2559077739715576, + "rewards/rejected": 0.615207314491272, + "step": 11445 + }, + { + "epoch": 1.86, + "learning_rate": 1.3196147108289148e-07, + "logits/chosen": -1.1071958541870117, + "logits/rejected": -1.1330280303955078, + "logps/chosen": -33.53020095825195, + "logps/rejected": -65.519775390625, + "loss": 0.7523, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.8803943991661072, + "rewards/margins": -0.9499080777168274, + "rewards/rejected": 1.8303024768829346, + "step": 11446 + }, + { + "epoch": 1.86, + "learning_rate": 1.316616896585232e-07, + "logits/chosen": -1.4407964944839478, + "logits/rejected": -1.3914752006530762, + "logps/chosen": -115.3297119140625, + "logps/rejected": -127.62257385253906, + "loss": 0.0848, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.48034954071045, + "rewards/margins": 1.712937831878662, + "rewards/rejected": 6.767411708831787, + "step": 11447 + }, + { + "epoch": 1.86, + "learning_rate": 1.3136224459026681e-07, + "logits/chosen": -1.201686978340149, + "logits/rejected": -1.0869896411895752, + "logps/chosen": -45.7412109375, + "logps/rejected": -49.367881774902344, + "loss": 2.0331, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1338372230529785, + "rewards/margins": -1.5751242637634277, + "rewards/rejected": 3.7089614868164062, + "step": 11448 + }, + { + "epoch": 1.86, + "learning_rate": 1.310631358988118e-07, + "logits/chosen": -1.3653863668441772, + "logits/rejected": -1.3699887990951538, + "logps/chosen": -5.646286964416504, + "logps/rejected": -1.369464635848999, + "loss": 1.0612, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.11179542541503906, + "rewards/margins": -0.2946518063545227, + "rewards/rejected": 0.40644723176956177, + "step": 11449 + }, + { + "epoch": 1.86, + "learning_rate": 1.3076436360482391e-07, + "logits/chosen": -1.3452680110931396, + "logits/rejected": -1.2733484506607056, + "logps/chosen": -52.763824462890625, + "logps/rejected": -50.28736114501953, + "loss": 0.2713, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2804527282714844, + "rewards/margins": 1.7038047313690186, + "rewards/rejected": 1.5766479969024658, + "step": 11450 + }, + { + "epoch": 1.86, + "learning_rate": 1.3046592772894551e-07, + "logits/chosen": -1.179809808731079, + "logits/rejected": -1.2110192775726318, + "logps/chosen": -18.534595489501953, + "logps/rejected": -31.280065536499023, + "loss": 0.519, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.107927680015564, + "rewards/margins": 0.29520851373672485, + "rewards/rejected": 0.8127191662788391, + "step": 11451 + }, + { + "epoch": 1.86, + "learning_rate": 1.3016782829179564e-07, + "logits/chosen": -1.2360782623291016, + "logits/rejected": -1.2289597988128662, + "logps/chosen": -45.71410369873047, + "logps/rejected": -55.788108825683594, + "loss": 0.733, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.143353223800659, + "rewards/margins": 1.3412772417068481, + "rewards/rejected": 1.802075982093811, + "step": 11452 + }, + { + "epoch": 1.86, + "learning_rate": 1.298700653139695e-07, + "logits/chosen": -1.0637147426605225, + "logits/rejected": -1.060531497001648, + "logps/chosen": -3.895683765411377, + "logps/rejected": -12.5840425491333, + "loss": 0.5226, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.544574499130249, + "rewards/margins": -0.2957131862640381, + "rewards/rejected": 0.8402876853942871, + "step": 11453 + }, + { + "epoch": 1.86, + "learning_rate": 1.295726388160412e-07, + "logits/chosen": -1.0088320970535278, + "logits/rejected": -0.9808770418167114, + "logps/chosen": -56.15962600708008, + "logps/rejected": -52.39929962158203, + "loss": 0.4342, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3617695569992065, + "rewards/margins": -0.2827892303466797, + "rewards/rejected": 1.6445587873458862, + "step": 11454 + }, + { + "epoch": 1.86, + "learning_rate": 1.292755488185582e-07, + "logits/chosen": -1.3385860919952393, + "logits/rejected": -1.4105037450790405, + "logps/chosen": -53.43476104736328, + "logps/rejected": -55.42947006225586, + "loss": 1.0863, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4355523586273193, + "rewards/margins": -1.6762683391571045, + "rewards/rejected": 5.111820697784424, + "step": 11455 + }, + { + "epoch": 1.86, + "learning_rate": 1.2897879534204848e-07, + "logits/chosen": -1.4412213563919067, + "logits/rejected": -1.4247288703918457, + "logps/chosen": -75.14759826660156, + "logps/rejected": -58.804664611816406, + "loss": 0.2708, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.150076389312744, + "rewards/margins": 0.3725060224533081, + "rewards/rejected": 1.777570366859436, + "step": 11456 + }, + { + "epoch": 1.86, + "learning_rate": 1.2868237840701347e-07, + "logits/chosen": -1.2471712827682495, + "logits/rejected": -0.947018027305603, + "logps/chosen": -83.01241302490234, + "logps/rejected": -40.180274963378906, + "loss": 0.2349, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.855212450027466, + "rewards/margins": 0.5297667980194092, + "rewards/rejected": 2.3254456520080566, + "step": 11457 + }, + { + "epoch": 1.86, + "learning_rate": 1.2838629803393343e-07, + "logits/chosen": -1.6689373254776, + "logits/rejected": -1.6838423013687134, + "logps/chosen": -62.592159271240234, + "logps/rejected": -60.40786361694336, + "loss": 0.4212, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7708148956298828, + "rewards/margins": -0.08393096923828125, + "rewards/rejected": 1.854745864868164, + "step": 11458 + }, + { + "epoch": 1.86, + "learning_rate": 1.280905542432642e-07, + "logits/chosen": -1.1393349170684814, + "logits/rejected": -1.0887176990509033, + "logps/chosen": -101.22257995605469, + "logps/rejected": -62.479942321777344, + "loss": 0.3944, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1656067371368408, + "rewards/margins": -0.13363564014434814, + "rewards/rejected": 1.299242377281189, + "step": 11459 + }, + { + "epoch": 1.86, + "learning_rate": 1.277951470554395e-07, + "logits/chosen": -1.6282687187194824, + "logits/rejected": -1.5539473295211792, + "logps/chosen": -117.64324951171875, + "logps/rejected": -93.25785827636719, + "loss": 0.0787, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.651019096374512, + "rewards/margins": 2.385444164276123, + "rewards/rejected": 6.265574932098389, + "step": 11460 + }, + { + "epoch": 1.86, + "learning_rate": 1.275000764908685e-07, + "logits/chosen": -1.5665923357009888, + "logits/rejected": -1.295719861984253, + "logps/chosen": -204.50572204589844, + "logps/rejected": -69.20130920410156, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.744148254394531, + "rewards/margins": 4.69183349609375, + "rewards/rejected": 5.052314758300781, + "step": 11461 + }, + { + "epoch": 1.86, + "learning_rate": 1.2720534256993877e-07, + "logits/chosen": -1.516445517539978, + "logits/rejected": -1.4106309413909912, + "logps/chosen": -103.59236145019531, + "logps/rejected": -29.482555389404297, + "loss": 0.3184, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.193732500076294, + "rewards/margins": 0.37561309337615967, + "rewards/rejected": 1.8181194067001343, + "step": 11462 + }, + { + "epoch": 1.86, + "learning_rate": 1.2691094531301296e-07, + "logits/chosen": -1.1520098447799683, + "logits/rejected": -1.1210544109344482, + "logps/chosen": -33.90597152709961, + "logps/rejected": -50.66380310058594, + "loss": 0.7492, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3899877071380615, + "rewards/margins": 0.33891868591308594, + "rewards/rejected": 2.0510690212249756, + "step": 11463 + }, + { + "epoch": 1.86, + "learning_rate": 1.2661688474043145e-07, + "logits/chosen": -0.926160991191864, + "logits/rejected": -0.8541653752326965, + "logps/chosen": -96.7146987915039, + "logps/rejected": -50.0103645324707, + "loss": 0.6754, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.531491994857788, + "rewards/margins": -0.19582700729370117, + "rewards/rejected": 2.7273190021514893, + "step": 11464 + }, + { + "epoch": 1.86, + "learning_rate": 1.2632316087251017e-07, + "logits/chosen": -1.2533917427062988, + "logits/rejected": -1.3169879913330078, + "logps/chosen": -14.69904613494873, + "logps/rejected": -28.306608200073242, + "loss": 1.497, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4362977743148804, + "rewards/margins": -2.441455841064453, + "rewards/rejected": 3.877753496170044, + "step": 11465 + }, + { + "epoch": 1.86, + "learning_rate": 1.2602977372954405e-07, + "logits/chosen": -1.3024909496307373, + "logits/rejected": -1.5441826581954956, + "logps/chosen": -61.06187438964844, + "logps/rejected": -37.759521484375, + "loss": 0.0996, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2794272899627686, + "rewards/margins": 2.0872890949249268, + "rewards/rejected": 0.19213829934597015, + "step": 11466 + }, + { + "epoch": 1.86, + "learning_rate": 1.2573672333180187e-07, + "logits/chosen": -1.122033953666687, + "logits/rejected": -1.2483800649642944, + "logps/chosen": -82.4900894165039, + "logps/rejected": -165.45455932617188, + "loss": 2.3313, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1759896278381348, + "rewards/margins": -4.634035587310791, + "rewards/rejected": 7.810025215148926, + "step": 11467 + }, + { + "epoch": 1.86, + "learning_rate": 1.2544400969953185e-07, + "logits/chosen": -0.9965231418609619, + "logits/rejected": -0.8906459808349609, + "logps/chosen": -31.664146423339844, + "logps/rejected": -50.38478088378906, + "loss": 0.9741, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8125572204589844, + "rewards/margins": -0.919365644454956, + "rewards/rejected": 2.7319228649139404, + "step": 11468 + }, + { + "epoch": 1.86, + "learning_rate": 1.2515163285295672e-07, + "logits/chosen": -1.4288345575332642, + "logits/rejected": -1.4089304208755493, + "logps/chosen": -87.47134399414062, + "logps/rejected": -71.70642852783203, + "loss": 0.6328, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9846184253692627, + "rewards/margins": -0.9290084838867188, + "rewards/rejected": 2.9136269092559814, + "step": 11469 + }, + { + "epoch": 1.86, + "learning_rate": 1.2485959281227865e-07, + "logits/chosen": -1.2587260007858276, + "logits/rejected": -1.2816896438598633, + "logps/chosen": -84.27139282226562, + "logps/rejected": -63.67678451538086, + "loss": 0.343, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2443695068359375, + "rewards/margins": 1.1397120952606201, + "rewards/rejected": 1.1046574115753174, + "step": 11470 + }, + { + "epoch": 1.86, + "learning_rate": 1.2456788959767264e-07, + "logits/chosen": -1.4997302293777466, + "logits/rejected": -1.5246577262878418, + "logps/chosen": -90.92279815673828, + "logps/rejected": -73.17499542236328, + "loss": 0.2294, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.463696241378784, + "rewards/margins": 2.1028151512145996, + "rewards/rejected": 1.3608810901641846, + "step": 11471 + }, + { + "epoch": 1.86, + "learning_rate": 1.2427652322929419e-07, + "logits/chosen": -1.683771014213562, + "logits/rejected": -1.607800006866455, + "logps/chosen": -90.23727416992188, + "logps/rejected": -75.80780029296875, + "loss": 0.6148, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5931785106658936, + "rewards/margins": -0.12436914443969727, + "rewards/rejected": 2.717547655105591, + "step": 11472 + }, + { + "epoch": 1.86, + "learning_rate": 1.239854937272733e-07, + "logits/chosen": -1.2733192443847656, + "logits/rejected": -1.2680658102035522, + "logps/chosen": -78.14535522460938, + "logps/rejected": -93.95712280273438, + "loss": 0.5531, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8796981573104858, + "rewards/margins": 1.1936187744140625, + "rewards/rejected": 0.6860794425010681, + "step": 11473 + }, + { + "epoch": 1.86, + "learning_rate": 1.2369480111171784e-07, + "logits/chosen": -1.2355984449386597, + "logits/rejected": -1.2221225500106812, + "logps/chosen": -55.618431091308594, + "logps/rejected": -75.4568099975586, + "loss": 1.754, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.340993642807007, + "rewards/margins": -0.6981704235076904, + "rewards/rejected": 4.039164066314697, + "step": 11474 + }, + { + "epoch": 1.86, + "learning_rate": 1.2340444540271113e-07, + "logits/chosen": -1.4333871603012085, + "logits/rejected": -1.3765095472335815, + "logps/chosen": -185.94088745117188, + "logps/rejected": -64.13423156738281, + "loss": 0.1165, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.013978481292725, + "rewards/margins": 1.347346305847168, + "rewards/rejected": 4.666632175445557, + "step": 11475 + }, + { + "epoch": 1.86, + "learning_rate": 1.2311442662031493e-07, + "logits/chosen": -1.3827489614486694, + "logits/rejected": -1.2889817953109741, + "logps/chosen": -74.75196838378906, + "logps/rejected": -53.45264434814453, + "loss": 0.6422, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.469067573547363, + "rewards/margins": 0.49909162521362305, + "rewards/rejected": 4.96997594833374, + "step": 11476 + }, + { + "epoch": 1.86, + "learning_rate": 1.2282474478456542e-07, + "logits/chosen": -1.1483232975006104, + "logits/rejected": -1.1386370658874512, + "logps/chosen": -54.381736755371094, + "logps/rejected": -76.49791717529297, + "loss": 1.2415, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4419662952423096, + "rewards/margins": 0.558178722858429, + "rewards/rejected": 0.8837875723838806, + "step": 11477 + }, + { + "epoch": 1.86, + "learning_rate": 1.225353999154788e-07, + "logits/chosen": -1.260223150253296, + "logits/rejected": -1.246307134628296, + "logps/chosen": -39.042198181152344, + "logps/rejected": -13.282479286193848, + "loss": 0.3283, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.030434489250183, + "rewards/margins": 0.08116394281387329, + "rewards/rejected": 0.9492705464363098, + "step": 11478 + }, + { + "epoch": 1.86, + "learning_rate": 1.2224639203304468e-07, + "logits/chosen": -1.2139592170715332, + "logits/rejected": -1.1607009172439575, + "logps/chosen": -43.75156784057617, + "logps/rejected": -40.941123962402344, + "loss": 0.385, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.122490406036377, + "rewards/margins": -0.1417696475982666, + "rewards/rejected": 3.2642600536346436, + "step": 11479 + }, + { + "epoch": 1.86, + "learning_rate": 1.2195772115723148e-07, + "logits/chosen": -1.2374340295791626, + "logits/rejected": -1.2312679290771484, + "logps/chosen": -54.40773391723633, + "logps/rejected": -54.807743072509766, + "loss": 0.2027, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5555294752120972, + "rewards/margins": 0.6935238242149353, + "rewards/rejected": 0.8620056509971619, + "step": 11480 + }, + { + "epoch": 1.86, + "learning_rate": 1.2166938730798272e-07, + "logits/chosen": -1.4075461626052856, + "logits/rejected": -1.2762993574142456, + "logps/chosen": -157.67068481445312, + "logps/rejected": -17.42141342163086, + "loss": 1.4217, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.854361057281494, + "rewards/margins": 2.7208454608917236, + "rewards/rejected": 0.13351555168628693, + "step": 11481 + }, + { + "epoch": 1.86, + "learning_rate": 1.2138139050522024e-07, + "logits/chosen": -0.8955307006835938, + "logits/rejected": -1.0330877304077148, + "logps/chosen": -9.081109046936035, + "logps/rejected": -61.919227600097656, + "loss": 3.4644, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.35842496156692505, + "rewards/margins": -5.408775329589844, + "rewards/rejected": 5.767200469970703, + "step": 11482 + }, + { + "epoch": 1.86, + "learning_rate": 1.2109373076884144e-07, + "logits/chosen": -1.3787639141082764, + "logits/rejected": -1.339616060256958, + "logps/chosen": -98.46926879882812, + "logps/rejected": -57.77162551879883, + "loss": 0.4733, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0289809703826904, + "rewards/margins": 0.7820444107055664, + "rewards/rejected": 2.246936559677124, + "step": 11483 + }, + { + "epoch": 1.86, + "learning_rate": 1.2080640811872157e-07, + "logits/chosen": -1.3814561367034912, + "logits/rejected": -1.3476333618164062, + "logps/chosen": -65.278076171875, + "logps/rejected": -54.25670623779297, + "loss": 0.0991, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.176867723464966, + "rewards/margins": 1.761232852935791, + "rewards/rejected": 0.4156349301338196, + "step": 11484 + }, + { + "epoch": 1.86, + "learning_rate": 1.2051942257471195e-07, + "logits/chosen": -1.4758553504943848, + "logits/rejected": -1.5195512771606445, + "logps/chosen": -188.13992309570312, + "logps/rejected": -128.39820861816406, + "loss": 0.6258, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.412747383117676, + "rewards/margins": 1.5783309936523438, + "rewards/rejected": 6.834416389465332, + "step": 11485 + }, + { + "epoch": 1.86, + "learning_rate": 1.2023277415663946e-07, + "logits/chosen": -1.4899476766586304, + "logits/rejected": -1.501050353050232, + "logps/chosen": -44.76216125488281, + "logps/rejected": -78.65157318115234, + "loss": 0.5712, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.030352830886841, + "rewards/margins": -0.6990485191345215, + "rewards/rejected": 2.7294013500213623, + "step": 11486 + }, + { + "epoch": 1.86, + "learning_rate": 1.1994646288431e-07, + "logits/chosen": -1.2588127851486206, + "logits/rejected": -1.267068862915039, + "logps/chosen": -60.88873291015625, + "logps/rejected": -80.36457824707031, + "loss": 0.4019, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4930237531661987, + "rewards/margins": -0.12038874626159668, + "rewards/rejected": 1.6134124994277954, + "step": 11487 + }, + { + "epoch": 1.86, + "learning_rate": 1.1966048877750435e-07, + "logits/chosen": -1.0430997610092163, + "logits/rejected": -1.0773383378982544, + "logps/chosen": -47.92115783691406, + "logps/rejected": -171.3373565673828, + "loss": 0.1563, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1376113891601562, + "rewards/margins": 1.248083472251892, + "rewards/rejected": 0.8895279169082642, + "step": 11488 + }, + { + "epoch": 1.86, + "learning_rate": 1.1937485185598064e-07, + "logits/chosen": -0.9424432516098022, + "logits/rejected": -0.9467095136642456, + "logps/chosen": -4.141108512878418, + "logps/rejected": -1.7219595909118652, + "loss": 0.4236, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3681415021419525, + "rewards/margins": -0.2136978805065155, + "rewards/rejected": 0.581839382648468, + "step": 11489 + }, + { + "epoch": 1.86, + "learning_rate": 1.190895521394736e-07, + "logits/chosen": -1.3248926401138306, + "logits/rejected": -1.2401232719421387, + "logps/chosen": -102.83750915527344, + "logps/rejected": -53.15534591674805, + "loss": 0.7412, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.611554145812988, + "rewards/margins": 4.223729133605957, + "rewards/rejected": 2.387824773788452, + "step": 11490 + }, + { + "epoch": 1.87, + "learning_rate": 1.1880458964769526e-07, + "logits/chosen": -1.5213602781295776, + "logits/rejected": -1.476060390472412, + "logps/chosen": -78.76219177246094, + "logps/rejected": -35.88956069946289, + "loss": 0.6625, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8034820556640625, + "rewards/margins": 1.6042903661727905, + "rewards/rejected": 0.19919167459011078, + "step": 11491 + }, + { + "epoch": 1.87, + "learning_rate": 1.185199644003332e-07, + "logits/chosen": -1.6008356809616089, + "logits/rejected": -1.6008356809616089, + "logps/chosen": -49.027896881103516, + "logps/rejected": -49.027896881103516, + "loss": 0.714, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5371463298797607, + "rewards/margins": 0.0, + "rewards/rejected": 2.5371463298797607, + "step": 11492 + }, + { + "epoch": 1.87, + "learning_rate": 1.1823567641705281e-07, + "logits/chosen": -1.3598369359970093, + "logits/rejected": -1.4758970737457275, + "logps/chosen": -51.128570556640625, + "logps/rejected": -91.61921691894531, + "loss": 1.0622, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9222915172576904, + "rewards/margins": -1.9937455654144287, + "rewards/rejected": 4.916037082672119, + "step": 11493 + }, + { + "epoch": 1.87, + "learning_rate": 1.1795172571749503e-07, + "logits/chosen": -1.395179271697998, + "logits/rejected": -1.4026405811309814, + "logps/chosen": -85.39894104003906, + "logps/rejected": -65.43207550048828, + "loss": 0.3314, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.675602912902832, + "rewards/margins": 0.13473272323608398, + "rewards/rejected": 5.540870189666748, + "step": 11494 + }, + { + "epoch": 1.87, + "learning_rate": 1.1766811232127917e-07, + "logits/chosen": -1.2470589876174927, + "logits/rejected": -1.22127366065979, + "logps/chosen": -92.9422836303711, + "logps/rejected": -56.10971450805664, + "loss": 0.6924, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3558318614959717, + "rewards/margins": -0.22536277770996094, + "rewards/rejected": 3.5811946392059326, + "step": 11495 + }, + { + "epoch": 1.87, + "learning_rate": 1.1738483624799956e-07, + "logits/chosen": -0.9737900495529175, + "logits/rejected": -0.9737900495529175, + "logps/chosen": -39.57121276855469, + "logps/rejected": -39.57121276855469, + "loss": 0.8514, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.573004126548767, + "rewards/margins": 0.0, + "rewards/rejected": 1.573004126548767, + "step": 11496 + }, + { + "epoch": 1.87, + "learning_rate": 1.1710189751722777e-07, + "logits/chosen": -1.0206270217895508, + "logits/rejected": -1.0681025981903076, + "logps/chosen": -16.923479080200195, + "logps/rejected": -44.653167724609375, + "loss": 1.0769, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.863764226436615, + "rewards/margins": -1.8863334655761719, + "rewards/rejected": 2.7500977516174316, + "step": 11497 + }, + { + "epoch": 1.87, + "learning_rate": 1.1681929614851261e-07, + "logits/chosen": -0.7342500686645508, + "logits/rejected": -0.7518212199211121, + "logps/chosen": -2.534402847290039, + "logps/rejected": -23.773452758789062, + "loss": 0.7354, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.12418723106384277, + "rewards/margins": -0.9990373849868774, + "rewards/rejected": 1.1232246160507202, + "step": 11498 + }, + { + "epoch": 1.87, + "learning_rate": 1.1653703216137957e-07, + "logits/chosen": -0.9485626220703125, + "logits/rejected": -0.9521217346191406, + "logps/chosen": -18.775598526000977, + "logps/rejected": -26.017131805419922, + "loss": 0.3788, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5949918627738953, + "rewards/margins": 0.03013819456100464, + "rewards/rejected": 0.5648536682128906, + "step": 11499 + }, + { + "epoch": 1.87, + "learning_rate": 1.1625510557532916e-07, + "logits/chosen": -1.4780102968215942, + "logits/rejected": -1.5607854127883911, + "logps/chosen": -98.76683044433594, + "logps/rejected": -111.9581298828125, + "loss": 0.7089, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.916368246078491, + "rewards/margins": -1.1399152278900146, + "rewards/rejected": 4.056283473968506, + "step": 11500 + }, + { + "epoch": 1.87, + "learning_rate": 1.1597351640984078e-07, + "logits/chosen": -1.3306169509887695, + "logits/rejected": -1.3558138608932495, + "logps/chosen": -153.60130310058594, + "logps/rejected": -96.02134704589844, + "loss": 0.044, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.070703029632568, + "rewards/margins": 2.916445732116699, + "rewards/rejected": 2.154257297515869, + "step": 11501 + }, + { + "epoch": 1.87, + "learning_rate": 1.1569226468436889e-07, + "logits/chosen": -1.4861184358596802, + "logits/rejected": -1.4234950542449951, + "logps/chosen": -48.1815185546875, + "logps/rejected": -42.1512336730957, + "loss": 0.2166, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9350197315216064, + "rewards/margins": 0.7005259990692139, + "rewards/rejected": 2.2344937324523926, + "step": 11502 + }, + { + "epoch": 1.87, + "learning_rate": 1.1541135041834628e-07, + "logits/chosen": -1.1554477214813232, + "logits/rejected": -1.120535135269165, + "logps/chosen": -102.7374496459961, + "logps/rejected": -70.63294982910156, + "loss": 0.1457, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.078127384185791, + "rewards/margins": 1.2679557800292969, + "rewards/rejected": 4.810171604156494, + "step": 11503 + }, + { + "epoch": 1.87, + "learning_rate": 1.1513077363118075e-07, + "logits/chosen": -1.4841439723968506, + "logits/rejected": -1.3510035276412964, + "logps/chosen": -84.85693359375, + "logps/rejected": -61.81902313232422, + "loss": 0.0788, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.982740879058838, + "rewards/margins": 1.876340627670288, + "rewards/rejected": 3.10640025138855, + "step": 11504 + }, + { + "epoch": 1.87, + "learning_rate": 1.1485053434225791e-07, + "logits/chosen": -1.2746754884719849, + "logits/rejected": -1.175727367401123, + "logps/chosen": -83.2083969116211, + "logps/rejected": -57.56458282470703, + "loss": 0.3208, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8036720752716064, + "rewards/margins": 1.5734268426895142, + "rewards/rejected": 1.2302452325820923, + "step": 11505 + }, + { + "epoch": 1.87, + "learning_rate": 1.1457063257093892e-07, + "logits/chosen": -1.217964768409729, + "logits/rejected": -1.2577146291732788, + "logps/chosen": -61.35963821411133, + "logps/rejected": -83.2059555053711, + "loss": 0.7106, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.68298077583313, + "rewards/margins": -1.0448932647705078, + "rewards/rejected": 3.7278740406036377, + "step": 11506 + }, + { + "epoch": 1.87, + "learning_rate": 1.1429106833656334e-07, + "logits/chosen": -1.079656720161438, + "logits/rejected": -1.0387543439865112, + "logps/chosen": -60.454532623291016, + "logps/rejected": -58.145294189453125, + "loss": 0.8705, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0762691497802734, + "rewards/margins": -0.6159985065460205, + "rewards/rejected": 1.692267656326294, + "step": 11507 + }, + { + "epoch": 1.87, + "learning_rate": 1.140118416584457e-07, + "logits/chosen": -1.259734034538269, + "logits/rejected": -1.2684868574142456, + "logps/chosen": -93.7733154296875, + "logps/rejected": -96.63786315917969, + "loss": 1.3954, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7738120555877686, + "rewards/margins": 0.049115657806396484, + "rewards/rejected": 2.724696397781372, + "step": 11508 + }, + { + "epoch": 1.87, + "learning_rate": 1.1373295255587836e-07, + "logits/chosen": -1.0367087125778198, + "logits/rejected": -0.9403846263885498, + "logps/chosen": -83.3177490234375, + "logps/rejected": -70.82609558105469, + "loss": 0.3374, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.31915283203125, + "rewards/margins": 0.2823660373687744, + "rewards/rejected": 2.0367867946624756, + "step": 11509 + }, + { + "epoch": 1.87, + "learning_rate": 1.1345440104812977e-07, + "logits/chosen": -1.1538639068603516, + "logits/rejected": -1.0882936716079712, + "logps/chosen": -54.573265075683594, + "logps/rejected": -71.48731994628906, + "loss": 0.359, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4010307788848877, + "rewards/margins": -0.04396367073059082, + "rewards/rejected": 3.4449944496154785, + "step": 11510 + }, + { + "epoch": 1.87, + "learning_rate": 1.1317618715444511e-07, + "logits/chosen": -1.1844923496246338, + "logits/rejected": -1.1735941171646118, + "logps/chosen": -214.3761749267578, + "logps/rejected": -118.64590454101562, + "loss": 0.0677, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.997642517089844, + "rewards/margins": 2.508976697921753, + "rewards/rejected": 3.488665819168091, + "step": 11511 + }, + { + "epoch": 1.87, + "learning_rate": 1.1289831089404568e-07, + "logits/chosen": -1.059683918952942, + "logits/rejected": -1.0396195650100708, + "logps/chosen": -29.556079864501953, + "logps/rejected": -12.138269424438477, + "loss": 0.5127, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2343113422393799, + "rewards/margins": 0.1360863447189331, + "rewards/rejected": 1.0982249975204468, + "step": 11512 + }, + { + "epoch": 1.87, + "learning_rate": 1.1262077228613167e-07, + "logits/chosen": -0.8331377506256104, + "logits/rejected": -0.6501589417457581, + "logps/chosen": -84.27334594726562, + "logps/rejected": -12.42313003540039, + "loss": 0.8401, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9472122192382812, + "rewards/margins": 1.104954481124878, + "rewards/rejected": 0.8422576785087585, + "step": 11513 + }, + { + "epoch": 1.87, + "learning_rate": 1.1234357134987717e-07, + "logits/chosen": -1.1531093120574951, + "logits/rejected": -1.2137019634246826, + "logps/chosen": -55.3365478515625, + "logps/rejected": -122.03192138671875, + "loss": 0.5696, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.094741106033325, + "rewards/margins": -0.2893638610839844, + "rewards/rejected": 2.3841049671173096, + "step": 11514 + }, + { + "epoch": 1.87, + "learning_rate": 1.1206670810443466e-07, + "logits/chosen": -1.4995793104171753, + "logits/rejected": -1.5267493724822998, + "logps/chosen": -102.72113037109375, + "logps/rejected": -94.50688171386719, + "loss": 0.8593, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9299042224884033, + "rewards/margins": -1.3361403942108154, + "rewards/rejected": 4.266044616699219, + "step": 11515 + }, + { + "epoch": 1.87, + "learning_rate": 1.1179018256893215e-07, + "logits/chosen": -1.1535297632217407, + "logits/rejected": -0.9972363710403442, + "logps/chosen": -118.80626678466797, + "logps/rejected": -62.929134368896484, + "loss": 0.0456, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.442276954650879, + "rewards/margins": 3.763003349304199, + "rewards/rejected": 1.6792736053466797, + "step": 11516 + }, + { + "epoch": 1.87, + "learning_rate": 1.1151399476247548e-07, + "logits/chosen": -1.455232858657837, + "logits/rejected": -1.5087954998016357, + "logps/chosen": -49.298500061035156, + "logps/rejected": -91.23385620117188, + "loss": 0.3634, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9263978004455566, + "rewards/margins": 0.1525590419769287, + "rewards/rejected": 2.773838758468628, + "step": 11517 + }, + { + "epoch": 1.87, + "learning_rate": 1.1123814470414607e-07, + "logits/chosen": -1.1711094379425049, + "logits/rejected": -1.1709972620010376, + "logps/chosen": -7.129422664642334, + "logps/rejected": -3.1296768188476562, + "loss": 0.4108, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48742565512657166, + "rewards/margins": 0.003667384386062622, + "rewards/rejected": 0.48375827074050903, + "step": 11518 + }, + { + "epoch": 1.87, + "learning_rate": 1.1096263241300364e-07, + "logits/chosen": -1.492471694946289, + "logits/rejected": -1.3648687601089478, + "logps/chosen": -165.9637908935547, + "logps/rejected": -104.43521881103516, + "loss": 0.3111, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.528886318206787, + "rewards/margins": 1.6299092769622803, + "rewards/rejected": 2.898977041244507, + "step": 11519 + }, + { + "epoch": 1.87, + "learning_rate": 1.1068745790808244e-07, + "logits/chosen": -1.0904531478881836, + "logits/rejected": -1.0045033693313599, + "logps/chosen": -72.92092895507812, + "logps/rejected": -36.513118743896484, + "loss": 0.1856, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3434510231018066, + "rewards/margins": 2.664386749267578, + "rewards/rejected": -0.32093581557273865, + "step": 11520 + }, + { + "epoch": 1.87, + "learning_rate": 1.1041262120839503e-07, + "logits/chosen": -1.3056718111038208, + "logits/rejected": -1.3133784532546997, + "logps/chosen": -29.956283569335938, + "logps/rejected": -70.38458251953125, + "loss": 0.2394, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.12680983543396, + "rewards/margins": 0.82700514793396, + "rewards/rejected": 1.2998046875, + "step": 11521 + }, + { + "epoch": 1.87, + "learning_rate": 1.1013812233293008e-07, + "logits/chosen": -1.1242629289627075, + "logits/rejected": -1.1201441287994385, + "logps/chosen": -75.14761352539062, + "logps/rejected": -80.64149475097656, + "loss": 0.5183, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.051203966140747, + "rewards/margins": 0.8013038635253906, + "rewards/rejected": 1.2499001026153564, + "step": 11522 + }, + { + "epoch": 1.87, + "learning_rate": 1.0986396130065247e-07, + "logits/chosen": -1.3809700012207031, + "logits/rejected": -0.9414090514183044, + "logps/chosen": -162.5571746826172, + "logps/rejected": -21.405616760253906, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.434413433074951, + "rewards/margins": 5.162678241729736, + "rewards/rejected": 0.27173519134521484, + "step": 11523 + }, + { + "epoch": 1.87, + "learning_rate": 1.0959013813050368e-07, + "logits/chosen": -1.1817665100097656, + "logits/rejected": -1.074369192123413, + "logps/chosen": -93.66716003417969, + "logps/rejected": -93.6982192993164, + "loss": 0.0646, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.7712297439575195, + "rewards/margins": 2.0052895545959473, + "rewards/rejected": 3.7659401893615723, + "step": 11524 + }, + { + "epoch": 1.87, + "learning_rate": 1.0931665284140303e-07, + "logits/chosen": -1.4915331602096558, + "logits/rejected": -1.2541614770889282, + "logps/chosen": -140.61277770996094, + "logps/rejected": -44.0526237487793, + "loss": 0.6075, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.245923042297363, + "rewards/margins": 3.1512556076049805, + "rewards/rejected": 2.094667434692383, + "step": 11525 + }, + { + "epoch": 1.87, + "learning_rate": 1.0904350545224596e-07, + "logits/chosen": -0.6455888748168945, + "logits/rejected": -0.6455888748168945, + "logps/chosen": -8.748785018920898, + "logps/rejected": -8.748785018920898, + "loss": 0.4589, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5419067740440369, + "rewards/margins": 0.0, + "rewards/rejected": 0.5419067740440369, + "step": 11526 + }, + { + "epoch": 1.87, + "learning_rate": 1.0877069598190348e-07, + "logits/chosen": -0.9695401191711426, + "logits/rejected": -0.9695401191711426, + "logps/chosen": -1.227540135383606, + "logps/rejected": -1.227540135383606, + "loss": 0.3584, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.17093525826931, + "rewards/margins": 0.0, + "rewards/rejected": 0.17093525826931, + "step": 11527 + }, + { + "epoch": 1.87, + "learning_rate": 1.0849822444922553e-07, + "logits/chosen": -1.2412047386169434, + "logits/rejected": -1.2315633296966553, + "logps/chosen": -53.07859802246094, + "logps/rejected": -89.68675231933594, + "loss": 0.2708, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.079646348953247, + "rewards/margins": 0.37887656688690186, + "rewards/rejected": 1.7007697820663452, + "step": 11528 + }, + { + "epoch": 1.87, + "learning_rate": 1.0822609087303538e-07, + "logits/chosen": -1.2122851610183716, + "logits/rejected": -1.2122851610183716, + "logps/chosen": -62.174705505371094, + "logps/rejected": -62.174705505371094, + "loss": 0.412, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.568258762359619, + "rewards/margins": 0.0, + "rewards/rejected": 2.568258762359619, + "step": 11529 + }, + { + "epoch": 1.87, + "learning_rate": 1.0795429527213685e-07, + "logits/chosen": -1.2490191459655762, + "logits/rejected": -1.2208858728408813, + "logps/chosen": -72.69713592529297, + "logps/rejected": -95.1022720336914, + "loss": 0.2192, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9950240850448608, + "rewards/margins": 1.209812879562378, + "rewards/rejected": 0.7852112054824829, + "step": 11530 + }, + { + "epoch": 1.87, + "learning_rate": 1.0768283766530662e-07, + "logits/chosen": -1.3494166135787964, + "logits/rejected": -1.3465296030044556, + "logps/chosen": -110.39757537841797, + "logps/rejected": -97.14127349853516, + "loss": 0.2365, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9938499927520752, + "rewards/margins": 0.5323364734649658, + "rewards/rejected": 1.4615135192871094, + "step": 11531 + }, + { + "epoch": 1.87, + "learning_rate": 1.074117180713019e-07, + "logits/chosen": -1.3240032196044922, + "logits/rejected": -1.342621088027954, + "logps/chosen": -49.75944519042969, + "logps/rejected": -41.51506805419922, + "loss": 1.3951, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8298568725585938, + "rewards/margins": -0.047269582748413086, + "rewards/rejected": 2.877126455307007, + "step": 11532 + }, + { + "epoch": 1.87, + "learning_rate": 1.0714093650885216e-07, + "logits/chosen": -1.5388444662094116, + "logits/rejected": -1.508246898651123, + "logps/chosen": -69.03350830078125, + "logps/rejected": -76.78538513183594, + "loss": 1.802, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9260757565498352, + "rewards/margins": -0.44591373205184937, + "rewards/rejected": 1.3719894886016846, + "step": 11533 + }, + { + "epoch": 1.87, + "learning_rate": 1.0687049299666796e-07, + "logits/chosen": -0.9558917284011841, + "logits/rejected": -0.9202970862388611, + "logps/chosen": -62.59600067138672, + "logps/rejected": -40.44643020629883, + "loss": 0.1839, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7965049743652344, + "rewards/margins": 1.1578937768936157, + "rewards/rejected": 1.6386111974716187, + "step": 11534 + }, + { + "epoch": 1.87, + "learning_rate": 1.0660038755343272e-07, + "logits/chosen": -1.3627814054489136, + "logits/rejected": -1.2644007205963135, + "logps/chosen": -49.12271499633789, + "logps/rejected": -33.0222053527832, + "loss": 0.2405, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3293673992156982, + "rewards/margins": 2.1292333602905273, + "rewards/rejected": 1.2001339197158813, + "step": 11535 + }, + { + "epoch": 1.87, + "learning_rate": 1.0633062019780981e-07, + "logits/chosen": -1.5669186115264893, + "logits/rejected": -1.4914991855621338, + "logps/chosen": -66.14093780517578, + "logps/rejected": -81.77459716796875, + "loss": 0.2321, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.056670665740967, + "rewards/margins": 2.282451868057251, + "rewards/rejected": 2.774218797683716, + "step": 11536 + }, + { + "epoch": 1.87, + "learning_rate": 1.0606119094843603e-07, + "logits/chosen": -1.2267258167266846, + "logits/rejected": -1.246115803718567, + "logps/chosen": -149.13267517089844, + "logps/rejected": -60.9732780456543, + "loss": 0.0353, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.05154275894165, + "rewards/margins": 2.6233158111572266, + "rewards/rejected": 1.4282268285751343, + "step": 11537 + }, + { + "epoch": 1.87, + "learning_rate": 1.0579209982392757e-07, + "logits/chosen": -0.9145457148551941, + "logits/rejected": -0.8899447917938232, + "logps/chosen": -12.665842056274414, + "logps/rejected": -15.221837997436523, + "loss": 1.2851, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.6407545208930969, + "rewards/margins": -2.4034605026245117, + "rewards/rejected": 3.044214963912964, + "step": 11538 + }, + { + "epoch": 1.87, + "learning_rate": 1.0552334684287512e-07, + "logits/chosen": -1.0460456609725952, + "logits/rejected": -1.0479532480239868, + "logps/chosen": -10.546304702758789, + "logps/rejected": -1.6366348266601562, + "loss": 0.5062, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.13301420211791992, + "rewards/margins": -0.07721827924251556, + "rewards/rejected": 0.21023248136043549, + "step": 11539 + }, + { + "epoch": 1.87, + "learning_rate": 1.0525493202384718e-07, + "logits/chosen": -1.251927375793457, + "logits/rejected": -1.2637253999710083, + "logps/chosen": -45.390419006347656, + "logps/rejected": -88.23168182373047, + "loss": 0.653, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.456646680831909, + "rewards/margins": 0.272428035736084, + "rewards/rejected": 2.184218645095825, + "step": 11540 + }, + { + "epoch": 1.87, + "learning_rate": 1.0498685538539e-07, + "logits/chosen": -1.2220207452774048, + "logits/rejected": -1.1702690124511719, + "logps/chosen": -66.33984375, + "logps/rejected": -43.34513854980469, + "loss": 0.6399, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5261200666427612, + "rewards/margins": 0.5698864459991455, + "rewards/rejected": 0.9562336206436157, + "step": 11541 + }, + { + "epoch": 1.87, + "learning_rate": 1.0471911694602321e-07, + "logits/chosen": -1.6766761541366577, + "logits/rejected": -1.6041228771209717, + "logps/chosen": -80.03143310546875, + "logps/rejected": -23.584800720214844, + "loss": 0.0767, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.925441026687622, + "rewards/margins": 2.0821075439453125, + "rewards/rejected": 0.8433334231376648, + "step": 11542 + }, + { + "epoch": 1.87, + "learning_rate": 1.0445171672424592e-07, + "logits/chosen": -1.33285391330719, + "logits/rejected": -1.3142004013061523, + "logps/chosen": -21.42007064819336, + "logps/rejected": -42.10557556152344, + "loss": 0.879, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.7825844287872314, + "rewards/margins": -1.150956630706787, + "rewards/rejected": 2.9335410594940186, + "step": 11543 + }, + { + "epoch": 1.87, + "learning_rate": 1.0418465473853334e-07, + "logits/chosen": -1.4364454746246338, + "logits/rejected": -1.3234314918518066, + "logps/chosen": -86.36046600341797, + "logps/rejected": -58.253990173339844, + "loss": 0.025, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.073383331298828, + "rewards/margins": 3.627352714538574, + "rewards/rejected": 1.4460304975509644, + "step": 11544 + }, + { + "epoch": 1.87, + "learning_rate": 1.0391793100733626e-07, + "logits/chosen": -0.857068657875061, + "logits/rejected": -0.857068657875061, + "logps/chosen": -4.938997268676758, + "logps/rejected": -4.938997268676758, + "loss": 1.1879, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2513161599636078, + "rewards/margins": 0.0, + "rewards/rejected": 0.2513161599636078, + "step": 11545 + }, + { + "epoch": 1.87, + "learning_rate": 1.0365154554908274e-07, + "logits/chosen": -1.0977565050125122, + "logits/rejected": -1.0921555757522583, + "logps/chosen": -32.93555450439453, + "logps/rejected": -42.53213119506836, + "loss": 0.4885, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.895533800125122, + "rewards/margins": 0.20751523971557617, + "rewards/rejected": 2.688018560409546, + "step": 11546 + }, + { + "epoch": 1.87, + "learning_rate": 1.0338549838217804e-07, + "logits/chosen": -1.5767477750778198, + "logits/rejected": -1.6521804332733154, + "logps/chosen": -74.2972412109375, + "logps/rejected": -100.59864807128906, + "loss": 1.2022, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.310300588607788, + "rewards/margins": -1.891956090927124, + "rewards/rejected": 5.202256679534912, + "step": 11547 + }, + { + "epoch": 1.87, + "learning_rate": 1.0311978952500301e-07, + "logits/chosen": -1.3285272121429443, + "logits/rejected": -1.3338755369186401, + "logps/chosen": -141.50015258789062, + "logps/rejected": -85.9136734008789, + "loss": 0.196, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.954864501953125, + "rewards/margins": 0.9123449325561523, + "rewards/rejected": 5.042519569396973, + "step": 11548 + }, + { + "epoch": 1.87, + "learning_rate": 1.0285441899591631e-07, + "logits/chosen": -1.3568974733352661, + "logits/rejected": -1.5018187761306763, + "logps/chosen": -214.62246704101562, + "logps/rejected": -34.859527587890625, + "loss": 0.0765, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3446061611175537, + "rewards/margins": 1.9446535110473633, + "rewards/rejected": 0.3999527096748352, + "step": 11549 + }, + { + "epoch": 1.87, + "learning_rate": 1.0258938681325104e-07, + "logits/chosen": -1.1424169540405273, + "logits/rejected": -1.122534990310669, + "logps/chosen": -34.27922058105469, + "logps/rejected": -70.616455078125, + "loss": 0.2741, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.240220308303833, + "rewards/margins": 0.3253796100616455, + "rewards/rejected": 1.9148406982421875, + "step": 11550 + }, + { + "epoch": 1.87, + "learning_rate": 1.0232469299532032e-07, + "logits/chosen": -1.1042629480361938, + "logits/rejected": -1.1023372411727905, + "logps/chosen": -1.4664063453674316, + "logps/rejected": -6.295313358306885, + "loss": 0.3207, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38060232996940613, + "rewards/margins": 0.33724766969680786, + "rewards/rejected": 0.04335465654730797, + "step": 11551 + }, + { + "epoch": 1.88, + "learning_rate": 1.0206033756041068e-07, + "logits/chosen": -1.2056913375854492, + "logits/rejected": -1.1486872434616089, + "logps/chosen": -152.15968322753906, + "logps/rejected": -87.81501770019531, + "loss": 0.1316, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.462095737457275, + "rewards/margins": 4.099943161010742, + "rewards/rejected": 2.3621528148651123, + "step": 11552 + }, + { + "epoch": 1.88, + "learning_rate": 1.0179632052678745e-07, + "logits/chosen": -1.1897122859954834, + "logits/rejected": -1.1850303411483765, + "logps/chosen": -70.36017608642578, + "logps/rejected": -49.57374572753906, + "loss": 0.9692, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.054293155670166, + "rewards/margins": -1.7818193435668945, + "rewards/rejected": 4.8361124992370605, + "step": 11553 + }, + { + "epoch": 1.88, + "learning_rate": 1.0153264191269052e-07, + "logits/chosen": -1.1601958274841309, + "logits/rejected": -1.1566581726074219, + "logps/chosen": -83.85391998291016, + "logps/rejected": -97.88894653320312, + "loss": 0.2308, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9529800415039062, + "rewards/margins": 0.6199417114257812, + "rewards/rejected": 2.333038330078125, + "step": 11554 + }, + { + "epoch": 1.88, + "learning_rate": 1.0126930173633864e-07, + "logits/chosen": -1.461110234260559, + "logits/rejected": -1.3537110090255737, + "logps/chosen": -67.4544677734375, + "logps/rejected": -49.98199462890625, + "loss": 2.1473, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.6987223625183105, + "rewards/margins": 0.9788577556610107, + "rewards/rejected": 3.7198646068573, + "step": 11555 + }, + { + "epoch": 1.88, + "learning_rate": 1.010063000159256e-07, + "logits/chosen": -1.4108991622924805, + "logits/rejected": -1.4595177173614502, + "logps/chosen": -107.56363677978516, + "logps/rejected": -128.71240234375, + "loss": 0.4249, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5570709705352783, + "rewards/margins": -0.2101837396621704, + "rewards/rejected": 1.7672547101974487, + "step": 11556 + }, + { + "epoch": 1.88, + "learning_rate": 1.0074363676962296e-07, + "logits/chosen": -0.7494783401489258, + "logits/rejected": -0.7494783401489258, + "logps/chosen": -1.5789214372634888, + "logps/rejected": -1.5789214372634888, + "loss": 0.3471, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.16601811349391937, + "rewards/margins": 0.0, + "rewards/rejected": 0.16601811349391937, + "step": 11557 + }, + { + "epoch": 1.88, + "learning_rate": 1.0048131201557731e-07, + "logits/chosen": -0.8903363347053528, + "logits/rejected": -0.8224980235099792, + "logps/chosen": -64.21974182128906, + "logps/rejected": -60.23762130737305, + "loss": 0.0464, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.403443813323975, + "rewards/margins": 2.858954906463623, + "rewards/rejected": 1.5444889068603516, + "step": 11558 + }, + { + "epoch": 1.88, + "learning_rate": 1.0021932577191418e-07, + "logits/chosen": -1.4293968677520752, + "logits/rejected": -1.4327973127365112, + "logps/chosen": -51.67136764526367, + "logps/rejected": -113.65461730957031, + "loss": 2.3765, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.561067581176758, + "rewards/margins": -4.742742538452148, + "rewards/rejected": 8.303810119628906, + "step": 11559 + }, + { + "epoch": 1.88, + "learning_rate": 9.99576780567324e-08, + "logits/chosen": -1.1005371809005737, + "logits/rejected": -1.1539138555526733, + "logps/chosen": -54.73735809326172, + "logps/rejected": -116.93510437011719, + "loss": 0.4236, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0603950023651123, + "rewards/margins": 0.34400856494903564, + "rewards/rejected": 1.7163864374160767, + "step": 11560 + }, + { + "epoch": 1.88, + "learning_rate": 9.96963688881114e-08, + "logits/chosen": -1.285687804222107, + "logits/rejected": -1.2251933813095093, + "logps/chosen": -59.350181579589844, + "logps/rejected": -65.09477996826172, + "loss": 0.1331, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.96311354637146, + "rewards/margins": 1.6482186317443848, + "rewards/rejected": 1.3148949146270752, + "step": 11561 + }, + { + "epoch": 1.88, + "learning_rate": 9.943539828410342e-08, + "logits/chosen": -1.3529157638549805, + "logits/rejected": -1.3494501113891602, + "logps/chosen": -81.30477905273438, + "logps/rejected": -82.62028503417969, + "loss": 1.5493, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.2508697509765625, + "rewards/margins": -0.30924224853515625, + "rewards/rejected": 6.560111999511719, + "step": 11562 + }, + { + "epoch": 1.88, + "learning_rate": 9.917476626274014e-08, + "logits/chosen": -1.1370142698287964, + "logits/rejected": -0.980625331401825, + "logps/chosen": -108.6771240234375, + "logps/rejected": -37.03665542602539, + "loss": 0.478, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.168002605438232, + "rewards/margins": 1.3265783786773682, + "rewards/rejected": 2.8414242267608643, + "step": 11563 + }, + { + "epoch": 1.88, + "learning_rate": 9.891447284202827e-08, + "logits/chosen": -1.484820008277893, + "logits/rejected": -1.6053794622421265, + "logps/chosen": -163.78529357910156, + "logps/rejected": -168.78854370117188, + "loss": 0.3294, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.925207614898682, + "rewards/margins": 0.45105600357055664, + "rewards/rejected": 6.474151611328125, + "step": 11564 + }, + { + "epoch": 1.88, + "learning_rate": 9.86545180399523e-08, + "logits/chosen": -1.2823269367218018, + "logits/rejected": -1.2388567924499512, + "logps/chosen": -57.316314697265625, + "logps/rejected": -46.88957595825195, + "loss": 0.4855, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.898749589920044, + "rewards/margins": 0.20493650436401367, + "rewards/rejected": 2.6938130855560303, + "step": 11565 + }, + { + "epoch": 1.88, + "learning_rate": 9.839490187447176e-08, + "logits/chosen": -1.4548630714416504, + "logits/rejected": -1.3704663515090942, + "logps/chosen": -40.21329116821289, + "logps/rejected": -69.17407989501953, + "loss": 0.103, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.640244960784912, + "rewards/margins": 1.6298422813415527, + "rewards/rejected": 3.0104026794433594, + "step": 11566 + }, + { + "epoch": 1.88, + "learning_rate": 9.813562436352453e-08, + "logits/chosen": -1.5685960054397583, + "logits/rejected": -1.4764349460601807, + "logps/chosen": -103.49961853027344, + "logps/rejected": -78.65274047851562, + "loss": 1.1207, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.3625383377075195, + "rewards/margins": 1.7963640689849854, + "rewards/rejected": 3.566174268722534, + "step": 11567 + }, + { + "epoch": 1.88, + "learning_rate": 9.787668552502349e-08, + "logits/chosen": -1.5069594383239746, + "logits/rejected": -1.4157285690307617, + "logps/chosen": -89.26902770996094, + "logps/rejected": -30.713014602661133, + "loss": 1.7579, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.697319030761719, + "rewards/margins": 3.3486359119415283, + "rewards/rejected": 3.3486831188201904, + "step": 11568 + }, + { + "epoch": 1.88, + "learning_rate": 9.761808537685936e-08, + "logits/chosen": -1.4746829271316528, + "logits/rejected": -1.4862821102142334, + "logps/chosen": -53.66883087158203, + "logps/rejected": -53.40340042114258, + "loss": 0.3823, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.376572370529175, + "rewards/margins": 0.5120768547058105, + "rewards/rejected": 2.8644955158233643, + "step": 11569 + }, + { + "epoch": 1.88, + "learning_rate": 9.735982393689835e-08, + "logits/chosen": -1.276017189025879, + "logits/rejected": -1.2373497486114502, + "logps/chosen": -111.3802490234375, + "logps/rejected": -143.27304077148438, + "loss": 1.8619, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9858551025390625, + "rewards/margins": -2.7102599143981934, + "rewards/rejected": 5.696115016937256, + "step": 11570 + }, + { + "epoch": 1.88, + "learning_rate": 9.710190122298458e-08, + "logits/chosen": -1.2318369150161743, + "logits/rejected": -1.1999599933624268, + "logps/chosen": -28.669187545776367, + "logps/rejected": -41.821876525878906, + "loss": 0.8696, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.902612566947937, + "rewards/margins": -1.4821242094039917, + "rewards/rejected": 3.3847367763519287, + "step": 11571 + }, + { + "epoch": 1.88, + "learning_rate": 9.684431725293763e-08, + "logits/chosen": -1.618268609046936, + "logits/rejected": -1.5561902523040771, + "logps/chosen": -76.93661499023438, + "logps/rejected": -71.7032470703125, + "loss": 1.2504, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6841621398925781, + "rewards/margins": -2.37441349029541, + "rewards/rejected": 4.058575630187988, + "step": 11572 + }, + { + "epoch": 1.88, + "learning_rate": 9.658707204455442e-08, + "logits/chosen": -0.9761825203895569, + "logits/rejected": -0.9761825203895569, + "logps/chosen": -30.38009262084961, + "logps/rejected": -30.38009262084961, + "loss": 0.3486, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4379754066467285, + "rewards/margins": 0.0, + "rewards/rejected": 2.4379754066467285, + "step": 11573 + }, + { + "epoch": 1.88, + "learning_rate": 9.633016561560793e-08, + "logits/chosen": -1.3635473251342773, + "logits/rejected": -1.2630364894866943, + "logps/chosen": -32.271995544433594, + "logps/rejected": -53.10474395751953, + "loss": 0.3896, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.149523973464966, + "rewards/margins": -0.1450188159942627, + "rewards/rejected": 2.2945427894592285, + "step": 11574 + }, + { + "epoch": 1.88, + "learning_rate": 9.607359798384785e-08, + "logits/chosen": -1.038998007774353, + "logits/rejected": -0.9551153182983398, + "logps/chosen": -65.28572082519531, + "logps/rejected": -103.38612365722656, + "loss": 0.5622, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.832006931304932, + "rewards/margins": 1.1530747413635254, + "rewards/rejected": 5.678932189941406, + "step": 11575 + }, + { + "epoch": 1.88, + "learning_rate": 9.581736916700113e-08, + "logits/chosen": -1.5027936697006226, + "logits/rejected": -1.2577346563339233, + "logps/chosen": -116.98402404785156, + "logps/rejected": -49.654273986816406, + "loss": 0.1055, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.46391773223877, + "rewards/margins": 6.71013069152832, + "rewards/rejected": 1.7537872791290283, + "step": 11576 + }, + { + "epoch": 1.88, + "learning_rate": 9.556147918276971e-08, + "logits/chosen": -1.0298231840133667, + "logits/rejected": -1.050748586654663, + "logps/chosen": -60.64982604980469, + "logps/rejected": -47.93802261352539, + "loss": 0.4049, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6952645778656006, + "rewards/margins": -0.1296069622039795, + "rewards/rejected": 2.82487154006958, + "step": 11577 + }, + { + "epoch": 1.88, + "learning_rate": 9.530592804883498e-08, + "logits/chosen": -1.5464637279510498, + "logits/rejected": -1.5727241039276123, + "logps/chosen": -101.71809387207031, + "logps/rejected": -75.14503479003906, + "loss": 0.4502, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2582404613494873, + "rewards/margins": -0.3625144958496094, + "rewards/rejected": 2.6207549571990967, + "step": 11578 + }, + { + "epoch": 1.88, + "learning_rate": 9.505071578285119e-08, + "logits/chosen": -1.0780673027038574, + "logits/rejected": -1.1067789793014526, + "logps/chosen": -27.1848087310791, + "logps/rejected": -73.54559326171875, + "loss": 1.9938, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8244867324829102, + "rewards/margins": -3.525313377380371, + "rewards/rejected": 5.349800109863281, + "step": 11579 + }, + { + "epoch": 1.88, + "learning_rate": 9.479584240245199e-08, + "logits/chosen": -1.2350767850875854, + "logits/rejected": -1.2350767850875854, + "logps/chosen": -53.48884582519531, + "logps/rejected": -53.48884582519531, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6640716791152954, + "rewards/margins": 0.0, + "rewards/rejected": 1.6640716791152954, + "step": 11580 + }, + { + "epoch": 1.88, + "learning_rate": 9.454130792524663e-08, + "logits/chosen": -1.1020148992538452, + "logits/rejected": -1.0410088300704956, + "logps/chosen": -48.79084014892578, + "logps/rejected": -57.70551300048828, + "loss": 0.3714, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1450417041778564, + "rewards/margins": -0.03628396987915039, + "rewards/rejected": 2.181325674057007, + "step": 11581 + }, + { + "epoch": 1.88, + "learning_rate": 9.428711236882104e-08, + "logits/chosen": -1.3781639337539673, + "logits/rejected": -1.2964876890182495, + "logps/chosen": -76.73175048828125, + "logps/rejected": -64.29322814941406, + "loss": 0.6329, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.738101959228516, + "rewards/margins": 3.143707275390625, + "rewards/rejected": 1.5943946838378906, + "step": 11582 + }, + { + "epoch": 1.88, + "learning_rate": 9.403325575073785e-08, + "logits/chosen": -1.3100863695144653, + "logits/rejected": -1.3115330934524536, + "logps/chosen": -61.185508728027344, + "logps/rejected": -102.0869140625, + "loss": 0.7348, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.964348554611206, + "rewards/margins": -1.204557180404663, + "rewards/rejected": 4.168905735015869, + "step": 11583 + }, + { + "epoch": 1.88, + "learning_rate": 9.37797380885358e-08, + "logits/chosen": -1.53176748752594, + "logits/rejected": -1.4764941930770874, + "logps/chosen": -200.29185485839844, + "logps/rejected": -404.6583251953125, + "loss": 0.2221, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.827217102050781, + "rewards/margins": 1.1001358032226562, + "rewards/rejected": 6.727081298828125, + "step": 11584 + }, + { + "epoch": 1.88, + "learning_rate": 9.352655939973088e-08, + "logits/chosen": -1.2632008790969849, + "logits/rejected": -1.159970760345459, + "logps/chosen": -83.50559997558594, + "logps/rejected": -68.82524871826172, + "loss": 0.2825, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8238205909729004, + "rewards/margins": 1.3456261157989502, + "rewards/rejected": 1.4781944751739502, + "step": 11585 + }, + { + "epoch": 1.88, + "learning_rate": 9.32737197018152e-08, + "logits/chosen": -1.3698149919509888, + "logits/rejected": -1.3745492696762085, + "logps/chosen": -90.00509643554688, + "logps/rejected": -75.32745361328125, + "loss": 0.2572, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1435227394104004, + "rewards/margins": 0.49506235122680664, + "rewards/rejected": 1.6484603881835938, + "step": 11586 + }, + { + "epoch": 1.88, + "learning_rate": 9.302121901225703e-08, + "logits/chosen": -1.3706631660461426, + "logits/rejected": -1.3902353048324585, + "logps/chosen": -224.8345947265625, + "logps/rejected": -133.97897338867188, + "loss": 0.0199, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.2999267578125, + "rewards/margins": 3.900036573410034, + "rewards/rejected": 2.399890184402466, + "step": 11587 + }, + { + "epoch": 1.88, + "learning_rate": 9.27690573485035e-08, + "logits/chosen": -1.4011058807373047, + "logits/rejected": -1.2572696208953857, + "logps/chosen": -68.14898681640625, + "logps/rejected": -49.639991760253906, + "loss": 0.8103, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.297145128250122, + "rewards/margins": -0.7972939014434814, + "rewards/rejected": 3.0944390296936035, + "step": 11588 + }, + { + "epoch": 1.88, + "learning_rate": 9.251723472797457e-08, + "logits/chosen": -1.3937537670135498, + "logits/rejected": -1.362541913986206, + "logps/chosen": -47.235496520996094, + "logps/rejected": -60.83648681640625, + "loss": 0.0658, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.29128360748291, + "rewards/margins": 1.9787111282348633, + "rewards/rejected": 2.312572479248047, + "step": 11589 + }, + { + "epoch": 1.88, + "learning_rate": 9.226575116807023e-08, + "logits/chosen": -1.2984250783920288, + "logits/rejected": -1.3287707567214966, + "logps/chosen": -52.502132415771484, + "logps/rejected": -85.30340576171875, + "loss": 0.6834, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5811939239501953, + "rewards/margins": -0.9974179267883301, + "rewards/rejected": 3.5786118507385254, + "step": 11590 + }, + { + "epoch": 1.88, + "learning_rate": 9.201460668616435e-08, + "logits/chosen": -1.0800204277038574, + "logits/rejected": -0.9481098651885986, + "logps/chosen": -69.36883544921875, + "logps/rejected": -57.134151458740234, + "loss": 0.2058, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.659675121307373, + "rewards/margins": 2.1030402183532715, + "rewards/rejected": 3.5566349029541016, + "step": 11591 + }, + { + "epoch": 1.88, + "learning_rate": 9.176380129961026e-08, + "logits/chosen": -1.3055528402328491, + "logits/rejected": -1.4439085721969604, + "logps/chosen": -51.45941162109375, + "logps/rejected": -45.46532440185547, + "loss": 0.5052, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.40780067443847656, + "rewards/margins": -0.454837441444397, + "rewards/rejected": 0.8626381158828735, + "step": 11592 + }, + { + "epoch": 1.88, + "learning_rate": 9.151333502573468e-08, + "logits/chosen": -1.0609846115112305, + "logits/rejected": -1.028128981590271, + "logps/chosen": -21.058380126953125, + "logps/rejected": -6.238856792449951, + "loss": 0.3273, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22102013230323792, + "rewards/margins": 0.10809631645679474, + "rewards/rejected": 0.11292381584644318, + "step": 11593 + }, + { + "epoch": 1.88, + "learning_rate": 9.126320788184374e-08, + "logits/chosen": -1.3984624147415161, + "logits/rejected": -1.389318585395813, + "logps/chosen": -65.80973815917969, + "logps/rejected": -55.23893737792969, + "loss": 0.3943, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.06805419921875, + "rewards/margins": 0.9152603149414062, + "rewards/rejected": 1.1527938842773438, + "step": 11594 + }, + { + "epoch": 1.88, + "learning_rate": 9.10134198852175e-08, + "logits/chosen": -1.201856017112732, + "logits/rejected": -1.2430716753005981, + "logps/chosen": -34.943058013916016, + "logps/rejected": -44.764713287353516, + "loss": 0.7765, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4349613189697266, + "rewards/margins": -0.6782875061035156, + "rewards/rejected": 3.113248825073242, + "step": 11595 + }, + { + "epoch": 1.88, + "learning_rate": 9.076397105311497e-08, + "logits/chosen": -1.5521655082702637, + "logits/rejected": -1.5400155782699585, + "logps/chosen": -133.57675170898438, + "logps/rejected": -93.47642517089844, + "loss": 0.1498, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.736630439758301, + "rewards/margins": 1.1640520095825195, + "rewards/rejected": 4.572578430175781, + "step": 11596 + }, + { + "epoch": 1.88, + "learning_rate": 9.051486140277066e-08, + "logits/chosen": -1.2141660451889038, + "logits/rejected": -1.2505067586898804, + "logps/chosen": -40.35877990722656, + "logps/rejected": -51.746856689453125, + "loss": 1.1362, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6961250305175781, + "rewards/margins": -0.7222771644592285, + "rewards/rejected": 2.4184021949768066, + "step": 11597 + }, + { + "epoch": 1.88, + "learning_rate": 9.026609095139527e-08, + "logits/chosen": -1.3345376253128052, + "logits/rejected": -1.3345376253128052, + "logps/chosen": -50.60939407348633, + "logps/rejected": -50.60939407348633, + "loss": 0.4257, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.131866931915283, + "rewards/margins": 0.0, + "rewards/rejected": 2.131866931915283, + "step": 11598 + }, + { + "epoch": 1.88, + "learning_rate": 9.00176597161767e-08, + "logits/chosen": -1.48771333694458, + "logits/rejected": -1.3864076137542725, + "logps/chosen": -69.826416015625, + "logps/rejected": -58.01695251464844, + "loss": 0.7073, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.030333042144775, + "rewards/margins": 1.7438859939575195, + "rewards/rejected": 5.286447048187256, + "step": 11599 + }, + { + "epoch": 1.88, + "learning_rate": 8.976956771427903e-08, + "logits/chosen": -1.5944883823394775, + "logits/rejected": -1.562247633934021, + "logps/chosen": -62.067718505859375, + "logps/rejected": -64.09652709960938, + "loss": 0.2612, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.900705814361572, + "rewards/margins": 0.41494226455688477, + "rewards/rejected": 4.4857635498046875, + "step": 11600 + }, + { + "epoch": 1.88, + "learning_rate": 8.952181496284296e-08, + "logits/chosen": -1.1859843730926514, + "logits/rejected": -1.0608811378479004, + "logps/chosen": -69.96234130859375, + "logps/rejected": -31.363615036010742, + "loss": 0.7317, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.758043050765991, + "rewards/margins": 3.2571768760681152, + "rewards/rejected": -0.4991338849067688, + "step": 11601 + }, + { + "epoch": 1.88, + "learning_rate": 8.927440147898703e-08, + "logits/chosen": -1.46337890625, + "logits/rejected": -1.4193512201309204, + "logps/chosen": -92.3226318359375, + "logps/rejected": -111.54454040527344, + "loss": 0.0415, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.968096733093262, + "rewards/margins": 3.524876117706299, + "rewards/rejected": 5.443220615386963, + "step": 11602 + }, + { + "epoch": 1.88, + "learning_rate": 8.902732727980312e-08, + "logits/chosen": -1.2803308963775635, + "logits/rejected": -1.4173588752746582, + "logps/chosen": -180.11257934570312, + "logps/rejected": -85.79452514648438, + "loss": 0.0742, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.796197414398193, + "rewards/margins": 2.1877150535583496, + "rewards/rejected": 4.608482360839844, + "step": 11603 + }, + { + "epoch": 1.88, + "learning_rate": 8.878059238236369e-08, + "logits/chosen": -1.2713841199874878, + "logits/rejected": -1.2479169368743896, + "logps/chosen": -103.81755065917969, + "logps/rejected": -77.53366088867188, + "loss": 1.3841, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.734462022781372, + "rewards/margins": 0.23415899276733398, + "rewards/rejected": 2.500303030014038, + "step": 11604 + }, + { + "epoch": 1.88, + "learning_rate": 8.853419680371456e-08, + "logits/chosen": -0.9014101028442383, + "logits/rejected": -0.7666477560997009, + "logps/chosen": -134.4791259765625, + "logps/rejected": -44.376220703125, + "loss": 0.2087, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.053704857826233, + "rewards/margins": 0.666974663734436, + "rewards/rejected": 0.3867301940917969, + "step": 11605 + }, + { + "epoch": 1.88, + "learning_rate": 8.828814056087987e-08, + "logits/chosen": -1.5622491836547852, + "logits/rejected": -1.5379055738449097, + "logps/chosen": -65.36005401611328, + "logps/rejected": -87.41207885742188, + "loss": 0.4693, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6528656482696533, + "rewards/margins": 1.794233798980713, + "rewards/rejected": 0.8586319088935852, + "step": 11606 + }, + { + "epoch": 1.88, + "learning_rate": 8.804242367085936e-08, + "logits/chosen": -1.5076639652252197, + "logits/rejected": -1.6095043420791626, + "logps/chosen": -95.50779724121094, + "logps/rejected": -36.92402648925781, + "loss": 0.1046, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.321894884109497, + "rewards/margins": 2.289047956466675, + "rewards/rejected": 0.032846834510564804, + "step": 11607 + }, + { + "epoch": 1.88, + "learning_rate": 8.779704615063001e-08, + "logits/chosen": -1.1178250312805176, + "logits/rejected": -1.0070667266845703, + "logps/chosen": -55.07447814941406, + "logps/rejected": -71.5168685913086, + "loss": 0.1563, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6190032958984375, + "rewards/margins": 1.0060631036758423, + "rewards/rejected": 1.6129401922225952, + "step": 11608 + }, + { + "epoch": 1.88, + "learning_rate": 8.755200801714492e-08, + "logits/chosen": -1.2866019010543823, + "logits/rejected": -1.2054308652877808, + "logps/chosen": -123.52365112304688, + "logps/rejected": -98.9575424194336, + "loss": 0.219, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6514222621917725, + "rewards/margins": 1.0505623817443848, + "rewards/rejected": 1.6008598804473877, + "step": 11609 + }, + { + "epoch": 1.88, + "learning_rate": 8.7307309287335e-08, + "logits/chosen": -1.2714595794677734, + "logits/rejected": -1.345961093902588, + "logps/chosen": -48.80590057373047, + "logps/rejected": -86.7601547241211, + "loss": 1.7344, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.720252275466919, + "rewards/margins": -0.19531702995300293, + "rewards/rejected": 1.9155693054199219, + "step": 11610 + }, + { + "epoch": 1.88, + "learning_rate": 8.70629499781045e-08, + "logits/chosen": -1.2794458866119385, + "logits/rejected": -1.257185459136963, + "logps/chosen": -32.85824203491211, + "logps/rejected": -55.09079360961914, + "loss": 0.4971, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4197347164154053, + "rewards/margins": -0.5121421813964844, + "rewards/rejected": 2.9318768978118896, + "step": 11611 + }, + { + "epoch": 1.88, + "learning_rate": 8.681893010633768e-08, + "logits/chosen": -1.3929232358932495, + "logits/rejected": -1.1298201084136963, + "logps/chosen": -94.02207946777344, + "logps/rejected": -59.53515625, + "loss": 0.7116, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.897866725921631, + "rewards/margins": 0.8862266540527344, + "rewards/rejected": 4.0116400718688965, + "step": 11612 + }, + { + "epoch": 1.88, + "learning_rate": 8.657524968889442e-08, + "logits/chosen": -1.3985865116119385, + "logits/rejected": -1.33897864818573, + "logps/chosen": -121.44195556640625, + "logps/rejected": -61.318748474121094, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.741253852844238, + "rewards/margins": 5.417193412780762, + "rewards/rejected": 3.3240602016448975, + "step": 11613 + }, + { + "epoch": 1.89, + "learning_rate": 8.633190874261011e-08, + "logits/chosen": -1.4612274169921875, + "logits/rejected": -1.391083836555481, + "logps/chosen": -69.36943054199219, + "logps/rejected": -5.000195503234863, + "loss": 0.1358, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.236072540283203, + "rewards/margins": 1.6932566165924072, + "rewards/rejected": 0.5428158640861511, + "step": 11614 + }, + { + "epoch": 1.89, + "learning_rate": 8.608890728429741e-08, + "logits/chosen": -1.3653302192687988, + "logits/rejected": -1.350229024887085, + "logps/chosen": -58.78779220581055, + "logps/rejected": -66.83303833007812, + "loss": 0.1812, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.429409503936768, + "rewards/margins": 1.0664539337158203, + "rewards/rejected": 3.3629555702209473, + "step": 11615 + }, + { + "epoch": 1.89, + "learning_rate": 8.58462453307446e-08, + "logits/chosen": -1.1308938264846802, + "logits/rejected": -1.0946916341781616, + "logps/chosen": -56.85258483886719, + "logps/rejected": -54.5135383605957, + "loss": 0.4924, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8876495361328125, + "rewards/margins": -0.39227867126464844, + "rewards/rejected": 2.279928207397461, + "step": 11616 + }, + { + "epoch": 1.89, + "learning_rate": 8.560392289871878e-08, + "logits/chosen": -1.2556452751159668, + "logits/rejected": -1.2857961654663086, + "logps/chosen": -99.84481811523438, + "logps/rejected": -154.7307891845703, + "loss": 0.7093, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.490151882171631, + "rewards/margins": -1.1410555839538574, + "rewards/rejected": 7.631207466125488, + "step": 11617 + }, + { + "epoch": 1.89, + "learning_rate": 8.536194000496101e-08, + "logits/chosen": -1.377621054649353, + "logits/rejected": -1.3233745098114014, + "logps/chosen": -36.488956451416016, + "logps/rejected": -80.33285522460938, + "loss": 0.7625, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0078341960906982, + "rewards/margins": -0.7306942939758301, + "rewards/rejected": 3.7385284900665283, + "step": 11618 + }, + { + "epoch": 1.89, + "learning_rate": 8.512029666619127e-08, + "logits/chosen": -0.7689895629882812, + "logits/rejected": -0.7689895629882812, + "logps/chosen": -36.59378433227539, + "logps/rejected": -36.59378433227539, + "loss": 0.5593, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.095763087272644, + "rewards/margins": 0.0, + "rewards/rejected": 1.095763087272644, + "step": 11619 + }, + { + "epoch": 1.89, + "learning_rate": 8.487899289910284e-08, + "logits/chosen": -1.130286455154419, + "logits/rejected": -1.130286455154419, + "logps/chosen": -0.9966651201248169, + "logps/rejected": -0.9966651201248169, + "loss": 0.35, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.22099041938781738, + "rewards/margins": 0.0, + "rewards/rejected": 0.22099041938781738, + "step": 11620 + }, + { + "epoch": 1.89, + "learning_rate": 8.463802872036964e-08, + "logits/chosen": -1.6664587259292603, + "logits/rejected": -1.5896722078323364, + "logps/chosen": -130.6990203857422, + "logps/rejected": -99.16903686523438, + "loss": 0.7438, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.5247039794921875, + "rewards/margins": -1.1636996269226074, + "rewards/rejected": 7.688403606414795, + "step": 11621 + }, + { + "epoch": 1.89, + "learning_rate": 8.439740414663832e-08, + "logits/chosen": -1.4205254316329956, + "logits/rejected": -1.4651355743408203, + "logps/chosen": -157.9530487060547, + "logps/rejected": -117.39900207519531, + "loss": 0.0415, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.782115459442139, + "rewards/margins": 2.627760648727417, + "rewards/rejected": 2.1543548107147217, + "step": 11622 + }, + { + "epoch": 1.89, + "learning_rate": 8.415711919453506e-08, + "logits/chosen": -1.3173109292984009, + "logits/rejected": -1.4076546430587769, + "logps/chosen": -51.97007751464844, + "logps/rejected": -93.9354476928711, + "loss": 1.8847, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9070786237716675, + "rewards/margins": -3.5232062339782715, + "rewards/rejected": 5.4302849769592285, + "step": 11623 + }, + { + "epoch": 1.89, + "learning_rate": 8.391717388066045e-08, + "logits/chosen": -1.0734587907791138, + "logits/rejected": -1.0734587907791138, + "logps/chosen": -14.027480125427246, + "logps/rejected": -14.027480125427246, + "loss": 0.4371, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5984576344490051, + "rewards/margins": 0.0, + "rewards/rejected": 0.5984576344490051, + "step": 11624 + }, + { + "epoch": 1.89, + "learning_rate": 8.36775682215929e-08, + "logits/chosen": -1.5383023023605347, + "logits/rejected": -1.3716115951538086, + "logps/chosen": -102.67153930664062, + "logps/rejected": -15.781072616577148, + "loss": 0.0274, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.421835422515869, + "rewards/margins": 6.469437122344971, + "rewards/rejected": 0.9523981213569641, + "step": 11625 + }, + { + "epoch": 1.89, + "learning_rate": 8.343830223388638e-08, + "logits/chosen": -1.4008864164352417, + "logits/rejected": -1.2248592376708984, + "logps/chosen": -96.7125473022461, + "logps/rejected": -91.37887573242188, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.98620080947876, + "rewards/margins": 4.777042388916016, + "rewards/rejected": 2.209158420562744, + "step": 11626 + }, + { + "epoch": 1.89, + "learning_rate": 8.319937593407268e-08, + "logits/chosen": -1.1083083152770996, + "logits/rejected": -1.1528311967849731, + "logps/chosen": -35.77915573120117, + "logps/rejected": -77.34947967529297, + "loss": 1.4409, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6298885345458984, + "rewards/margins": -2.6938228607177734, + "rewards/rejected": 5.323711395263672, + "step": 11627 + }, + { + "epoch": 1.89, + "learning_rate": 8.296078933865858e-08, + "logits/chosen": -0.9966861009597778, + "logits/rejected": -1.0587406158447266, + "logps/chosen": -58.86111068725586, + "logps/rejected": -42.8952751159668, + "loss": 2.569, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.536945104598999, + "rewards/margins": 0.5499207973480225, + "rewards/rejected": 1.9870243072509766, + "step": 11628 + }, + { + "epoch": 1.89, + "learning_rate": 8.272254246412925e-08, + "logits/chosen": -0.9975493550300598, + "logits/rejected": -0.999168872833252, + "logps/chosen": -70.01090240478516, + "logps/rejected": -39.75295639038086, + "loss": 0.4801, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.557485342025757, + "rewards/margins": 0.04664349555969238, + "rewards/rejected": 2.5108418464660645, + "step": 11629 + }, + { + "epoch": 1.89, + "learning_rate": 8.248463532694373e-08, + "logits/chosen": -1.4658358097076416, + "logits/rejected": -1.5608233213424683, + "logps/chosen": -52.61775207519531, + "logps/rejected": -136.40086364746094, + "loss": 3.4546, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.060312032699585, + "rewards/margins": -6.245621681213379, + "rewards/rejected": 8.305933952331543, + "step": 11630 + }, + { + "epoch": 1.89, + "learning_rate": 8.224706794353998e-08, + "logits/chosen": -1.1427956819534302, + "logits/rejected": -1.1059179306030273, + "logps/chosen": -36.904266357421875, + "logps/rejected": -32.54741668701172, + "loss": 0.9457, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.051501512527466, + "rewards/margins": -1.6647934913635254, + "rewards/rejected": 3.716295003890991, + "step": 11631 + }, + { + "epoch": 1.89, + "learning_rate": 8.20098403303321e-08, + "logits/chosen": -1.2688970565795898, + "logits/rejected": -1.1926506757736206, + "logps/chosen": -114.45938873291016, + "logps/rejected": -104.59019470214844, + "loss": 0.6385, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.062410831451416, + "rewards/margins": 1.2594819068908691, + "rewards/rejected": 1.8029289245605469, + "step": 11632 + }, + { + "epoch": 1.89, + "learning_rate": 8.17729525037092e-08, + "logits/chosen": -1.2256008386611938, + "logits/rejected": -1.2079085111618042, + "logps/chosen": -63.057518005371094, + "logps/rejected": -75.11663818359375, + "loss": 0.7752, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.747615098953247, + "rewards/margins": 1.7115951776504517, + "rewards/rejected": 1.0360199213027954, + "step": 11633 + }, + { + "epoch": 1.89, + "learning_rate": 8.153640448003875e-08, + "logits/chosen": -1.5247852802276611, + "logits/rejected": -1.5957317352294922, + "logps/chosen": -237.08204650878906, + "logps/rejected": -126.98666381835938, + "loss": 0.0629, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.082579612731934, + "rewards/margins": 2.3401007652282715, + "rewards/rejected": 5.742478847503662, + "step": 11634 + }, + { + "epoch": 1.89, + "learning_rate": 8.130019627566377e-08, + "logits/chosen": -1.2657054662704468, + "logits/rejected": -1.3014521598815918, + "logps/chosen": -145.4014892578125, + "logps/rejected": -72.77281951904297, + "loss": 0.1234, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5246567726135254, + "rewards/margins": 1.6012444496154785, + "rewards/rejected": 1.9234123229980469, + "step": 11635 + }, + { + "epoch": 1.89, + "learning_rate": 8.106432790690454e-08, + "logits/chosen": -1.4631245136260986, + "logits/rejected": -1.515805959701538, + "logps/chosen": -61.15024185180664, + "logps/rejected": -105.69932556152344, + "loss": 0.7778, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8015363216400146, + "rewards/margins": -1.1975297927856445, + "rewards/rejected": 3.999066114425659, + "step": 11636 + }, + { + "epoch": 1.89, + "learning_rate": 8.082879939005638e-08, + "logits/chosen": -1.39091956615448, + "logits/rejected": -1.1143144369125366, + "logps/chosen": -136.00131225585938, + "logps/rejected": -30.097261428833008, + "loss": 0.2408, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.9828033447265625, + "rewards/margins": 5.162929058074951, + "rewards/rejected": 0.8198744058609009, + "step": 11637 + }, + { + "epoch": 1.89, + "learning_rate": 8.059361074139293e-08, + "logits/chosen": -1.2417751550674438, + "logits/rejected": -1.2417751550674438, + "logps/chosen": -10.062112808227539, + "logps/rejected": -10.062112808227539, + "loss": 0.3669, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.138496160507202, + "rewards/margins": 0.0, + "rewards/rejected": 2.138496160507202, + "step": 11638 + }, + { + "epoch": 1.89, + "learning_rate": 8.03587619771623e-08, + "logits/chosen": -1.1321462392807007, + "logits/rejected": -0.9966564178466797, + "logps/chosen": -40.37834548950195, + "logps/rejected": -22.3128604888916, + "loss": 0.1621, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7472957372665405, + "rewards/margins": 1.0902986526489258, + "rewards/rejected": 0.6569971442222595, + "step": 11639 + }, + { + "epoch": 1.89, + "learning_rate": 8.012425311359207e-08, + "logits/chosen": -1.327641248703003, + "logits/rejected": -1.3277983665466309, + "logps/chosen": -28.331560134887695, + "logps/rejected": -54.1090087890625, + "loss": 0.7397, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6435778141021729, + "rewards/margins": -1.1082556247711182, + "rewards/rejected": 2.751833438873291, + "step": 11640 + }, + { + "epoch": 1.89, + "learning_rate": 7.989008416688316e-08, + "logits/chosen": -1.1154439449310303, + "logits/rejected": -0.9285237789154053, + "logps/chosen": -55.11312484741211, + "logps/rejected": -23.64590072631836, + "loss": 0.4712, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9112744331359863, + "rewards/margins": 2.560094118118286, + "rewards/rejected": 1.3511803150177002, + "step": 11641 + }, + { + "epoch": 1.89, + "learning_rate": 7.96562551532154e-08, + "logits/chosen": -1.1429839134216309, + "logits/rejected": -0.9235692024230957, + "logps/chosen": -56.54972457885742, + "logps/rejected": -57.147708892822266, + "loss": 0.1111, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3302173614501953, + "rewards/margins": 2.137066602706909, + "rewards/rejected": 0.19315071403980255, + "step": 11642 + }, + { + "epoch": 1.89, + "learning_rate": 7.94227660887431e-08, + "logits/chosen": -1.3875017166137695, + "logits/rejected": -1.3304615020751953, + "logps/chosen": -184.9257354736328, + "logps/rejected": -89.30827331542969, + "loss": 1.4624, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.059274196624756, + "rewards/margins": -2.8693923950195312, + "rewards/rejected": 7.928666591644287, + "step": 11643 + }, + { + "epoch": 1.89, + "learning_rate": 7.918961698959892e-08, + "logits/chosen": -1.2156625986099243, + "logits/rejected": -1.152876377105713, + "logps/chosen": -36.70420837402344, + "logps/rejected": -68.88648986816406, + "loss": 0.3528, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.689716339111328, + "rewards/margins": 0.6351668834686279, + "rewards/rejected": 2.0545494556427, + "step": 11644 + }, + { + "epoch": 1.89, + "learning_rate": 7.895680787189109e-08, + "logits/chosen": -1.5995163917541504, + "logits/rejected": -1.575407862663269, + "logps/chosen": -84.83899688720703, + "logps/rejected": -68.87063598632812, + "loss": 0.8306, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.190062999725342, + "rewards/margins": 0.006496429443359375, + "rewards/rejected": 4.183566570281982, + "step": 11645 + }, + { + "epoch": 1.89, + "learning_rate": 7.872433875170449e-08, + "logits/chosen": -1.0298559665679932, + "logits/rejected": -1.1008427143096924, + "logps/chosen": -45.043006896972656, + "logps/rejected": -48.562015533447266, + "loss": 0.6852, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9804832935333252, + "rewards/margins": -0.9351680278778076, + "rewards/rejected": 2.915651321411133, + "step": 11646 + }, + { + "epoch": 1.89, + "learning_rate": 7.849220964510073e-08, + "logits/chosen": -0.9116236567497253, + "logits/rejected": -0.9116236567497253, + "logps/chosen": -50.19059753417969, + "logps/rejected": -50.19059753417969, + "loss": 0.3644, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.439218044281006, + "rewards/margins": 0.0, + "rewards/rejected": 4.439218044281006, + "step": 11647 + }, + { + "epoch": 1.89, + "learning_rate": 7.826042056811756e-08, + "logits/chosen": -1.4545748233795166, + "logits/rejected": -1.4662754535675049, + "logps/chosen": -105.0536880493164, + "logps/rejected": -113.2777099609375, + "loss": 1.4943, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.983884334564209, + "rewards/margins": -2.8657965660095215, + "rewards/rejected": 8.84968090057373, + "step": 11648 + }, + { + "epoch": 1.89, + "learning_rate": 7.80289715367688e-08, + "logits/chosen": -0.960159420967102, + "logits/rejected": -0.987927258014679, + "logps/chosen": -53.612937927246094, + "logps/rejected": -53.49427795410156, + "loss": 0.7465, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1835060119628906, + "rewards/margins": -0.5913094282150269, + "rewards/rejected": 1.7748154401779175, + "step": 11649 + }, + { + "epoch": 1.89, + "learning_rate": 7.779786256704669e-08, + "logits/chosen": -1.2115436792373657, + "logits/rejected": -1.2410532236099243, + "logps/chosen": -40.00468063354492, + "logps/rejected": -75.90349578857422, + "loss": 1.0176, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3938419818878174, + "rewards/margins": 1.4437007904052734, + "rewards/rejected": 1.950141191482544, + "step": 11650 + }, + { + "epoch": 1.89, + "learning_rate": 7.756709367491732e-08, + "logits/chosen": -1.292661428451538, + "logits/rejected": -1.1778600215911865, + "logps/chosen": -108.67933654785156, + "logps/rejected": -68.44915771484375, + "loss": 0.1885, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5511550903320312, + "rewards/margins": 1.2781462669372559, + "rewards/rejected": 2.2730088233947754, + "step": 11651 + }, + { + "epoch": 1.89, + "learning_rate": 7.733666487632574e-08, + "logits/chosen": -1.2869367599487305, + "logits/rejected": -1.4250346422195435, + "logps/chosen": -74.3109130859375, + "logps/rejected": -88.94270324707031, + "loss": 1.1033, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.534287214279175, + "rewards/margins": -2.0583198070526123, + "rewards/rejected": 4.592607021331787, + "step": 11652 + }, + { + "epoch": 1.89, + "learning_rate": 7.710657618719198e-08, + "logits/chosen": -1.4191875457763672, + "logits/rejected": -1.3084042072296143, + "logps/chosen": -76.8449478149414, + "logps/rejected": -26.939739227294922, + "loss": 0.237, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5745339393615723, + "rewards/margins": 0.5349946022033691, + "rewards/rejected": 2.039539337158203, + "step": 11653 + }, + { + "epoch": 1.89, + "learning_rate": 7.687682762341276e-08, + "logits/chosen": -1.3730523586273193, + "logits/rejected": -1.2258667945861816, + "logps/chosen": -47.74571228027344, + "logps/rejected": -16.128032684326172, + "loss": 0.4296, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9416663646698, + "rewards/margins": 2.189143419265747, + "rewards/rejected": 0.752522885799408, + "step": 11654 + }, + { + "epoch": 1.89, + "learning_rate": 7.66474192008615e-08, + "logits/chosen": -1.1542946100234985, + "logits/rejected": -1.1098908185958862, + "logps/chosen": -76.36027526855469, + "logps/rejected": -63.231971740722656, + "loss": 0.6695, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.153163194656372, + "rewards/margins": -0.378695011138916, + "rewards/rejected": 2.531858205795288, + "step": 11655 + }, + { + "epoch": 1.89, + "learning_rate": 7.641835093538885e-08, + "logits/chosen": -1.6194725036621094, + "logits/rejected": -1.6293306350708008, + "logps/chosen": -43.50396728515625, + "logps/rejected": -91.06753540039062, + "loss": 0.3045, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8525551557540894, + "rewards/margins": 0.19246912002563477, + "rewards/rejected": 1.6600860357284546, + "step": 11656 + }, + { + "epoch": 1.89, + "learning_rate": 7.61896228428205e-08, + "logits/chosen": -1.5083630084991455, + "logits/rejected": -1.490382432937622, + "logps/chosen": -84.14706420898438, + "logps/rejected": -57.89521026611328, + "loss": 0.4599, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6995155811309814, + "rewards/margins": -0.3936126232147217, + "rewards/rejected": 2.093128204345703, + "step": 11657 + }, + { + "epoch": 1.89, + "learning_rate": 7.59612349389599e-08, + "logits/chosen": -1.175153136253357, + "logits/rejected": -1.0257350206375122, + "logps/chosen": -60.59772872924805, + "logps/rejected": -13.295238494873047, + "loss": 0.0481, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.026228666305542, + "rewards/margins": 2.7391231060028076, + "rewards/rejected": 0.2871055603027344, + "step": 11658 + }, + { + "epoch": 1.89, + "learning_rate": 7.573318723958611e-08, + "logits/chosen": -1.0674043893814087, + "logits/rejected": -1.278732180595398, + "logps/chosen": -80.120361328125, + "logps/rejected": -97.20439910888672, + "loss": 0.4228, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.00808572769165, + "rewards/margins": -0.279604434967041, + "rewards/rejected": 4.287690162658691, + "step": 11659 + }, + { + "epoch": 1.89, + "learning_rate": 7.550547976045541e-08, + "logits/chosen": -1.4543161392211914, + "logits/rejected": -1.5044246912002563, + "logps/chosen": -128.45237731933594, + "logps/rejected": -60.15772247314453, + "loss": 1.076, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.095475912094116, + "rewards/margins": 0.23644804954528809, + "rewards/rejected": 1.8590278625488281, + "step": 11660 + }, + { + "epoch": 1.89, + "learning_rate": 7.527811251729966e-08, + "logits/chosen": -1.270644187927246, + "logits/rejected": -1.252914309501648, + "logps/chosen": -135.30972290039062, + "logps/rejected": -55.210731506347656, + "loss": 0.0249, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.319180488586426, + "rewards/margins": 4.975675582885742, + "rewards/rejected": 2.3435051441192627, + "step": 11661 + }, + { + "epoch": 1.89, + "learning_rate": 7.505108552582852e-08, + "logits/chosen": -1.6069480180740356, + "logits/rejected": -1.305514931678772, + "logps/chosen": -70.61595916748047, + "logps/rejected": -90.59913635253906, + "loss": 1.0352, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.870143890380859, + "rewards/margins": -1.920931339263916, + "rewards/rejected": 6.791075229644775, + "step": 11662 + }, + { + "epoch": 1.89, + "learning_rate": 7.48243988017272e-08, + "logits/chosen": -1.010472059249878, + "logits/rejected": -1.0117132663726807, + "logps/chosen": -2.4283227920532227, + "logps/rejected": -1.189662218093872, + "loss": 1.0656, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.14487695693969727, + "rewards/margins": -0.18386343121528625, + "rewards/rejected": 0.3287403881549835, + "step": 11663 + }, + { + "epoch": 1.89, + "learning_rate": 7.45980523606571e-08, + "logits/chosen": -1.1436009407043457, + "logits/rejected": -1.0859146118164062, + "logps/chosen": -46.994354248046875, + "logps/rejected": -72.66374206542969, + "loss": 0.291, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6722733974456787, + "rewards/margins": 0.6692886352539062, + "rewards/rejected": 2.0029847621917725, + "step": 11664 + }, + { + "epoch": 1.89, + "learning_rate": 7.437204621825733e-08, + "logits/chosen": -1.1846152544021606, + "logits/rejected": -1.276018500328064, + "logps/chosen": -36.845298767089844, + "logps/rejected": -43.192718505859375, + "loss": 1.4677, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4836581945419312, + "rewards/margins": -0.6297742128372192, + "rewards/rejected": 2.1134324073791504, + "step": 11665 + }, + { + "epoch": 1.89, + "learning_rate": 7.414638039014266e-08, + "logits/chosen": -1.0621628761291504, + "logits/rejected": -1.2690010070800781, + "logps/chosen": -56.78456115722656, + "logps/rejected": -129.3167724609375, + "loss": 2.4269, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8541206121444702, + "rewards/margins": -4.843221187591553, + "rewards/rejected": 6.6973419189453125, + "step": 11666 + }, + { + "epoch": 1.89, + "learning_rate": 7.392105489190338e-08, + "logits/chosen": -1.4613878726959229, + "logits/rejected": -1.3916826248168945, + "logps/chosen": -79.277099609375, + "logps/rejected": -60.76312255859375, + "loss": 0.3944, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9104012250900269, + "rewards/margins": -0.18103396892547607, + "rewards/rejected": 2.091435194015503, + "step": 11667 + }, + { + "epoch": 1.89, + "learning_rate": 7.369606973910869e-08, + "logits/chosen": -1.304925799369812, + "logits/rejected": -1.3417935371398926, + "logps/chosen": -95.01873779296875, + "logps/rejected": -190.22525024414062, + "loss": 0.2975, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6366195678710938, + "rewards/margins": 0.3563551902770996, + "rewards/rejected": 2.280264377593994, + "step": 11668 + }, + { + "epoch": 1.89, + "learning_rate": 7.347142494730231e-08, + "logits/chosen": -1.315966010093689, + "logits/rejected": -1.2819912433624268, + "logps/chosen": -63.542781829833984, + "logps/rejected": -44.78281784057617, + "loss": 0.253, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.904144048690796, + "rewards/margins": 0.4215364456176758, + "rewards/rejected": 3.48260760307312, + "step": 11669 + }, + { + "epoch": 1.89, + "learning_rate": 7.324712053200566e-08, + "logits/chosen": -1.3209205865859985, + "logits/rejected": -1.045866847038269, + "logps/chosen": -63.48584747314453, + "logps/rejected": -32.49287033081055, + "loss": 0.5612, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.302549839019775, + "rewards/margins": 5.029370307922363, + "rewards/rejected": -0.7268205881118774, + "step": 11670 + }, + { + "epoch": 1.89, + "learning_rate": 7.302315650871527e-08, + "logits/chosen": -1.1138184070587158, + "logits/rejected": -1.0837193727493286, + "logps/chosen": -42.97452163696289, + "logps/rejected": -44.66117858886719, + "loss": 0.3339, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.351688861846924, + "rewards/margins": 0.4931827783584595, + "rewards/rejected": 1.8585060834884644, + "step": 11671 + }, + { + "epoch": 1.89, + "learning_rate": 7.279953289290542e-08, + "logits/chosen": -1.3175463676452637, + "logits/rejected": -1.3175463676452637, + "logps/chosen": -54.06199645996094, + "logps/rejected": -54.06199645996094, + "loss": 0.7721, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.7827820777893066, + "rewards/margins": 0.0, + "rewards/rejected": 3.7827820777893066, + "step": 11672 + }, + { + "epoch": 1.89, + "learning_rate": 7.257624970002596e-08, + "logits/chosen": -1.030043601989746, + "logits/rejected": -0.8730795979499817, + "logps/chosen": -99.134033203125, + "logps/rejected": -22.09132957458496, + "loss": 2.1115, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1048645973205566, + "rewards/margins": 1.190410852432251, + "rewards/rejected": 0.9144536852836609, + "step": 11673 + }, + { + "epoch": 1.89, + "learning_rate": 7.235330694550402e-08, + "logits/chosen": -0.892395555973053, + "logits/rejected": -0.8675704002380371, + "logps/chosen": -42.073402404785156, + "logps/rejected": -15.179081916809082, + "loss": 0.3447, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1052677631378174, + "rewards/margins": 0.7835898399353027, + "rewards/rejected": 0.32167789340019226, + "step": 11674 + }, + { + "epoch": 1.89, + "learning_rate": 7.213070464474337e-08, + "logits/chosen": -1.396347165107727, + "logits/rejected": -1.3195468187332153, + "logps/chosen": -92.103271484375, + "logps/rejected": -124.1133041381836, + "loss": 0.1449, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1052186489105225, + "rewards/margins": 1.1029808521270752, + "rewards/rejected": 2.0022377967834473, + "step": 11675 + }, + { + "epoch": 1.9, + "learning_rate": 7.190844281312226e-08, + "logits/chosen": -1.255414605140686, + "logits/rejected": -1.255414605140686, + "logps/chosen": -41.26002883911133, + "logps/rejected": -41.26002883911133, + "loss": 0.8217, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6316158771514893, + "rewards/margins": 0.0, + "rewards/rejected": 3.6316158771514893, + "step": 11676 + }, + { + "epoch": 1.9, + "learning_rate": 7.168652146599841e-08, + "logits/chosen": -1.1143066883087158, + "logits/rejected": -1.1600611209869385, + "logps/chosen": -41.72895812988281, + "logps/rejected": -63.239688873291016, + "loss": 0.5648, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0225815773010254, + "rewards/margins": 0.35432934761047363, + "rewards/rejected": 1.6682522296905518, + "step": 11677 + }, + { + "epoch": 1.9, + "learning_rate": 7.14649406187029e-08, + "logits/chosen": -0.7411552667617798, + "logits/rejected": -0.7411552667617798, + "logps/chosen": -0.33897411823272705, + "logps/rejected": -0.33897411823272705, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.14903821051120758, + "rewards/margins": 0.0, + "rewards/rejected": 0.14903821051120758, + "step": 11678 + }, + { + "epoch": 1.9, + "learning_rate": 7.124370028654681e-08, + "logits/chosen": -0.8160727620124817, + "logits/rejected": -0.8317779898643494, + "logps/chosen": -48.14686584472656, + "logps/rejected": -52.08607482910156, + "loss": 0.2353, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1527678966522217, + "rewards/margins": 0.7075232267379761, + "rewards/rejected": 1.4452446699142456, + "step": 11679 + }, + { + "epoch": 1.9, + "learning_rate": 7.102280048481403e-08, + "logits/chosen": -1.1805237531661987, + "logits/rejected": -1.2717379331588745, + "logps/chosen": -135.80221557617188, + "logps/rejected": -114.84808349609375, + "loss": 0.564, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.080513000488281, + "rewards/margins": -0.6229324340820312, + "rewards/rejected": 7.7034454345703125, + "step": 11680 + }, + { + "epoch": 1.9, + "learning_rate": 7.080224122876734e-08, + "logits/chosen": -1.128519058227539, + "logits/rejected": -1.2303729057312012, + "logps/chosen": -32.22361373901367, + "logps/rejected": -43.849586486816406, + "loss": 0.6442, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2815799713134766, + "rewards/margins": -0.8875577449798584, + "rewards/rejected": 3.169137716293335, + "step": 11681 + }, + { + "epoch": 1.9, + "learning_rate": 7.058202253364511e-08, + "logits/chosen": -1.6799492835998535, + "logits/rejected": -1.3789910078048706, + "logps/chosen": -109.00296783447266, + "logps/rejected": -46.155147552490234, + "loss": 0.349, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.816410064697266, + "rewards/margins": 1.5618348121643066, + "rewards/rejected": 4.254575252532959, + "step": 11682 + }, + { + "epoch": 1.9, + "learning_rate": 7.03621444146635e-08, + "logits/chosen": -1.1449443101882935, + "logits/rejected": -1.1396806240081787, + "logps/chosen": -96.18047332763672, + "logps/rejected": -8.851540565490723, + "loss": 0.1899, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0019134283065796, + "rewards/margins": 0.8059995174407959, + "rewards/rejected": 0.1959138959646225, + "step": 11683 + }, + { + "epoch": 1.9, + "learning_rate": 7.014260688701202e-08, + "logits/chosen": -1.437682867050171, + "logits/rejected": -1.4525790214538574, + "logps/chosen": -112.16537475585938, + "logps/rejected": -69.9509048461914, + "loss": 0.4201, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5305466651916504, + "rewards/margins": 1.664731740951538, + "rewards/rejected": 0.8658149838447571, + "step": 11684 + }, + { + "epoch": 1.9, + "learning_rate": 6.992340996586022e-08, + "logits/chosen": -1.3877129554748535, + "logits/rejected": -1.3877129554748535, + "logps/chosen": -55.585784912109375, + "logps/rejected": -55.585784912109375, + "loss": 0.7224, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.687303304672241, + "rewards/margins": 0.0, + "rewards/rejected": 2.687303304672241, + "step": 11685 + }, + { + "epoch": 1.9, + "learning_rate": 6.970455366635154e-08, + "logits/chosen": -1.6221661567687988, + "logits/rejected": -1.5062233209609985, + "logps/chosen": -131.6830596923828, + "logps/rejected": -132.38262939453125, + "loss": 0.3837, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.002591133117676, + "rewards/margins": 0.3770599365234375, + "rewards/rejected": 6.625531196594238, + "step": 11686 + }, + { + "epoch": 1.9, + "learning_rate": 6.948603800360776e-08, + "logits/chosen": -1.2260576486587524, + "logits/rejected": -1.21309494972229, + "logps/chosen": -72.47330474853516, + "logps/rejected": -30.05773162841797, + "loss": 0.1959, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1836984157562256, + "rewards/margins": 0.9154645204544067, + "rewards/rejected": 1.2682338953018188, + "step": 11687 + }, + { + "epoch": 1.9, + "learning_rate": 6.926786299272514e-08, + "logits/chosen": -1.2729741334915161, + "logits/rejected": -1.3023220300674438, + "logps/chosen": -38.91649627685547, + "logps/rejected": -97.10163879394531, + "loss": 0.1653, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.582570791244507, + "rewards/margins": 1.1451181173324585, + "rewards/rejected": 1.4374526739120483, + "step": 11688 + }, + { + "epoch": 1.9, + "learning_rate": 6.905002864877886e-08, + "logits/chosen": -1.3039990663528442, + "logits/rejected": -0.9834210872650146, + "logps/chosen": -50.260948181152344, + "logps/rejected": -71.75613403320312, + "loss": 1.1463, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4974687099456787, + "rewards/margins": -1.8206946849822998, + "rewards/rejected": 4.3181633949279785, + "step": 11689 + }, + { + "epoch": 1.9, + "learning_rate": 6.883253498681797e-08, + "logits/chosen": -1.5068614482879639, + "logits/rejected": -1.4930310249328613, + "logps/chosen": -65.96846008300781, + "logps/rejected": -72.65435028076172, + "loss": 0.2484, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3760268688201904, + "rewards/margins": 1.0556739568710327, + "rewards/rejected": 1.3203529119491577, + "step": 11690 + }, + { + "epoch": 1.9, + "learning_rate": 6.861538202187046e-08, + "logits/chosen": -1.5325762033462524, + "logits/rejected": -1.5111266374588013, + "logps/chosen": -106.76954650878906, + "logps/rejected": -66.21548461914062, + "loss": 0.0664, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9881837368011475, + "rewards/margins": 2.0148468017578125, + "rewards/rejected": 1.9733368158340454, + "step": 11691 + }, + { + "epoch": 1.9, + "learning_rate": 6.839856976893822e-08, + "logits/chosen": -1.2253438234329224, + "logits/rejected": -1.2011959552764893, + "logps/chosen": -51.73033905029297, + "logps/rejected": -69.72643280029297, + "loss": 1.2579, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.939499616622925, + "rewards/margins": -1.8333146572113037, + "rewards/rejected": 4.7728142738342285, + "step": 11692 + }, + { + "epoch": 1.9, + "learning_rate": 6.818209824300203e-08, + "logits/chosen": -1.4378107786178589, + "logits/rejected": -1.4031239748001099, + "logps/chosen": -54.96581268310547, + "logps/rejected": -50.25189208984375, + "loss": 0.6874, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.026693105697632, + "rewards/margins": -0.6415166854858398, + "rewards/rejected": 2.6682097911834717, + "step": 11693 + }, + { + "epoch": 1.9, + "learning_rate": 6.796596745901717e-08, + "logits/chosen": -1.377057671546936, + "logits/rejected": -1.4101500511169434, + "logps/chosen": -44.5444221496582, + "logps/rejected": -51.2835578918457, + "loss": 0.8528, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.54687237739563, + "rewards/margins": -0.5249276161193848, + "rewards/rejected": 3.0717999935150146, + "step": 11694 + }, + { + "epoch": 1.9, + "learning_rate": 6.775017743191726e-08, + "logits/chosen": -1.49074125289917, + "logits/rejected": -1.3774347305297852, + "logps/chosen": -96.30561828613281, + "logps/rejected": -66.57223510742188, + "loss": 0.0328, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.266670227050781, + "rewards/margins": 3.544102430343628, + "rewards/rejected": 2.7225677967071533, + "step": 11695 + }, + { + "epoch": 1.9, + "learning_rate": 6.753472817660978e-08, + "logits/chosen": -1.5455752611160278, + "logits/rejected": -1.3021403551101685, + "logps/chosen": -121.75555419921875, + "logps/rejected": -56.202945709228516, + "loss": 0.0826, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.017053127288818, + "rewards/margins": 1.9404220581054688, + "rewards/rejected": 4.07663106918335, + "step": 11696 + }, + { + "epoch": 1.9, + "learning_rate": 6.731961970798173e-08, + "logits/chosen": -0.8316423892974854, + "logits/rejected": -0.8248628377914429, + "logps/chosen": -1.5280078649520874, + "logps/rejected": -4.150888442993164, + "loss": 0.3888, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37355706095695496, + "rewards/margins": 0.20689336955547333, + "rewards/rejected": 0.16666369140148163, + "step": 11697 + }, + { + "epoch": 1.9, + "learning_rate": 6.710485204089456e-08, + "logits/chosen": -1.3254706859588623, + "logits/rejected": -1.3668454885482788, + "logps/chosen": -100.8663330078125, + "logps/rejected": -55.06163024902344, + "loss": 0.6029, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.841090679168701, + "rewards/margins": 0.33825159072875977, + "rewards/rejected": 4.502839088439941, + "step": 11698 + }, + { + "epoch": 1.9, + "learning_rate": 6.68904251901864e-08, + "logits/chosen": -1.5186302661895752, + "logits/rejected": -1.5500202178955078, + "logps/chosen": -158.25790405273438, + "logps/rejected": -114.69937133789062, + "loss": 0.5307, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.709759473800659, + "rewards/margins": -0.5249910354614258, + "rewards/rejected": 3.234750509262085, + "step": 11699 + }, + { + "epoch": 1.9, + "learning_rate": 6.667633917067206e-08, + "logits/chosen": -1.3609025478363037, + "logits/rejected": -1.3577494621276855, + "logps/chosen": -4.152260780334473, + "logps/rejected": -2.5632591247558594, + "loss": 0.3685, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4374053180217743, + "rewards/margins": -0.010933876037597656, + "rewards/rejected": 0.44833919405937195, + "step": 11700 + }, + { + "epoch": 1.9, + "learning_rate": 6.646259399714416e-08, + "logits/chosen": -1.4437148571014404, + "logits/rejected": -1.355911135673523, + "logps/chosen": -72.48033905029297, + "logps/rejected": -65.95901489257812, + "loss": 0.3017, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3761208057403564, + "rewards/margins": 1.2197006940841675, + "rewards/rejected": 1.156420111656189, + "step": 11701 + }, + { + "epoch": 1.9, + "learning_rate": 6.624918968436812e-08, + "logits/chosen": -1.455333948135376, + "logits/rejected": -1.3248646259307861, + "logps/chosen": -65.36539459228516, + "logps/rejected": -97.0632553100586, + "loss": 0.5769, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.23475980758667, + "rewards/margins": 0.4310741424560547, + "rewards/rejected": 4.803685665130615, + "step": 11702 + }, + { + "epoch": 1.9, + "learning_rate": 6.603612624709043e-08, + "logits/chosen": -1.515937328338623, + "logits/rejected": -1.5691229104995728, + "logps/chosen": -49.32183074951172, + "logps/rejected": -101.87340545654297, + "loss": 1.1553, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.797075033187866, + "rewards/margins": -2.200040578842163, + "rewards/rejected": 5.997115612030029, + "step": 11703 + }, + { + "epoch": 1.9, + "learning_rate": 6.582340370003048e-08, + "logits/chosen": -1.1901806592941284, + "logits/rejected": -1.1818557977676392, + "logps/chosen": -1.0603832006454468, + "logps/rejected": -4.868732929229736, + "loss": 0.4645, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3621043860912323, + "rewards/margins": -0.19737842679023743, + "rewards/rejected": 0.5594828128814697, + "step": 11704 + }, + { + "epoch": 1.9, + "learning_rate": 6.561102205788539e-08, + "logits/chosen": -1.1708637475967407, + "logits/rejected": -1.075963020324707, + "logps/chosen": -65.71894836425781, + "logps/rejected": -58.488250732421875, + "loss": 0.3769, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.096200704574585, + "rewards/margins": 0.09572911262512207, + "rewards/rejected": 2.000471591949463, + "step": 11705 + }, + { + "epoch": 1.9, + "learning_rate": 6.539898133533007e-08, + "logits/chosen": -1.3186547756195068, + "logits/rejected": -1.4042751789093018, + "logps/chosen": -159.07928466796875, + "logps/rejected": -128.0577392578125, + "loss": 0.6962, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.499250888824463, + "rewards/margins": -1.1016249656677246, + "rewards/rejected": 7.6008758544921875, + "step": 11706 + }, + { + "epoch": 1.9, + "learning_rate": 6.518728154701226e-08, + "logits/chosen": -1.2898977994918823, + "logits/rejected": -1.2814258337020874, + "logps/chosen": -68.81523132324219, + "logps/rejected": -80.31327819824219, + "loss": 0.4902, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3144820928573608, + "rewards/margins": -0.296955943107605, + "rewards/rejected": 1.6114380359649658, + "step": 11707 + }, + { + "epoch": 1.9, + "learning_rate": 6.497592270756081e-08, + "logits/chosen": -0.9480113387107849, + "logits/rejected": -1.007806420326233, + "logps/chosen": -39.39257049560547, + "logps/rejected": -62.949798583984375, + "loss": 0.6003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.75798499584198, + "rewards/margins": 0.4817047119140625, + "rewards/rejected": 1.2762802839279175, + "step": 11708 + }, + { + "epoch": 1.9, + "learning_rate": 6.476490483157683e-08, + "logits/chosen": -1.1853880882263184, + "logits/rejected": -1.1853880882263184, + "logps/chosen": -24.95107650756836, + "logps/rejected": -24.95107650756836, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03280601650476456, + "rewards/margins": 0.0, + "rewards/rejected": -0.03280601650476456, + "step": 11709 + }, + { + "epoch": 1.9, + "learning_rate": 6.45542279336403e-08, + "logits/chosen": -1.528465747833252, + "logits/rejected": -1.4029675722122192, + "logps/chosen": -167.41156005859375, + "logps/rejected": -108.50276947021484, + "loss": 0.4355, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6454360485076904, + "rewards/margins": -0.3282327651977539, + "rewards/rejected": 3.9736688137054443, + "step": 11710 + }, + { + "epoch": 1.9, + "learning_rate": 6.434389202830682e-08, + "logits/chosen": -1.2406738996505737, + "logits/rejected": -1.2406738996505737, + "logps/chosen": -49.70986557006836, + "logps/rejected": -49.70986557006836, + "loss": 0.361, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2302091121673584, + "rewards/margins": 0.0, + "rewards/rejected": 2.2302091121673584, + "step": 11711 + }, + { + "epoch": 1.9, + "learning_rate": 6.41338971301092e-08, + "logits/chosen": -1.0997155904769897, + "logits/rejected": -1.106148600578308, + "logps/chosen": -36.56341552734375, + "logps/rejected": -117.9374008178711, + "loss": 0.5695, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.506157636642456, + "rewards/margins": -0.1755218505859375, + "rewards/rejected": 2.6816794872283936, + "step": 11712 + }, + { + "epoch": 1.9, + "learning_rate": 6.392424325355584e-08, + "logits/chosen": -1.4254881143569946, + "logits/rejected": -1.3886089324951172, + "logps/chosen": -83.04972839355469, + "logps/rejected": -63.14990234375, + "loss": 0.3586, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8641343116760254, + "rewards/margins": 0.7253587245941162, + "rewards/rejected": 2.138775587081909, + "step": 11713 + }, + { + "epoch": 1.9, + "learning_rate": 6.371493041313126e-08, + "logits/chosen": -1.0236682891845703, + "logits/rejected": -1.0380760431289673, + "logps/chosen": -54.59450149536133, + "logps/rejected": -46.40757751464844, + "loss": 0.8746, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.195758581161499, + "rewards/margins": -0.3376619815826416, + "rewards/rejected": 2.5334205627441406, + "step": 11714 + }, + { + "epoch": 1.9, + "learning_rate": 6.350595862329722e-08, + "logits/chosen": -0.7767164707183838, + "logits/rejected": -0.7694166302680969, + "logps/chosen": -71.82942962646484, + "logps/rejected": -52.18366241455078, + "loss": 0.3276, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.315929412841797, + "rewards/margins": 0.09796595573425293, + "rewards/rejected": 2.217963457107544, + "step": 11715 + }, + { + "epoch": 1.9, + "learning_rate": 6.329732789849275e-08, + "logits/chosen": -0.8949558734893799, + "logits/rejected": -0.9140040874481201, + "logps/chosen": -79.06574249267578, + "logps/rejected": -47.92698669433594, + "loss": 0.4698, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8226127624511719, + "rewards/margins": -0.042017340660095215, + "rewards/rejected": 1.864630103111267, + "step": 11716 + }, + { + "epoch": 1.9, + "learning_rate": 6.308903825313074e-08, + "logits/chosen": -1.2106199264526367, + "logits/rejected": -1.3311671018600464, + "logps/chosen": -130.91197204589844, + "logps/rejected": -97.34708404541016, + "loss": 0.6163, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.913810729980469, + "rewards/margins": 0.18039464950561523, + "rewards/rejected": 5.7334160804748535, + "step": 11717 + }, + { + "epoch": 1.9, + "learning_rate": 6.2881089701603e-08, + "logits/chosen": -1.3154428005218506, + "logits/rejected": -1.3651858568191528, + "logps/chosen": -74.58474731445312, + "logps/rejected": -66.2676010131836, + "loss": 1.4145, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.462811231613159, + "rewards/margins": -1.0206873416900635, + "rewards/rejected": 4.483498573303223, + "step": 11718 + }, + { + "epoch": 1.9, + "learning_rate": 6.267348225827641e-08, + "logits/chosen": -1.553784728050232, + "logits/rejected": -1.5126519203186035, + "logps/chosen": -62.980308532714844, + "logps/rejected": -156.44725036621094, + "loss": 0.7368, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.4195380210876465, + "rewards/margins": -1.095201015472412, + "rewards/rejected": 8.514739036560059, + "step": 11719 + }, + { + "epoch": 1.9, + "learning_rate": 6.246621593749446e-08, + "logits/chosen": -1.3771895170211792, + "logits/rejected": -1.353293538093567, + "logps/chosen": -43.160728454589844, + "logps/rejected": -95.88587951660156, + "loss": 0.3863, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.914320468902588, + "rewards/margins": 0.07444238662719727, + "rewards/rejected": 2.8398780822753906, + "step": 11720 + }, + { + "epoch": 1.9, + "learning_rate": 6.225929075357794e-08, + "logits/chosen": -1.0806901454925537, + "logits/rejected": -0.9161907434463501, + "logps/chosen": -63.53776550292969, + "logps/rejected": -25.099714279174805, + "loss": 0.0945, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5294220447540283, + "rewards/margins": 1.619441032409668, + "rewards/rejected": 1.9099810123443604, + "step": 11721 + }, + { + "epoch": 1.9, + "learning_rate": 6.20527067208232e-08, + "logits/chosen": -1.0519589185714722, + "logits/rejected": -1.0877262353897095, + "logps/chosen": -30.991418838500977, + "logps/rejected": -59.84095001220703, + "loss": 0.943, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9766825437545776, + "rewards/margins": -0.9803875684738159, + "rewards/rejected": 2.9570701122283936, + "step": 11722 + }, + { + "epoch": 1.9, + "learning_rate": 6.184646385350269e-08, + "logits/chosen": -1.316377878189087, + "logits/rejected": -1.3100073337554932, + "logps/chosen": -42.913841247558594, + "logps/rejected": -78.05778503417969, + "loss": 1.026, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.528540849685669, + "rewards/margins": -1.8809599876403809, + "rewards/rejected": 3.40950083732605, + "step": 11723 + }, + { + "epoch": 1.9, + "learning_rate": 6.164056216586722e-08, + "logits/chosen": -0.8067418336868286, + "logits/rejected": -0.8055784702301025, + "logps/chosen": -2.049614906311035, + "logps/rejected": -1.565247654914856, + "loss": 0.4051, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.18222156167030334, + "rewards/margins": -0.10619053244590759, + "rewards/rejected": 0.28841209411621094, + "step": 11724 + }, + { + "epoch": 1.9, + "learning_rate": 6.143500167214045e-08, + "logits/chosen": -1.1091750860214233, + "logits/rejected": -1.1139127016067505, + "logps/chosen": -54.55014419555664, + "logps/rejected": -57.18610382080078, + "loss": 0.2878, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4271602630615234, + "rewards/margins": 0.9692233800888062, + "rewards/rejected": 1.4579368829727173, + "step": 11725 + }, + { + "epoch": 1.9, + "learning_rate": 6.12297823865271e-08, + "logits/chosen": -1.193932294845581, + "logits/rejected": -1.2651454210281372, + "logps/chosen": -76.94557189941406, + "logps/rejected": -133.37442016601562, + "loss": 1.1722, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.424919128417969, + "rewards/margins": -2.2220230102539062, + "rewards/rejected": 7.646942138671875, + "step": 11726 + }, + { + "epoch": 1.9, + "learning_rate": 6.102490432320362e-08, + "logits/chosen": -1.538924217224121, + "logits/rejected": -1.4920321702957153, + "logps/chosen": -50.65503692626953, + "logps/rejected": -31.623119354248047, + "loss": 1.2201, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.351752519607544, + "rewards/margins": 2.012604236602783, + "rewards/rejected": 0.3391483426094055, + "step": 11727 + }, + { + "epoch": 1.9, + "learning_rate": 6.082036749632702e-08, + "logits/chosen": -1.3634506464004517, + "logits/rejected": -1.155480980873108, + "logps/chosen": -152.39364624023438, + "logps/rejected": -40.40126037597656, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.258983135223389, + "rewards/margins": 4.419374465942383, + "rewards/rejected": 1.8396084308624268, + "step": 11728 + }, + { + "epoch": 1.9, + "learning_rate": 6.061617192002767e-08, + "logits/chosen": -1.1516258716583252, + "logits/rejected": -1.1516258716583252, + "logps/chosen": -38.20136642456055, + "logps/rejected": -38.20136642456055, + "loss": 1.0845, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.4111647605896, + "rewards/margins": 0.0, + "rewards/rejected": 5.4111647605896, + "step": 11729 + }, + { + "epoch": 1.9, + "learning_rate": 6.041231760841426e-08, + "logits/chosen": -1.1041216850280762, + "logits/rejected": -1.3157830238342285, + "logps/chosen": -85.87897491455078, + "logps/rejected": -99.68016052246094, + "loss": 2.0243, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4818146228790283, + "rewards/margins": -3.8242170810699463, + "rewards/rejected": 6.306031703948975, + "step": 11730 + }, + { + "epoch": 1.9, + "learning_rate": 6.020880457557054e-08, + "logits/chosen": -1.379267692565918, + "logits/rejected": -1.4816315174102783, + "logps/chosen": -40.91899871826172, + "logps/rejected": -76.40625, + "loss": 1.3736, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8054649829864502, + "rewards/margins": -2.671184778213501, + "rewards/rejected": 4.476649761199951, + "step": 11731 + }, + { + "epoch": 1.9, + "learning_rate": 6.000563283555805e-08, + "logits/chosen": -1.4131569862365723, + "logits/rejected": -1.2687758207321167, + "logps/chosen": -101.9752426147461, + "logps/rejected": -56.35784149169922, + "loss": 0.0465, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.949199676513672, + "rewards/margins": 5.80135440826416, + "rewards/rejected": 1.1478455066680908, + "step": 11732 + }, + { + "epoch": 1.9, + "learning_rate": 5.980280240241332e-08, + "logits/chosen": -1.2323271036148071, + "logits/rejected": -1.2236485481262207, + "logps/chosen": -69.23770141601562, + "logps/rejected": -78.92240905761719, + "loss": 0.1376, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.045982360839844, + "rewards/margins": 1.1788878440856934, + "rewards/rejected": 2.8670945167541504, + "step": 11733 + }, + { + "epoch": 1.9, + "learning_rate": 5.96003132901507e-08, + "logits/chosen": -1.3386621475219727, + "logits/rejected": -1.3309640884399414, + "logps/chosen": -60.246726989746094, + "logps/rejected": -54.59881591796875, + "loss": 0.0996, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6497268676757812, + "rewards/margins": 1.5148032903671265, + "rewards/rejected": 1.1349235773086548, + "step": 11734 + }, + { + "epoch": 1.9, + "learning_rate": 5.939816551276012e-08, + "logits/chosen": -1.1683510541915894, + "logits/rejected": -1.1682878732681274, + "logps/chosen": -83.42475891113281, + "logps/rejected": -99.62149047851562, + "loss": 0.1119, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0897057056427, + "rewards/margins": 1.7401924133300781, + "rewards/rejected": 1.349513292312622, + "step": 11735 + }, + { + "epoch": 1.9, + "learning_rate": 5.919635908420762e-08, + "logits/chosen": -1.2103716135025024, + "logits/rejected": -0.8529422879219055, + "logps/chosen": -119.0322265625, + "logps/rejected": -18.12047576904297, + "loss": 0.1225, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.136867046356201, + "rewards/margins": 2.1234934329986572, + "rewards/rejected": 2.013373613357544, + "step": 11736 + }, + { + "epoch": 1.91, + "learning_rate": 5.89948940184365e-08, + "logits/chosen": -1.2498432397842407, + "logits/rejected": -1.2126264572143555, + "logps/chosen": -49.23846435546875, + "logps/rejected": -39.016998291015625, + "loss": 0.7769, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.806341528892517, + "rewards/margins": -0.4835876226425171, + "rewards/rejected": 2.289929151535034, + "step": 11737 + }, + { + "epoch": 1.91, + "learning_rate": 5.879377032936562e-08, + "logits/chosen": -1.3530324697494507, + "logits/rejected": -1.2908674478530884, + "logps/chosen": -56.3072509765625, + "logps/rejected": -63.79963302612305, + "loss": 0.443, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6468751430511475, + "rewards/margins": -0.3400545120239258, + "rewards/rejected": 2.9869296550750732, + "step": 11738 + }, + { + "epoch": 1.91, + "learning_rate": 5.859298803089164e-08, + "logits/chosen": -0.8906542658805847, + "logits/rejected": -0.8906542658805847, + "logps/chosen": -25.003271102905273, + "logps/rejected": -25.003271102905273, + "loss": 1.8828, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4805525541305542, + "rewards/margins": 0.0, + "rewards/rejected": 1.4805525541305542, + "step": 11739 + }, + { + "epoch": 1.91, + "learning_rate": 5.8392547136885133e-08, + "logits/chosen": -1.3807945251464844, + "logits/rejected": -1.4125704765319824, + "logps/chosen": -39.13309860229492, + "logps/rejected": -30.980363845825195, + "loss": 0.2685, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4465324878692627, + "rewards/margins": 0.6826696395874023, + "rewards/rejected": 1.7638628482818604, + "step": 11740 + }, + { + "epoch": 1.91, + "learning_rate": 5.8192447661196694e-08, + "logits/chosen": -0.9990955591201782, + "logits/rejected": -1.0128439664840698, + "logps/chosen": -40.138710021972656, + "logps/rejected": -40.352760314941406, + "loss": 0.3362, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.235952377319336, + "rewards/margins": 0.22339892387390137, + "rewards/rejected": 2.0125534534454346, + "step": 11741 + }, + { + "epoch": 1.91, + "learning_rate": 5.79926896176497e-08, + "logits/chosen": -1.4093208312988281, + "logits/rejected": -1.352912187576294, + "logps/chosen": -78.90070343017578, + "logps/rejected": -38.25566101074219, + "loss": 0.5573, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.745581030845642, + "rewards/margins": -0.7152694463729858, + "rewards/rejected": 2.460850477218628, + "step": 11742 + }, + { + "epoch": 1.91, + "learning_rate": 5.7793273020047004e-08, + "logits/chosen": -1.411808967590332, + "logits/rejected": -1.3588991165161133, + "logps/chosen": -59.401123046875, + "logps/rejected": -62.55278015136719, + "loss": 1.0187, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.343187093734741, + "rewards/margins": 0.04675912857055664, + "rewards/rejected": 3.2964279651641846, + "step": 11743 + }, + { + "epoch": 1.91, + "learning_rate": 5.75941978821648e-08, + "logits/chosen": -1.2557599544525146, + "logits/rejected": -1.1704294681549072, + "logps/chosen": -116.65663146972656, + "logps/rejected": -97.95684814453125, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.961280822753906, + "rewards/margins": 5.202112674713135, + "rewards/rejected": 0.759168267250061, + "step": 11744 + }, + { + "epoch": 1.91, + "learning_rate": 5.739546421775821e-08, + "logits/chosen": -1.2010935544967651, + "logits/rejected": -1.1738229990005493, + "logps/chosen": -85.27457427978516, + "logps/rejected": -39.44728469848633, + "loss": 0.4505, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4770530462265015, + "rewards/margins": -0.13666844367980957, + "rewards/rejected": 1.613721489906311, + "step": 11745 + }, + { + "epoch": 1.91, + "learning_rate": 5.7197072040557356e-08, + "logits/chosen": -1.328933835029602, + "logits/rejected": -1.327754020690918, + "logps/chosen": -60.41656494140625, + "logps/rejected": -118.77757263183594, + "loss": 0.205, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4539475440979004, + "rewards/margins": 0.7225006818771362, + "rewards/rejected": 1.7314468622207642, + "step": 11746 + }, + { + "epoch": 1.91, + "learning_rate": 5.6999021364270155e-08, + "logits/chosen": -1.3341985940933228, + "logits/rejected": -1.4747493267059326, + "logps/chosen": -64.891845703125, + "logps/rejected": -116.25788116455078, + "loss": 0.8171, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1674866676330566, + "rewards/margins": -0.7182822227478027, + "rewards/rejected": 3.8857688903808594, + "step": 11747 + }, + { + "epoch": 1.91, + "learning_rate": 5.680131220257901e-08, + "logits/chosen": -1.467199683189392, + "logits/rejected": -1.467199683189392, + "logps/chosen": -13.317892074584961, + "logps/rejected": -13.317892074584961, + "loss": 0.3931, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3312171697616577, + "rewards/margins": 0.0, + "rewards/rejected": 1.3312171697616577, + "step": 11748 + }, + { + "epoch": 1.91, + "learning_rate": 5.660394456914464e-08, + "logits/chosen": -1.1989854574203491, + "logits/rejected": -1.162982702255249, + "logps/chosen": -77.0185546875, + "logps/rejected": -57.629905700683594, + "loss": 0.5044, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4166488647460938, + "rewards/margins": -0.07363677024841309, + "rewards/rejected": 2.490285634994507, + "step": 11749 + }, + { + "epoch": 1.91, + "learning_rate": 5.640691847760227e-08, + "logits/chosen": -1.0920324325561523, + "logits/rejected": -1.0706262588500977, + "logps/chosen": -77.35086059570312, + "logps/rejected": -113.3454818725586, + "loss": 0.4182, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.613063097000122, + "rewards/margins": 0.5451477766036987, + "rewards/rejected": 1.0679153203964233, + "step": 11750 + }, + { + "epoch": 1.91, + "learning_rate": 5.6210233941565996e-08, + "logits/chosen": -1.251076579093933, + "logits/rejected": -1.320117712020874, + "logps/chosen": -92.0293197631836, + "logps/rejected": -108.51716613769531, + "loss": 1.5983, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0370185375213623, + "rewards/margins": -3.1487467288970947, + "rewards/rejected": 6.185765266418457, + "step": 11751 + }, + { + "epoch": 1.91, + "learning_rate": 5.6013890974623286e-08, + "logits/chosen": -1.2092028856277466, + "logits/rejected": -1.218482494354248, + "logps/chosen": -58.641963958740234, + "logps/rejected": -39.00962829589844, + "loss": 0.4883, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.810944080352783, + "rewards/margins": 0.5950419902801514, + "rewards/rejected": 3.215902090072632, + "step": 11752 + }, + { + "epoch": 1.91, + "learning_rate": 5.581788959034051e-08, + "logits/chosen": -1.0656205415725708, + "logits/rejected": -1.0656205415725708, + "logps/chosen": -98.43019104003906, + "logps/rejected": -98.43019104003906, + "loss": 0.3546, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.173661947250366, + "rewards/margins": 0.0, + "rewards/rejected": 3.173661947250366, + "step": 11753 + }, + { + "epoch": 1.91, + "learning_rate": 5.562222980225907e-08, + "logits/chosen": -0.9995510578155518, + "logits/rejected": -0.9949749708175659, + "logps/chosen": -7.827856063842773, + "logps/rejected": -13.31688404083252, + "loss": 0.6419, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.369768351316452, + "rewards/margins": -0.5598050355911255, + "rewards/rejected": 0.9295733571052551, + "step": 11754 + }, + { + "epoch": 1.91, + "learning_rate": 5.5426911623897574e-08, + "logits/chosen": -1.402307152748108, + "logits/rejected": -1.371989369392395, + "logps/chosen": -85.61753845214844, + "logps/rejected": -218.05599975585938, + "loss": 0.0693, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.826396465301514, + "rewards/margins": 3.135908842086792, + "rewards/rejected": 3.6904876232147217, + "step": 11755 + }, + { + "epoch": 1.91, + "learning_rate": 5.523193506875024e-08, + "logits/chosen": -1.4142264127731323, + "logits/rejected": -1.395123839378357, + "logps/chosen": -50.25953674316406, + "logps/rejected": -102.19413757324219, + "loss": 0.354, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2970077991485596, + "rewards/margins": 1.732100009918213, + "rewards/rejected": 0.5649078488349915, + "step": 11756 + }, + { + "epoch": 1.91, + "learning_rate": 5.503730015028908e-08, + "logits/chosen": -1.1021711826324463, + "logits/rejected": -1.1021711826324463, + "logps/chosen": -54.71868133544922, + "logps/rejected": -54.71868133544922, + "loss": 0.347, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0341179370880127, + "rewards/margins": 0.0, + "rewards/rejected": 2.0341179370880127, + "step": 11757 + }, + { + "epoch": 1.91, + "learning_rate": 5.484300688195998e-08, + "logits/chosen": -1.0417567491531372, + "logits/rejected": -1.051293969154358, + "logps/chosen": -55.50297164916992, + "logps/rejected": -122.78666687011719, + "loss": 1.7404, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.298318862915039, + "rewards/margins": 0.47550541162490845, + "rewards/rejected": 0.8228134512901306, + "step": 11758 + }, + { + "epoch": 1.91, + "learning_rate": 5.4649055277187776e-08, + "logits/chosen": -1.0929193496704102, + "logits/rejected": -1.0751899480819702, + "logps/chosen": -7.2272772789001465, + "logps/rejected": -3.680333137512207, + "loss": 0.2913, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.644684076309204, + "rewards/margins": 0.3266078233718872, + "rewards/rejected": 1.318076252937317, + "step": 11759 + }, + { + "epoch": 1.91, + "learning_rate": 5.445544534937286e-08, + "logits/chosen": -1.3429268598556519, + "logits/rejected": -1.2598838806152344, + "logps/chosen": -151.64236450195312, + "logps/rejected": -41.91615295410156, + "loss": 0.2078, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.4029388427734375, + "rewards/margins": 0.6744322776794434, + "rewards/rejected": 4.728506565093994, + "step": 11760 + }, + { + "epoch": 1.91, + "learning_rate": 5.426217711189119e-08, + "logits/chosen": -1.3559236526489258, + "logits/rejected": -1.2869116067886353, + "logps/chosen": -81.28987884521484, + "logps/rejected": -85.0897216796875, + "loss": 0.0851, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.217336177825928, + "rewards/margins": 1.7036902904510498, + "rewards/rejected": 3.513645887374878, + "step": 11761 + }, + { + "epoch": 1.91, + "learning_rate": 5.406925057809653e-08, + "logits/chosen": -1.3063124418258667, + "logits/rejected": -1.2448484897613525, + "logps/chosen": -42.6854248046875, + "logps/rejected": -54.8870849609375, + "loss": 0.3095, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9378547668457031, + "rewards/margins": 0.31077349185943604, + "rewards/rejected": 1.627081274986267, + "step": 11762 + }, + { + "epoch": 1.91, + "learning_rate": 5.387666576131711e-08, + "logits/chosen": -0.9417495727539062, + "logits/rejected": -0.9172396063804626, + "logps/chosen": -47.87358856201172, + "logps/rejected": -75.79548645019531, + "loss": 0.4487, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9610527753829956, + "rewards/margins": -0.3328360319137573, + "rewards/rejected": 2.293888807296753, + "step": 11763 + }, + { + "epoch": 1.91, + "learning_rate": 5.368442267486007e-08, + "logits/chosen": -1.7975835800170898, + "logits/rejected": -1.7069679498672485, + "logps/chosen": -68.19844055175781, + "logps/rejected": -39.73666763305664, + "loss": 0.2038, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1167755126953125, + "rewards/margins": 3.222169876098633, + "rewards/rejected": -0.10539436340332031, + "step": 11764 + }, + { + "epoch": 1.91, + "learning_rate": 5.349252133200644e-08, + "logits/chosen": -0.8818014860153198, + "logits/rejected": -0.878871500492096, + "logps/chosen": -1.973414421081543, + "logps/rejected": -7.688405990600586, + "loss": 0.3226, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.374843567609787, + "rewards/margins": 0.10202547907829285, + "rewards/rejected": 0.27281808853149414, + "step": 11765 + }, + { + "epoch": 1.91, + "learning_rate": 5.3300961746016175e-08, + "logits/chosen": -0.9469553232192993, + "logits/rejected": -0.9469553232192993, + "logps/chosen": -38.936668395996094, + "logps/rejected": -38.936668395996094, + "loss": 0.4647, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.481044054031372, + "rewards/margins": 0.0, + "rewards/rejected": 2.481044054031372, + "step": 11766 + }, + { + "epoch": 1.91, + "learning_rate": 5.310974393012258e-08, + "logits/chosen": -1.2516522407531738, + "logits/rejected": -1.1999496221542358, + "logps/chosen": -46.21865463256836, + "logps/rejected": -47.073699951171875, + "loss": 1.0133, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.407726764678955, + "rewards/margins": -0.4320080280303955, + "rewards/rejected": 2.8397347927093506, + "step": 11767 + }, + { + "epoch": 1.91, + "learning_rate": 5.291886789753786e-08, + "logits/chosen": -1.169144868850708, + "logits/rejected": -1.1354531049728394, + "logps/chosen": -100.42607116699219, + "logps/rejected": -83.83280181884766, + "loss": 0.4316, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.153337240219116, + "rewards/margins": 2.200765371322632, + "rewards/rejected": 0.9525718688964844, + "step": 11768 + }, + { + "epoch": 1.91, + "learning_rate": 5.2728333661449806e-08, + "logits/chosen": -1.2207908630371094, + "logits/rejected": -1.14692223072052, + "logps/chosen": -93.1654052734375, + "logps/rejected": -46.176239013671875, + "loss": 0.1959, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.578965902328491, + "rewards/margins": 1.1282514333724976, + "rewards/rejected": 1.4507144689559937, + "step": 11769 + }, + { + "epoch": 1.91, + "learning_rate": 5.253814123502232e-08, + "logits/chosen": -1.2905648946762085, + "logits/rejected": -1.48934006690979, + "logps/chosen": -58.199241638183594, + "logps/rejected": -122.70823669433594, + "loss": 1.3408, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.933779239654541, + "rewards/margins": -2.551633358001709, + "rewards/rejected": 5.48541259765625, + "step": 11770 + }, + { + "epoch": 1.91, + "learning_rate": 5.2348290631396014e-08, + "logits/chosen": -0.9660971760749817, + "logits/rejected": -1.1319464445114136, + "logps/chosen": -35.574462890625, + "logps/rejected": -80.56822204589844, + "loss": 0.2045, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.94176185131073, + "rewards/margins": 2.3394861221313477, + "rewards/rejected": -0.3977241516113281, + "step": 11771 + }, + { + "epoch": 1.91, + "learning_rate": 5.215878186368761e-08, + "logits/chosen": -1.8020532131195068, + "logits/rejected": -1.7851663827896118, + "logps/chosen": -78.0634765625, + "logps/rejected": -21.42009925842285, + "loss": 0.2666, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3944694995880127, + "rewards/margins": 0.36603498458862305, + "rewards/rejected": 2.0284345149993896, + "step": 11772 + }, + { + "epoch": 1.91, + "learning_rate": 5.1969614944989976e-08, + "logits/chosen": -1.3955416679382324, + "logits/rejected": -1.5566352605819702, + "logps/chosen": -68.40848541259766, + "logps/rejected": -107.37255859375, + "loss": 3.9582, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5423295497894287, + "rewards/margins": -7.222491264343262, + "rewards/rejected": 9.76482105255127, + "step": 11773 + }, + { + "epoch": 1.91, + "learning_rate": 5.178078988837432e-08, + "logits/chosen": -1.2708044052124023, + "logits/rejected": -1.347856879234314, + "logps/chosen": -88.66122436523438, + "logps/rejected": -77.97340393066406, + "loss": 1.519, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.813023567199707, + "rewards/margins": -2.244089126586914, + "rewards/rejected": 9.057112693786621, + "step": 11774 + }, + { + "epoch": 1.91, + "learning_rate": 5.1592306706884644e-08, + "logits/chosen": -1.6162270307540894, + "logits/rejected": -1.4984465837478638, + "logps/chosen": -163.65963745117188, + "logps/rejected": -84.22611236572266, + "loss": 0.3801, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.4162750244140625, + "rewards/margins": 0.8662862777709961, + "rewards/rejected": 6.549988746643066, + "step": 11775 + }, + { + "epoch": 1.91, + "learning_rate": 5.140416541354498e-08, + "logits/chosen": -1.47897207736969, + "logits/rejected": -1.481168508529663, + "logps/chosen": -64.88975524902344, + "logps/rejected": -58.57373809814453, + "loss": 0.9703, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6919265985488892, + "rewards/margins": -1.4348593950271606, + "rewards/rejected": 3.12678599357605, + "step": 11776 + }, + { + "epoch": 1.91, + "learning_rate": 5.121636602135327e-08, + "logits/chosen": -1.7052075862884521, + "logits/rejected": -1.5468357801437378, + "logps/chosen": -180.6928253173828, + "logps/rejected": -52.64513397216797, + "loss": 0.1658, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.750099182128906, + "rewards/margins": 1.1095547676086426, + "rewards/rejected": 4.640544414520264, + "step": 11777 + }, + { + "epoch": 1.91, + "learning_rate": 5.1028908543284686e-08, + "logits/chosen": -1.5231938362121582, + "logits/rejected": -1.520285725593567, + "logps/chosen": -200.43511962890625, + "logps/rejected": -69.83309936523438, + "loss": 0.0248, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.27675199508667, + "rewards/margins": 3.0818681716918945, + "rewards/rejected": 4.194883823394775, + "step": 11778 + }, + { + "epoch": 1.91, + "learning_rate": 5.084179299229053e-08, + "logits/chosen": -1.4688504934310913, + "logits/rejected": -1.228723406791687, + "logps/chosen": -106.66854095458984, + "logps/rejected": -42.83511734008789, + "loss": 0.1712, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.6657328605651855, + "rewards/margins": 0.9737629890441895, + "rewards/rejected": 4.691969871520996, + "step": 11779 + }, + { + "epoch": 1.91, + "learning_rate": 5.065501938129991e-08, + "logits/chosen": -1.4123508930206299, + "logits/rejected": -1.6070692539215088, + "logps/chosen": -83.92170715332031, + "logps/rejected": -33.95637512207031, + "loss": 0.0448, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9255967140197754, + "rewards/margins": 2.395200729370117, + "rewards/rejected": 0.5303959250450134, + "step": 11780 + }, + { + "epoch": 1.91, + "learning_rate": 5.046858772321583e-08, + "logits/chosen": -1.389085292816162, + "logits/rejected": -1.389085292816162, + "logps/chosen": -51.12660217285156, + "logps/rejected": -51.12660217285156, + "loss": 0.5695, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2351882457733154, + "rewards/margins": 0.0, + "rewards/rejected": 3.2351882457733154, + "step": 11781 + }, + { + "epoch": 1.91, + "learning_rate": 5.028249803091967e-08, + "logits/chosen": -1.3883906602859497, + "logits/rejected": -1.3532549142837524, + "logps/chosen": -41.23720169067383, + "logps/rejected": -38.98198699951172, + "loss": 1.0342, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.173727512359619, + "rewards/margins": -1.9283866882324219, + "rewards/rejected": 4.102114200592041, + "step": 11782 + }, + { + "epoch": 1.91, + "learning_rate": 5.009675031726779e-08, + "logits/chosen": -1.464666485786438, + "logits/rejected": -1.6127716302871704, + "logps/chosen": -166.93209838867188, + "logps/rejected": -129.21109008789062, + "loss": 2.2034, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.03930377960205, + "rewards/margins": 3.36232328414917, + "rewards/rejected": 5.676980495452881, + "step": 11783 + }, + { + "epoch": 1.91, + "learning_rate": 4.991134459509495e-08, + "logits/chosen": -1.2685787677764893, + "logits/rejected": -1.3001874685287476, + "logps/chosen": -70.98164367675781, + "logps/rejected": -62.54389190673828, + "loss": 0.3141, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.127389669418335, + "rewards/margins": 0.21374905109405518, + "rewards/rejected": 1.9136406183242798, + "step": 11784 + }, + { + "epoch": 1.91, + "learning_rate": 4.972628087720921e-08, + "logits/chosen": -1.0055913925170898, + "logits/rejected": -0.9619297981262207, + "logps/chosen": -63.9803466796875, + "logps/rejected": -105.87659454345703, + "loss": 0.7935, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.364324927330017, + "rewards/margins": 1.2411330938339233, + "rewards/rejected": 0.12319183349609375, + "step": 11785 + }, + { + "epoch": 1.91, + "learning_rate": 4.954155917639758e-08, + "logits/chosen": -1.3546322584152222, + "logits/rejected": -1.3309721946716309, + "logps/chosen": -18.980836868286133, + "logps/rejected": -1.9923107624053955, + "loss": 0.5827, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2801307439804077, + "rewards/margins": 0.6876398921012878, + "rewards/rejected": 0.5924908518791199, + "step": 11786 + }, + { + "epoch": 1.91, + "learning_rate": 4.9357179505422625e-08, + "logits/chosen": -1.3908541202545166, + "logits/rejected": -1.326314091682434, + "logps/chosen": -68.19425964355469, + "logps/rejected": -52.58844757080078, + "loss": 0.6319, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9077529907226562, + "rewards/margins": -0.5747811794281006, + "rewards/rejected": 3.482534170150757, + "step": 11787 + }, + { + "epoch": 1.91, + "learning_rate": 4.91731418770236e-08, + "logits/chosen": -1.3928524255752563, + "logits/rejected": -1.566124439239502, + "logps/chosen": -58.51560592651367, + "logps/rejected": -195.09259033203125, + "loss": 3.7012, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.272981643676758, + "rewards/margins": -7.2603654861450195, + "rewards/rejected": 10.533347129821777, + "step": 11788 + }, + { + "epoch": 1.91, + "learning_rate": 4.898944630391478e-08, + "logits/chosen": -0.9526135325431824, + "logits/rejected": -0.9418277144432068, + "logps/chosen": -65.01871490478516, + "logps/rejected": -85.19367980957031, + "loss": 0.5277, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7120674848556519, + "rewards/margins": 0.1922905445098877, + "rewards/rejected": 1.5197769403457642, + "step": 11789 + }, + { + "epoch": 1.91, + "learning_rate": 4.8806092798788786e-08, + "logits/chosen": -0.9980080127716064, + "logits/rejected": -0.993821918964386, + "logps/chosen": -0.9001381397247314, + "logps/rejected": -4.333057403564453, + "loss": 0.4353, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.15305840969085693, + "rewards/margins": -0.1644500494003296, + "rewards/rejected": 0.3175084590911865, + "step": 11790 + }, + { + "epoch": 1.91, + "learning_rate": 4.862308137431271e-08, + "logits/chosen": -1.146896243095398, + "logits/rejected": -1.2235666513442993, + "logps/chosen": -87.9021224975586, + "logps/rejected": -103.07615661621094, + "loss": 1.039, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1346611976623535, + "rewards/margins": -1.0966253280639648, + "rewards/rejected": 4.231286525726318, + "step": 11791 + }, + { + "epoch": 1.91, + "learning_rate": 4.8440412043131435e-08, + "logits/chosen": -1.3537356853485107, + "logits/rejected": -1.337915301322937, + "logps/chosen": -109.37621307373047, + "logps/rejected": -54.99618148803711, + "loss": 1.0976, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9695992469787598, + "rewards/margins": 0.9183087348937988, + "rewards/rejected": 2.051290512084961, + "step": 11792 + }, + { + "epoch": 1.91, + "learning_rate": 4.825808481786542e-08, + "logits/chosen": -1.2687122821807861, + "logits/rejected": -1.2687122821807861, + "logps/chosen": -31.59267807006836, + "logps/rejected": -31.59267807006836, + "loss": 0.3766, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9633636474609375, + "rewards/margins": 0.0, + "rewards/rejected": 1.9633636474609375, + "step": 11793 + }, + { + "epoch": 1.91, + "learning_rate": 4.807609971111238e-08, + "logits/chosen": -1.4416923522949219, + "logits/rejected": -1.3298225402832031, + "logps/chosen": -46.7298583984375, + "logps/rejected": -22.760616302490234, + "loss": 1.3278, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.097904920578003, + "rewards/margins": 2.21610951423645, + "rewards/rejected": 0.881795346736908, + "step": 11794 + }, + { + "epoch": 1.91, + "learning_rate": 4.789445673544502e-08, + "logits/chosen": -1.1203230619430542, + "logits/rejected": -1.1669259071350098, + "logps/chosen": -33.302490234375, + "logps/rejected": -41.54573059082031, + "loss": 0.8836, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9884880185127258, + "rewards/margins": -1.5793724060058594, + "rewards/rejected": 2.5678603649139404, + "step": 11795 + }, + { + "epoch": 1.91, + "learning_rate": 4.77131559034133e-08, + "logits/chosen": -0.9768211841583252, + "logits/rejected": -0.9768211841583252, + "logps/chosen": -58.997093200683594, + "logps/rejected": -58.997093200683594, + "loss": 0.3491, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8559211492538452, + "rewards/margins": 0.0, + "rewards/rejected": 1.8559211492538452, + "step": 11796 + }, + { + "epoch": 1.91, + "learning_rate": 4.753219722754387e-08, + "logits/chosen": -1.549670934677124, + "logits/rejected": -1.4382253885269165, + "logps/chosen": -195.39288330078125, + "logps/rejected": -66.56788635253906, + "loss": 0.7336, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.4044556617736816, + "rewards/margins": -0.290130615234375, + "rewards/rejected": 2.6945862770080566, + "step": 11797 + }, + { + "epoch": 1.91, + "learning_rate": 4.7351580720338965e-08, + "logits/chosen": -1.1540776491165161, + "logits/rejected": -1.1377512216567993, + "logps/chosen": -82.60552978515625, + "logps/rejected": -60.108795166015625, + "loss": 0.3895, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9103683233261108, + "rewards/margins": 0.2971618175506592, + "rewards/rejected": 1.6132065057754517, + "step": 11798 + }, + { + "epoch": 1.92, + "learning_rate": 4.717130639427747e-08, + "logits/chosen": -0.8677854537963867, + "logits/rejected": -0.8677854537963867, + "logps/chosen": -23.51153564453125, + "logps/rejected": -23.51153564453125, + "loss": 0.3496, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5124613046646118, + "rewards/margins": 0.0, + "rewards/rejected": 0.5124613046646118, + "step": 11799 + }, + { + "epoch": 1.92, + "learning_rate": 4.699137426181444e-08, + "logits/chosen": -0.9702420830726624, + "logits/rejected": -1.0108470916748047, + "logps/chosen": -40.293582916259766, + "logps/rejected": -219.90298461914062, + "loss": 1.9961, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.0728862285614014, + "rewards/margins": -3.5718557834625244, + "rewards/rejected": 6.644742012023926, + "step": 11800 + }, + { + "epoch": 1.92, + "learning_rate": 4.6811784335381584e-08, + "logits/chosen": -1.1565203666687012, + "logits/rejected": -1.1802053451538086, + "logps/chosen": -114.89342498779297, + "logps/rejected": -86.61555480957031, + "loss": 0.7771, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4849801063537598, + "rewards/margins": 0.07091832160949707, + "rewards/rejected": 2.4140617847442627, + "step": 11801 + }, + { + "epoch": 1.92, + "learning_rate": 4.663253662738676e-08, + "logits/chosen": -1.3363152742385864, + "logits/rejected": -1.3406802415847778, + "logps/chosen": -70.88783264160156, + "logps/rejected": -59.96241760253906, + "loss": 0.7444, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0212143659591675, + "rewards/margins": -0.0048934221267700195, + "rewards/rejected": 1.0261077880859375, + "step": 11802 + }, + { + "epoch": 1.92, + "learning_rate": 4.6453631150215065e-08, + "logits/chosen": -1.5505911111831665, + "logits/rejected": -1.2104350328445435, + "logps/chosen": -161.4393310546875, + "logps/rejected": -25.052026748657227, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.456234931945801, + "rewards/margins": 6.682276725769043, + "rewards/rejected": 0.7739583849906921, + "step": 11803 + }, + { + "epoch": 1.92, + "learning_rate": 4.6275067916226044e-08, + "logits/chosen": -1.0638847351074219, + "logits/rejected": -1.0338094234466553, + "logps/chosen": -42.55140686035156, + "logps/rejected": -70.90040588378906, + "loss": 0.7089, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2764909267425537, + "rewards/margins": -0.08433365821838379, + "rewards/rejected": 2.3608245849609375, + "step": 11804 + }, + { + "epoch": 1.92, + "learning_rate": 4.609684693775707e-08, + "logits/chosen": -0.8073712587356567, + "logits/rejected": -0.8050499558448792, + "logps/chosen": -3.1217050552368164, + "logps/rejected": -1.434571385383606, + "loss": 0.3209, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4588751792907715, + "rewards/margins": 0.13388028740882874, + "rewards/rejected": 0.32499489188194275, + "step": 11805 + }, + { + "epoch": 1.92, + "learning_rate": 4.591896822712216e-08, + "logits/chosen": -1.5425931215286255, + "logits/rejected": -1.5343314409255981, + "logps/chosen": -69.14309692382812, + "logps/rejected": -85.52098083496094, + "loss": 0.1276, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.773616313934326, + "rewards/margins": 4.49261474609375, + "rewards/rejected": 2.281001329421997, + "step": 11806 + }, + { + "epoch": 1.92, + "learning_rate": 4.5741431796610394e-08, + "logits/chosen": -1.2470096349716187, + "logits/rejected": -1.4054030179977417, + "logps/chosen": -23.866323471069336, + "logps/rejected": -33.21134948730469, + "loss": 1.4873, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.503997325897217, + "rewards/margins": -0.6318027973175049, + "rewards/rejected": 3.1358001232147217, + "step": 11807 + }, + { + "epoch": 1.92, + "learning_rate": 4.5564237658488055e-08, + "logits/chosen": -1.0321152210235596, + "logits/rejected": -1.0777392387390137, + "logps/chosen": -73.70030212402344, + "logps/rejected": -92.93966674804688, + "loss": 0.2078, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6941208839416504, + "rewards/margins": 0.9201431274414062, + "rewards/rejected": 2.773977756500244, + "step": 11808 + }, + { + "epoch": 1.92, + "learning_rate": 4.538738582499758e-08, + "logits/chosen": -1.2393122911453247, + "logits/rejected": -1.267844557762146, + "logps/chosen": -39.32726287841797, + "logps/rejected": -54.42070770263672, + "loss": 0.2629, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.740193247795105, + "rewards/margins": 0.3727356195449829, + "rewards/rejected": 1.367457628250122, + "step": 11809 + }, + { + "epoch": 1.92, + "learning_rate": 4.5210876308358077e-08, + "logits/chosen": -1.169668436050415, + "logits/rejected": -1.2015199661254883, + "logps/chosen": -64.57942962646484, + "logps/rejected": -90.98869323730469, + "loss": 0.4707, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.506826877593994, + "rewards/margins": -0.3958601951599121, + "rewards/rejected": 2.9026870727539062, + "step": 11810 + }, + { + "epoch": 1.92, + "learning_rate": 4.503470912076424e-08, + "logits/chosen": -0.819410502910614, + "logits/rejected": -0.814159095287323, + "logps/chosen": -2.195552349090576, + "logps/rejected": -22.432300567626953, + "loss": 1.3249, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29156556725502014, + "rewards/margins": 0.3380396366119385, + "rewards/rejected": -0.04647407680749893, + "step": 11811 + }, + { + "epoch": 1.92, + "learning_rate": 4.485888427438745e-08, + "logits/chosen": -1.3371515274047852, + "logits/rejected": -1.3371515274047852, + "logps/chosen": -59.88164520263672, + "logps/rejected": -59.88164520263672, + "loss": 0.837, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.682318925857544, + "rewards/margins": 0.0, + "rewards/rejected": 2.682318925857544, + "step": 11812 + }, + { + "epoch": 1.92, + "learning_rate": 4.468340178137576e-08, + "logits/chosen": -1.2805798053741455, + "logits/rejected": -1.1800315380096436, + "logps/chosen": -95.12651062011719, + "logps/rejected": -79.79356384277344, + "loss": 0.2326, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7712600231170654, + "rewards/margins": 0.7938491106033325, + "rewards/rejected": 1.977410912513733, + "step": 11813 + }, + { + "epoch": 1.92, + "learning_rate": 4.450826165385336e-08, + "logits/chosen": -1.2238075733184814, + "logits/rejected": -1.1720352172851562, + "logps/chosen": -44.102783203125, + "logps/rejected": -38.14729309082031, + "loss": 0.3654, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5338596105575562, + "rewards/margins": 0.37188565731048584, + "rewards/rejected": 1.1619739532470703, + "step": 11814 + }, + { + "epoch": 1.92, + "learning_rate": 4.4333463903921125e-08, + "logits/chosen": -1.1663726568222046, + "logits/rejected": -1.229804277420044, + "logps/chosen": -77.02677917480469, + "logps/rejected": -79.68276977539062, + "loss": 0.1897, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9398019313812256, + "rewards/margins": 0.8680710792541504, + "rewards/rejected": 2.071730852127075, + "step": 11815 + }, + { + "epoch": 1.92, + "learning_rate": 4.415900854365551e-08, + "logits/chosen": -1.3197382688522339, + "logits/rejected": -1.1829564571380615, + "logps/chosen": -84.37364196777344, + "logps/rejected": -53.81081771850586, + "loss": 0.022, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.912980556488037, + "rewards/margins": 3.2849185466766357, + "rewards/rejected": 3.6280620098114014, + "step": 11816 + }, + { + "epoch": 1.92, + "learning_rate": 4.398489558510966e-08, + "logits/chosen": -1.1366366147994995, + "logits/rejected": -1.0975048542022705, + "logps/chosen": -47.691619873046875, + "logps/rejected": -79.03919982910156, + "loss": 0.524, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.9670665860176086, + "rewards/margins": -0.4887729287147522, + "rewards/rejected": 1.4558395147323608, + "step": 11817 + }, + { + "epoch": 1.92, + "learning_rate": 4.381112504031337e-08, + "logits/chosen": -1.016664743423462, + "logits/rejected": -0.972777783870697, + "logps/chosen": -91.00721740722656, + "logps/rejected": -75.45008087158203, + "loss": 0.4754, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0154998302459717, + "rewards/margins": 1.1181960105895996, + "rewards/rejected": 1.897303819656372, + "step": 11818 + }, + { + "epoch": 1.92, + "learning_rate": 4.363769692127207e-08, + "logits/chosen": -1.4143702983856201, + "logits/rejected": -1.5302056074142456, + "logps/chosen": -144.00518798828125, + "logps/rejected": -130.81842041015625, + "loss": 0.7509, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.568899631500244, + "rewards/margins": -0.6749467849731445, + "rewards/rejected": 7.243846416473389, + "step": 11819 + }, + { + "epoch": 1.92, + "learning_rate": 4.3464611239968925e-08, + "logits/chosen": -1.4290586709976196, + "logits/rejected": -1.4484883546829224, + "logps/chosen": -26.538114547729492, + "logps/rejected": -66.69210815429688, + "loss": 0.722, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.560718059539795, + "rewards/margins": -1.1518642902374268, + "rewards/rejected": 3.7125823497772217, + "step": 11820 + }, + { + "epoch": 1.92, + "learning_rate": 4.3291868008361604e-08, + "logits/chosen": -1.3924803733825684, + "logits/rejected": -1.401406168937683, + "logps/chosen": -67.5227279663086, + "logps/rejected": -80.38967895507812, + "loss": 0.8123, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.1988112926483154, + "rewards/margins": -1.240628957748413, + "rewards/rejected": 4.4394402503967285, + "step": 11821 + }, + { + "epoch": 1.92, + "learning_rate": 4.311946723838556e-08, + "logits/chosen": -0.9148507118225098, + "logits/rejected": -0.9148507118225098, + "logps/chosen": -15.396484375, + "logps/rejected": -15.396484375, + "loss": 0.7987, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.475511372089386, + "rewards/margins": 0.0, + "rewards/rejected": 0.475511372089386, + "step": 11822 + }, + { + "epoch": 1.92, + "learning_rate": 4.2947408941951815e-08, + "logits/chosen": -1.3308247327804565, + "logits/rejected": -1.1963753700256348, + "logps/chosen": -89.14567565917969, + "logps/rejected": -19.456958770751953, + "loss": 0.4667, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.2504875659942627, + "rewards/margins": -0.1570887565612793, + "rewards/rejected": 2.407576322555542, + "step": 11823 + }, + { + "epoch": 1.92, + "learning_rate": 4.2775693130948094e-08, + "logits/chosen": -1.461620569229126, + "logits/rejected": -1.3027151823043823, + "logps/chosen": -60.850555419921875, + "logps/rejected": -32.60327911376953, + "loss": 0.2095, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.281291484832764, + "rewards/margins": 0.6538295745849609, + "rewards/rejected": 4.627461910247803, + "step": 11824 + }, + { + "epoch": 1.92, + "learning_rate": 4.260431981723823e-08, + "logits/chosen": -1.0598268508911133, + "logits/rejected": -1.0312726497650146, + "logps/chosen": -67.44837951660156, + "logps/rejected": -67.77256774902344, + "loss": 0.4145, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.795056939125061, + "rewards/margins": 1.0815117359161377, + "rewards/rejected": 0.7135452628135681, + "step": 11825 + }, + { + "epoch": 1.92, + "learning_rate": 4.243328901266219e-08, + "logits/chosen": -1.1467247009277344, + "logits/rejected": -0.911365270614624, + "logps/chosen": -57.946083068847656, + "logps/rejected": -60.03753662109375, + "loss": 1.2431, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3768486976623535, + "rewards/margins": -1.258610486984253, + "rewards/rejected": 3.6354591846466064, + "step": 11826 + }, + { + "epoch": 1.92, + "learning_rate": 4.226260072903776e-08, + "logits/chosen": -1.4646841287612915, + "logits/rejected": -1.321249008178711, + "logps/chosen": -161.19931030273438, + "logps/rejected": -35.1264762878418, + "loss": 0.17, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.066784858703613, + "rewards/margins": 2.1281728744506836, + "rewards/rejected": 3.9386119842529297, + "step": 11827 + }, + { + "epoch": 1.92, + "learning_rate": 4.209225497815661e-08, + "logits/chosen": -1.6925432682037354, + "logits/rejected": -1.644384741783142, + "logps/chosen": -193.54730224609375, + "logps/rejected": -200.59649658203125, + "loss": 1.5781, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.611944675445557, + "rewards/margins": -3.069530963897705, + "rewards/rejected": 8.681475639343262, + "step": 11828 + }, + { + "epoch": 1.92, + "learning_rate": 4.192225177178877e-08, + "logits/chosen": -1.1353259086608887, + "logits/rejected": -1.214882254600525, + "logps/chosen": -109.85983276367188, + "logps/rejected": -111.66879272460938, + "loss": 0.1492, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.31605863571167, + "rewards/margins": 1.5937776565551758, + "rewards/rejected": 5.722280979156494, + "step": 11829 + }, + { + "epoch": 1.92, + "learning_rate": 4.1752591121678733e-08, + "logits/chosen": -1.0324114561080933, + "logits/rejected": -1.0227959156036377, + "logps/chosen": -18.93631362915039, + "logps/rejected": -21.32024383544922, + "loss": 0.6306, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3308395445346832, + "rewards/margins": -0.22231921553611755, + "rewards/rejected": 0.5531587600708008, + "step": 11830 + }, + { + "epoch": 1.92, + "learning_rate": 4.15832730395499e-08, + "logits/chosen": -1.453018069267273, + "logits/rejected": -1.5021001100540161, + "logps/chosen": -55.328556060791016, + "logps/rejected": -94.38935089111328, + "loss": 1.3801, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.9053173065185547, + "rewards/margins": -1.979835033416748, + "rewards/rejected": 5.885152339935303, + "step": 11831 + }, + { + "epoch": 1.92, + "learning_rate": 4.141429753710013e-08, + "logits/chosen": -1.237463116645813, + "logits/rejected": -1.0576151609420776, + "logps/chosen": -53.599422454833984, + "logps/rejected": -33.28592300415039, + "loss": 0.4337, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2140636444091797, + "rewards/margins": 1.6588481664657593, + "rewards/rejected": 1.5552154779434204, + "step": 11832 + }, + { + "epoch": 1.92, + "learning_rate": 4.1245664626003435e-08, + "logits/chosen": -0.8986005783081055, + "logits/rejected": -0.9051946997642517, + "logps/chosen": -14.173337936401367, + "logps/rejected": -21.057716369628906, + "loss": 1.3562, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4815135896205902, + "rewards/margins": -0.2882574498653412, + "rewards/rejected": 0.7697710394859314, + "step": 11833 + }, + { + "epoch": 1.92, + "learning_rate": 4.107737431791159e-08, + "logits/chosen": -1.5118179321289062, + "logits/rejected": -1.509757399559021, + "logps/chosen": -180.42984008789062, + "logps/rejected": -80.1806869506836, + "loss": 0.4687, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.10678243637085, + "rewards/margins": 0.8932957649230957, + "rewards/rejected": 6.213486671447754, + "step": 11834 + }, + { + "epoch": 1.92, + "learning_rate": 4.0909426624450856e-08, + "logits/chosen": -1.6371992826461792, + "logits/rejected": -1.6487181186676025, + "logps/chosen": -46.74300003051758, + "logps/rejected": -66.6785659790039, + "loss": 0.5952, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3896825313568115, + "rewards/margins": 0.19445466995239258, + "rewards/rejected": 2.195227861404419, + "step": 11835 + }, + { + "epoch": 1.92, + "learning_rate": 4.074182155722583e-08, + "logits/chosen": -1.3431767225265503, + "logits/rejected": -1.3213512897491455, + "logps/chosen": -75.1104965209961, + "logps/rejected": -62.336402893066406, + "loss": 1.0011, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5893807411193848, + "rewards/margins": -1.5301742553710938, + "rewards/rejected": 4.1195549964904785, + "step": 11836 + }, + { + "epoch": 1.92, + "learning_rate": 4.0574559127815604e-08, + "logits/chosen": -1.443304181098938, + "logits/rejected": -1.3163034915924072, + "logps/chosen": -103.98554229736328, + "logps/rejected": -57.55926513671875, + "loss": 1.1939, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5619957447052, + "rewards/margins": 2.063847780227661, + "rewards/rejected": 0.49814796447753906, + "step": 11837 + }, + { + "epoch": 1.92, + "learning_rate": 4.0407639347777025e-08, + "logits/chosen": -0.9571987986564636, + "logits/rejected": -0.870545506477356, + "logps/chosen": -57.19020080566406, + "logps/rejected": -34.03590393066406, + "loss": 0.5109, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6116859912872314, + "rewards/margins": -0.1723182201385498, + "rewards/rejected": 1.7840042114257812, + "step": 11838 + }, + { + "epoch": 1.92, + "learning_rate": 4.0241062228642544e-08, + "logits/chosen": -1.2884763479232788, + "logits/rejected": -1.1980286836624146, + "logps/chosen": -97.21878814697266, + "logps/rejected": -73.74702453613281, + "loss": 0.4631, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.9216132164001465, + "rewards/margins": 3.3348867893218994, + "rewards/rejected": 2.586726427078247, + "step": 11839 + }, + { + "epoch": 1.92, + "learning_rate": 4.007482778192073e-08, + "logits/chosen": -1.0917248725891113, + "logits/rejected": -1.0835994482040405, + "logps/chosen": -1.2716822624206543, + "logps/rejected": -12.174179077148438, + "loss": 0.5378, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4096899926662445, + "rewards/margins": -0.48981961607933044, + "rewards/rejected": 0.899509608745575, + "step": 11840 + }, + { + "epoch": 1.92, + "learning_rate": 3.99089360190974e-08, + "logits/chosen": -1.085205316543579, + "logits/rejected": -1.1160622835159302, + "logps/chosen": -43.11812973022461, + "logps/rejected": -31.824012756347656, + "loss": 0.6921, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.09435539692640305, + "rewards/margins": -1.0815849304199219, + "rewards/rejected": 0.9872295260429382, + "step": 11841 + }, + { + "epoch": 1.92, + "learning_rate": 3.974338695163393e-08, + "logits/chosen": -1.282979130744934, + "logits/rejected": -1.282979130744934, + "logps/chosen": -25.434316635131836, + "logps/rejected": -25.434316635131836, + "loss": 0.7208, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1294000148773193, + "rewards/margins": 0.0, + "rewards/rejected": 2.1294000148773193, + "step": 11842 + }, + { + "epoch": 1.92, + "learning_rate": 3.957818059096785e-08, + "logits/chosen": -1.4845718145370483, + "logits/rejected": -1.4296345710754395, + "logps/chosen": -70.0661392211914, + "logps/rejected": -113.56498718261719, + "loss": 0.73, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.8793907165527344, + "rewards/margins": 1.7258093357086182, + "rewards/rejected": 2.153581380844116, + "step": 11843 + }, + { + "epoch": 1.92, + "learning_rate": 3.9413316948513356e-08, + "logits/chosen": -1.1825523376464844, + "logits/rejected": -1.324352502822876, + "logps/chosen": -133.26370239257812, + "logps/rejected": -124.94639587402344, + "loss": 1.5854, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3574081659317017, + "rewards/margins": -3.0742359161376953, + "rewards/rejected": 4.431643962860107, + "step": 11844 + }, + { + "epoch": 1.92, + "learning_rate": 3.924879603566134e-08, + "logits/chosen": -1.0149794816970825, + "logits/rejected": -1.0588375329971313, + "logps/chosen": -66.32941436767578, + "logps/rejected": -65.44252014160156, + "loss": 0.2974, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.728899359703064, + "rewards/margins": 0.26973795890808105, + "rewards/rejected": 1.459161400794983, + "step": 11845 + }, + { + "epoch": 1.92, + "learning_rate": 3.908461786377826e-08, + "logits/chosen": -1.1363952159881592, + "logits/rejected": -1.0055700540542603, + "logps/chosen": -30.835430145263672, + "logps/rejected": -5.216728210449219, + "loss": 0.7607, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6539723873138428, + "rewards/margins": 2.8587327003479004, + "rewards/rejected": 0.7952395677566528, + "step": 11846 + }, + { + "epoch": 1.92, + "learning_rate": 3.892078244420727e-08, + "logits/chosen": -0.9200108647346497, + "logits/rejected": -0.9155118465423584, + "logps/chosen": -13.175716400146484, + "logps/rejected": -8.607227325439453, + "loss": 0.3569, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5632362365722656, + "rewards/margins": 0.1823640763759613, + "rewards/rejected": 0.3808721601963043, + "step": 11847 + }, + { + "epoch": 1.92, + "learning_rate": 3.8757289788268756e-08, + "logits/chosen": -1.1918070316314697, + "logits/rejected": -1.1918070316314697, + "logps/chosen": -17.3497314453125, + "logps/rejected": -17.3497314453125, + "loss": 0.7783, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4548068940639496, + "rewards/margins": 0.0, + "rewards/rejected": 0.4548068940639496, + "step": 11848 + }, + { + "epoch": 1.92, + "learning_rate": 3.859413990725702e-08, + "logits/chosen": -1.3995075225830078, + "logits/rejected": -1.3511772155761719, + "logps/chosen": -49.861717224121094, + "logps/rejected": -45.49739074707031, + "loss": 0.4165, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.682070255279541, + "rewards/margins": 0.15339136123657227, + "rewards/rejected": 2.5286788940429688, + "step": 11849 + }, + { + "epoch": 1.92, + "learning_rate": 3.8431332812445265e-08, + "logits/chosen": -1.5432980060577393, + "logits/rejected": -1.4129670858383179, + "logps/chosen": -78.1089096069336, + "logps/rejected": -40.55600357055664, + "loss": 0.467, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.70025634765625, + "rewards/margins": 0.32184791564941406, + "rewards/rejected": 2.378408432006836, + "step": 11850 + }, + { + "epoch": 1.92, + "learning_rate": 3.8268868515081716e-08, + "logits/chosen": -1.44913911819458, + "logits/rejected": -1.4449893236160278, + "logps/chosen": -86.34632873535156, + "logps/rejected": -141.09771728515625, + "loss": 2.1941, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.36226224899292, + "rewards/margins": -2.795485019683838, + "rewards/rejected": 9.157747268676758, + "step": 11851 + }, + { + "epoch": 1.92, + "learning_rate": 3.810674702639072e-08, + "logits/chosen": -0.9293461441993713, + "logits/rejected": -0.9449703693389893, + "logps/chosen": -5.136471271514893, + "logps/rejected": -20.30853843688965, + "loss": 0.7952, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.32328563928604126, + "rewards/margins": -0.21763521432876587, + "rewards/rejected": 0.5409208536148071, + "step": 11852 + }, + { + "epoch": 1.92, + "learning_rate": 3.794496835757444e-08, + "logits/chosen": -1.1925691366195679, + "logits/rejected": -1.1219233274459839, + "logps/chosen": -78.60137939453125, + "logps/rejected": -25.802207946777344, + "loss": 0.1311, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.700703501701355, + "rewards/margins": 1.2047070264816284, + "rewards/rejected": 0.49599647521972656, + "step": 11853 + }, + { + "epoch": 1.92, + "learning_rate": 3.7783532519808376e-08, + "logits/chosen": -0.9402124881744385, + "logits/rejected": -0.9093149900436401, + "logps/chosen": -84.28797912597656, + "logps/rejected": -58.008140563964844, + "loss": 0.3356, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0970237255096436, + "rewards/margins": 1.9679869413375854, + "rewards/rejected": 1.129036784172058, + "step": 11854 + }, + { + "epoch": 1.92, + "learning_rate": 3.762243952424804e-08, + "logits/chosen": -1.3516091108322144, + "logits/rejected": -0.9826154112815857, + "logps/chosen": -94.478271484375, + "logps/rejected": -87.84630584716797, + "loss": 0.2635, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.3118438720703125, + "rewards/margins": 4.088380813598633, + "rewards/rejected": 2.2234628200531006, + "step": 11855 + }, + { + "epoch": 1.92, + "learning_rate": 3.7461689382022326e-08, + "logits/chosen": -1.356194257736206, + "logits/rejected": -1.1303969621658325, + "logps/chosen": -59.091922760009766, + "logps/rejected": -20.273544311523438, + "loss": 0.3499, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.950680136680603, + "rewards/margins": 1.457366704940796, + "rewards/rejected": 0.49331340193748474, + "step": 11856 + }, + { + "epoch": 1.92, + "learning_rate": 3.7301282104238444e-08, + "logits/chosen": -1.2602075338363647, + "logits/rejected": -1.3961944580078125, + "logps/chosen": -34.52190017700195, + "logps/rejected": -134.16729736328125, + "loss": 2.049, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.160478353500366, + "rewards/margins": -3.95943284034729, + "rewards/rejected": 6.119911193847656, + "step": 11857 + }, + { + "epoch": 1.92, + "learning_rate": 3.714121770197754e-08, + "logits/chosen": -1.440424919128418, + "logits/rejected": -1.4308767318725586, + "logps/chosen": -59.06509780883789, + "logps/rejected": -111.35586547851562, + "loss": 0.4223, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8240368366241455, + "rewards/margins": 0.7744114398956299, + "rewards/rejected": 1.0496253967285156, + "step": 11858 + }, + { + "epoch": 1.92, + "learning_rate": 3.69814961863002e-08, + "logits/chosen": -1.7260621786117554, + "logits/rejected": -1.757839322090149, + "logps/chosen": -113.54666900634766, + "logps/rejected": -103.10441589355469, + "loss": 0.9587, + "rewards/accuracies": 0.0, + "rewards/chosen": 6.249602794647217, + "rewards/margins": -1.7569661140441895, + "rewards/rejected": 8.006568908691406, + "step": 11859 + }, + { + "epoch": 1.93, + "learning_rate": 3.6822117568240945e-08, + "logits/chosen": -1.7256629467010498, + "logits/rejected": -1.6070451736450195, + "logps/chosen": -99.9120864868164, + "logps/rejected": -100.69070434570312, + "loss": 0.0263, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.682928085327148, + "rewards/margins": 5.805892467498779, + "rewards/rejected": 2.877035617828369, + "step": 11860 + }, + { + "epoch": 1.93, + "learning_rate": 3.666308185881096e-08, + "logits/chosen": -1.2115458250045776, + "logits/rejected": -1.2115458250045776, + "logps/chosen": -36.599586486816406, + "logps/rejected": -36.599586486816406, + "loss": 0.6812, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.847784399986267, + "rewards/margins": 0.0, + "rewards/rejected": 1.847784399986267, + "step": 11861 + }, + { + "epoch": 1.93, + "learning_rate": 3.650438906899867e-08, + "logits/chosen": -1.3642634153366089, + "logits/rejected": -1.3129477500915527, + "logps/chosen": -129.83474731445312, + "logps/rejected": -117.53520202636719, + "loss": 0.2144, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.24200439453125, + "rewards/margins": 1.7605619430541992, + "rewards/rejected": 5.481442451477051, + "step": 11862 + }, + { + "epoch": 1.93, + "learning_rate": 3.634603920976809e-08, + "logits/chosen": -1.3458812236785889, + "logits/rejected": -1.407767415046692, + "logps/chosen": -69.45394134521484, + "logps/rejected": -61.173946380615234, + "loss": 0.9474, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9507766962051392, + "rewards/margins": -0.35108983516693115, + "rewards/rejected": 2.3018665313720703, + "step": 11863 + }, + { + "epoch": 1.93, + "learning_rate": 3.618803229205936e-08, + "logits/chosen": -1.020338535308838, + "logits/rejected": -0.9601836800575256, + "logps/chosen": -35.74443817138672, + "logps/rejected": -22.580781936645508, + "loss": 0.7541, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.199154257774353, + "rewards/margins": -1.240571141242981, + "rewards/rejected": 2.439725399017334, + "step": 11864 + }, + { + "epoch": 1.93, + "learning_rate": 3.603036832679041e-08, + "logits/chosen": -1.31788969039917, + "logits/rejected": -1.31788969039917, + "logps/chosen": -41.051780700683594, + "logps/rejected": -41.051780700683594, + "loss": 0.5057, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.449573516845703, + "rewards/margins": 0.0, + "rewards/rejected": 2.449573516845703, + "step": 11865 + }, + { + "epoch": 1.93, + "learning_rate": 3.5873047324852526e-08, + "logits/chosen": -1.3901962041854858, + "logits/rejected": -1.2006179094314575, + "logps/chosen": -151.42935180664062, + "logps/rejected": -47.01686096191406, + "loss": 0.0261, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.173652648925781, + "rewards/margins": 3.9543166160583496, + "rewards/rejected": 4.219336032867432, + "step": 11866 + }, + { + "epoch": 1.93, + "learning_rate": 3.571606929711646e-08, + "logits/chosen": -0.8649225831031799, + "logits/rejected": -0.8650372624397278, + "logps/chosen": -1.2661453485488892, + "logps/rejected": -2.2684147357940674, + "loss": 0.3555, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3557569980621338, + "rewards/margins": -0.006350457668304443, + "rewards/rejected": 0.36210745573043823, + "step": 11867 + }, + { + "epoch": 1.93, + "learning_rate": 3.555943425442743e-08, + "logits/chosen": -1.2278236150741577, + "logits/rejected": -1.1765438318252563, + "logps/chosen": -17.712202072143555, + "logps/rejected": -2.250702381134033, + "loss": 0.8728, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.283058762550354, + "rewards/margins": 0.5741590857505798, + "rewards/rejected": 0.7088996767997742, + "step": 11868 + }, + { + "epoch": 1.93, + "learning_rate": 3.5403142207607876e-08, + "logits/chosen": -0.9885048270225525, + "logits/rejected": -0.9933171272277832, + "logps/chosen": -53.58255386352539, + "logps/rejected": -60.87742614746094, + "loss": 0.4203, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3532657623291016, + "rewards/margins": 0.6945910453796387, + "rewards/rejected": 2.658674716949463, + "step": 11869 + }, + { + "epoch": 1.93, + "learning_rate": 3.524719316745529e-08, + "logits/chosen": -0.8921826481819153, + "logits/rejected": -0.8921826481819153, + "logps/chosen": -32.39992904663086, + "logps/rejected": -32.39992904663086, + "loss": 0.4937, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.8321033716201782, + "rewards/margins": 0.0, + "rewards/rejected": 1.8321033716201782, + "step": 11870 + }, + { + "epoch": 1.93, + "learning_rate": 3.5091587144745475e-08, + "logits/chosen": -1.2083640098571777, + "logits/rejected": -1.257245659828186, + "logps/chosen": -50.67242431640625, + "logps/rejected": -125.29583740234375, + "loss": 1.2619, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.676837205886841, + "rewards/margins": 0.02371525764465332, + "rewards/rejected": 3.6531219482421875, + "step": 11871 + }, + { + "epoch": 1.93, + "learning_rate": 3.493632415022763e-08, + "logits/chosen": -1.357632040977478, + "logits/rejected": -1.2849211692810059, + "logps/chosen": -77.746337890625, + "logps/rejected": -44.17544174194336, + "loss": 0.1905, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9188919067382812, + "rewards/margins": 0.7943079471588135, + "rewards/rejected": 2.1245839595794678, + "step": 11872 + }, + { + "epoch": 1.93, + "learning_rate": 3.4781404194630386e-08, + "logits/chosen": -0.984920859336853, + "logits/rejected": -0.984920859336853, + "logps/chosen": -3.127124786376953, + "logps/rejected": -3.127124786376953, + "loss": 0.6081, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.5587871670722961, + "rewards/margins": 0.0, + "rewards/rejected": 0.5587871670722961, + "step": 11873 + }, + { + "epoch": 1.93, + "learning_rate": 3.462682728865685e-08, + "logits/chosen": -1.2348507642745972, + "logits/rejected": -1.2536157369613647, + "logps/chosen": -111.01761627197266, + "logps/rejected": -119.08784484863281, + "loss": 1.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.472920894622803, + "rewards/margins": 1.8065495491027832, + "rewards/rejected": 4.6663713455200195, + "step": 11874 + }, + { + "epoch": 1.93, + "learning_rate": 3.447259344298681e-08, + "logits/chosen": -1.062179446220398, + "logits/rejected": -1.062179446220398, + "logps/chosen": -58.06307601928711, + "logps/rejected": -58.06307601928711, + "loss": 0.4007, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.5889408588409424, + "rewards/margins": 0.0, + "rewards/rejected": 1.5889408588409424, + "step": 11875 + }, + { + "epoch": 1.93, + "learning_rate": 3.431870266827564e-08, + "logits/chosen": -1.531590223312378, + "logits/rejected": -1.4406319856643677, + "logps/chosen": -93.78829956054688, + "logps/rejected": -47.53502655029297, + "loss": 0.0555, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.935333251953125, + "rewards/margins": 3.4625468254089355, + "rewards/rejected": 1.4727863073349, + "step": 11876 + }, + { + "epoch": 1.93, + "learning_rate": 3.416515497515704e-08, + "logits/chosen": -1.117072343826294, + "logits/rejected": -1.1280075311660767, + "logps/chosen": -81.21342468261719, + "logps/rejected": -76.67999267578125, + "loss": 0.3041, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.581254720687866, + "rewards/margins": 0.7621934413909912, + "rewards/rejected": 2.819061279296875, + "step": 11877 + }, + { + "epoch": 1.93, + "learning_rate": 3.4011950374238655e-08, + "logits/chosen": -1.54252290725708, + "logits/rejected": -1.5543400049209595, + "logps/chosen": -73.33487701416016, + "logps/rejected": -92.4521713256836, + "loss": 0.1073, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.133371829986572, + "rewards/margins": 1.4384765625, + "rewards/rejected": 2.6948952674865723, + "step": 11878 + }, + { + "epoch": 1.93, + "learning_rate": 3.385908887610645e-08, + "logits/chosen": -1.240130066871643, + "logits/rejected": -1.2496265172958374, + "logps/chosen": -103.32449340820312, + "logps/rejected": -97.15507507324219, + "loss": 0.0241, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.726658582687378, + "rewards/margins": 3.128582000732422, + "rewards/rejected": 0.5980766415596008, + "step": 11879 + }, + { + "epoch": 1.93, + "learning_rate": 3.3706570491320866e-08, + "logits/chosen": -1.2250924110412598, + "logits/rejected": -1.2250924110412598, + "logps/chosen": -56.31440734863281, + "logps/rejected": -56.31440734863281, + "loss": 0.3834, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.314640998840332, + "rewards/margins": 0.0, + "rewards/rejected": 5.314640998840332, + "step": 11880 + }, + { + "epoch": 1.93, + "learning_rate": 3.355439523041959e-08, + "logits/chosen": -1.3459254503250122, + "logits/rejected": -1.2851364612579346, + "logps/chosen": -88.0078125, + "logps/rejected": -141.52008056640625, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.499493598937988, + "rewards/margins": 4.497103691101074, + "rewards/rejected": 3.002389669418335, + "step": 11881 + }, + { + "epoch": 1.93, + "learning_rate": 3.340256310391699e-08, + "logits/chosen": -1.1298022270202637, + "logits/rejected": -1.1242012977600098, + "logps/chosen": -11.572683334350586, + "logps/rejected": -14.565094947814941, + "loss": 3.9951, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.34677574038505554, + "rewards/margins": -0.46254482865333557, + "rewards/rejected": 0.8093205690383911, + "step": 11882 + }, + { + "epoch": 1.93, + "learning_rate": 3.325107412230244e-08, + "logits/chosen": -1.1455914974212646, + "logits/rejected": -1.171482801437378, + "logps/chosen": -82.82501983642578, + "logps/rejected": -122.02587127685547, + "loss": 0.3903, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.6018052101135254, + "rewards/margins": -0.11536478996276855, + "rewards/rejected": 3.717170000076294, + "step": 11883 + }, + { + "epoch": 1.93, + "learning_rate": 3.309992829604314e-08, + "logits/chosen": -1.5852936506271362, + "logits/rejected": -1.5989011526107788, + "logps/chosen": -86.67717742919922, + "logps/rejected": -70.48373413085938, + "loss": 1.313, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.611309051513672, + "rewards/margins": -1.753242015838623, + "rewards/rejected": 4.364551067352295, + "step": 11884 + }, + { + "epoch": 1.93, + "learning_rate": 3.294912563558128e-08, + "logits/chosen": -1.3036847114562988, + "logits/rejected": -1.253692865371704, + "logps/chosen": -59.774662017822266, + "logps/rejected": -60.934940338134766, + "loss": 0.5007, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8612897396087646, + "rewards/margins": -0.3430602550506592, + "rewards/rejected": 3.204349994659424, + "step": 11885 + }, + { + "epoch": 1.93, + "learning_rate": 3.279866615133687e-08, + "logits/chosen": -1.10472571849823, + "logits/rejected": -1.092677354812622, + "logps/chosen": -41.97295379638672, + "logps/rejected": -41.4432373046875, + "loss": 0.752, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3996498584747314, + "rewards/margins": 0.5559444427490234, + "rewards/rejected": 1.843705415725708, + "step": 11886 + }, + { + "epoch": 1.93, + "learning_rate": 3.2648549853703803e-08, + "logits/chosen": -1.0235931873321533, + "logits/rejected": -1.0491507053375244, + "logps/chosen": -1.270481824874878, + "logps/rejected": -29.24976348876953, + "loss": 0.5806, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.2653459906578064, + "rewards/margins": -0.5873788595199585, + "rewards/rejected": 0.8527248501777649, + "step": 11887 + }, + { + "epoch": 1.93, + "learning_rate": 3.2498776753054907e-08, + "logits/chosen": -0.9946886301040649, + "logits/rejected": -0.9930053949356079, + "logps/chosen": -21.979642868041992, + "logps/rejected": -31.704763412475586, + "loss": 0.5812, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.157234787940979, + "rewards/margins": -0.4462699890136719, + "rewards/rejected": 1.6035047769546509, + "step": 11888 + }, + { + "epoch": 1.93, + "learning_rate": 3.2349346859737454e-08, + "logits/chosen": -1.7175662517547607, + "logits/rejected": -1.563488245010376, + "logps/chosen": -93.03732299804688, + "logps/rejected": -24.017946243286133, + "loss": 0.5564, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.133459568023682, + "rewards/margins": 4.499751567840576, + "rewards/rejected": 0.6337078213691711, + "step": 11889 + }, + { + "epoch": 1.93, + "learning_rate": 3.220026018407541e-08, + "logits/chosen": -1.4015542268753052, + "logits/rejected": -1.4117772579193115, + "logps/chosen": -78.58110809326172, + "logps/rejected": -239.2803497314453, + "loss": 1.5122, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.3652000427246094, + "rewards/margins": -2.9675192832946777, + "rewards/rejected": 5.332719326019287, + "step": 11890 + }, + { + "epoch": 1.93, + "learning_rate": 3.205151673636997e-08, + "logits/chosen": -1.386925220489502, + "logits/rejected": -1.3591983318328857, + "logps/chosen": -115.92242431640625, + "logps/rejected": -119.71802520751953, + "loss": 0.7333, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.637408494949341, + "rewards/margins": 1.4059576988220215, + "rewards/rejected": 2.2314507961273193, + "step": 11891 + }, + { + "epoch": 1.93, + "learning_rate": 3.1903116526897925e-08, + "logits/chosen": -1.0571833848953247, + "logits/rejected": -1.2956959009170532, + "logps/chosen": -117.5813217163086, + "logps/rejected": -36.21250915527344, + "loss": 0.7133, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.364983558654785, + "rewards/margins": 1.0324454307556152, + "rewards/rejected": 4.33253812789917, + "step": 11892 + }, + { + "epoch": 1.93, + "learning_rate": 3.1755059565911074e-08, + "logits/chosen": -1.2980390787124634, + "logits/rejected": -1.3237504959106445, + "logps/chosen": -43.97517776489258, + "logps/rejected": -65.40581512451172, + "loss": 0.2501, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3236920833587646, + "rewards/margins": 1.3566029071807861, + "rewards/rejected": 0.9670891165733337, + "step": 11893 + }, + { + "epoch": 1.93, + "learning_rate": 3.1607345863640114e-08, + "logits/chosen": -1.2837305068969727, + "logits/rejected": -1.2764390707015991, + "logps/chosen": -45.27772903442383, + "logps/rejected": -66.05567932128906, + "loss": 0.2351, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7171132564544678, + "rewards/margins": 0.5335240364074707, + "rewards/rejected": 2.183589220046997, + "step": 11894 + }, + { + "epoch": 1.93, + "learning_rate": 3.145997543029022e-08, + "logits/chosen": -1.202424168586731, + "logits/rejected": -1.3042417764663696, + "logps/chosen": -37.49797439575195, + "logps/rejected": -100.706787109375, + "loss": 1.8452, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.715949773788452, + "rewards/margins": -3.6491963863372803, + "rewards/rejected": 6.365146160125732, + "step": 11895 + }, + { + "epoch": 1.93, + "learning_rate": 3.1312948276043254e-08, + "logits/chosen": -1.2119393348693848, + "logits/rejected": -0.903752326965332, + "logps/chosen": -58.942413330078125, + "logps/rejected": -40.798980712890625, + "loss": 0.7241, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.203100562095642, + "rewards/margins": -1.1387566328048706, + "rewards/rejected": 2.3418571949005127, + "step": 11896 + }, + { + "epoch": 1.93, + "learning_rate": 3.1166264411057745e-08, + "logits/chosen": -1.7942131757736206, + "logits/rejected": -1.74354088306427, + "logps/chosen": -149.41238403320312, + "logps/rejected": -56.03013610839844, + "loss": 0.0556, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.242109775543213, + "rewards/margins": 4.064906597137451, + "rewards/rejected": 0.1772029846906662, + "step": 11897 + }, + { + "epoch": 1.93, + "learning_rate": 3.101992384546726e-08, + "logits/chosen": -1.2598674297332764, + "logits/rejected": -1.2439262866973877, + "logps/chosen": -32.17797088623047, + "logps/rejected": -13.355318069458008, + "loss": 0.6041, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.33447265625, + "rewards/margins": 0.30806219577789307, + "rewards/rejected": 1.026410460472107, + "step": 11898 + }, + { + "epoch": 1.93, + "learning_rate": 3.087392658938315e-08, + "logits/chosen": -1.5732530355453491, + "logits/rejected": -1.3177036046981812, + "logps/chosen": -103.48109436035156, + "logps/rejected": -21.573917388916016, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": 9.26421070098877, + "rewards/margins": 8.738563537597656, + "rewards/rejected": 0.5256468057632446, + "step": 11899 + }, + { + "epoch": 1.93, + "learning_rate": 3.072827265289291e-08, + "logits/chosen": -1.4277524948120117, + "logits/rejected": -1.3963567018508911, + "logps/chosen": -81.00022888183594, + "logps/rejected": -89.0938949584961, + "loss": 0.6339, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3747589588165283, + "rewards/margins": -0.9012200832366943, + "rewards/rejected": 4.275979042053223, + "step": 11900 + }, + { + "epoch": 1.93, + "learning_rate": 3.058296204605904e-08, + "logits/chosen": -1.2360154390335083, + "logits/rejected": -1.1692062616348267, + "logps/chosen": -109.86727142333984, + "logps/rejected": -65.63327026367188, + "loss": 0.2075, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.968873023986816, + "rewards/margins": 4.1161088943481445, + "rewards/rejected": 2.852764129638672, + "step": 11901 + }, + { + "epoch": 1.93, + "learning_rate": 3.043799477892129e-08, + "logits/chosen": -1.3940269947052002, + "logits/rejected": -1.416308879852295, + "logps/chosen": -152.7633514404297, + "logps/rejected": -114.73884582519531, + "loss": 0.3293, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.001106262207031, + "rewards/margins": 0.30655527114868164, + "rewards/rejected": 5.69455099105835, + "step": 11902 + }, + { + "epoch": 1.93, + "learning_rate": 3.029337086149553e-08, + "logits/chosen": -1.3542723655700684, + "logits/rejected": -1.3925966024398804, + "logps/chosen": -81.05443572998047, + "logps/rejected": -54.851966857910156, + "loss": 0.4344, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.9011476039886475, + "rewards/margins": 1.4500932693481445, + "rewards/rejected": 2.451054334640503, + "step": 11903 + }, + { + "epoch": 1.93, + "learning_rate": 3.0149090303774334e-08, + "logits/chosen": -1.3551874160766602, + "logits/rejected": -1.2819987535476685, + "logps/chosen": -72.47600555419922, + "logps/rejected": -40.10307312011719, + "loss": 0.6283, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6765769720077515, + "rewards/margins": -0.3799225091934204, + "rewards/rejected": 2.056499481201172, + "step": 11904 + }, + { + "epoch": 1.93, + "learning_rate": 3.0005153115725826e-08, + "logits/chosen": -1.5469719171524048, + "logits/rejected": -1.5469719171524048, + "logps/chosen": -68.9105224609375, + "logps/rejected": -68.9105224609375, + "loss": 1.8273, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.033621311187744, + "rewards/margins": 0.0, + "rewards/rejected": 4.033621311187744, + "step": 11905 + }, + { + "epoch": 1.93, + "learning_rate": 2.986155930729484e-08, + "logits/chosen": -1.405032753944397, + "logits/rejected": -1.3995295763015747, + "logps/chosen": -83.92868041992188, + "logps/rejected": -56.406654357910156, + "loss": 0.3893, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.431082248687744, + "rewards/margins": 0.788668155670166, + "rewards/rejected": 1.6424140930175781, + "step": 11906 + }, + { + "epoch": 1.93, + "learning_rate": 2.971830888840177e-08, + "logits/chosen": -1.1770285367965698, + "logits/rejected": -1.0368990898132324, + "logps/chosen": -121.1651611328125, + "logps/rejected": -93.68196868896484, + "loss": 0.2632, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9169158935546875, + "rewards/margins": 0.9061988592147827, + "rewards/rejected": 1.0107170343399048, + "step": 11907 + }, + { + "epoch": 1.93, + "learning_rate": 2.957540186894481e-08, + "logits/chosen": -1.4070696830749512, + "logits/rejected": -1.4077242612838745, + "logps/chosen": -62.26274871826172, + "logps/rejected": -72.81716918945312, + "loss": 0.8372, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0540084838867188, + "rewards/margins": -1.3424973487854004, + "rewards/rejected": 3.396505832672119, + "step": 11908 + }, + { + "epoch": 1.93, + "learning_rate": 2.9432838258796614e-08, + "logits/chosen": -1.216068983078003, + "logits/rejected": -1.1511894464492798, + "logps/chosen": -62.29620361328125, + "logps/rejected": -99.0660400390625, + "loss": 0.4434, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9174020290374756, + "rewards/margins": 1.5435235500335693, + "rewards/rejected": 1.3738784790039062, + "step": 11909 + }, + { + "epoch": 1.93, + "learning_rate": 2.9290618067807642e-08, + "logits/chosen": -1.4306870698928833, + "logits/rejected": -1.3817884922027588, + "logps/chosen": -63.68848419189453, + "logps/rejected": -70.65328979492188, + "loss": 0.8134, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.373491048812866, + "rewards/margins": -1.231795310974121, + "rewards/rejected": 3.6052863597869873, + "step": 11910 + }, + { + "epoch": 1.93, + "learning_rate": 2.9148741305803364e-08, + "logits/chosen": -0.9917737245559692, + "logits/rejected": -1.0237088203430176, + "logps/chosen": -3.950624704360962, + "logps/rejected": -35.30875015258789, + "loss": 0.4279, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.19494493305683136, + "rewards/margins": -0.18517516553401947, + "rewards/rejected": 0.38012009859085083, + "step": 11911 + }, + { + "epoch": 1.93, + "learning_rate": 2.900720798258705e-08, + "logits/chosen": -1.3933888673782349, + "logits/rejected": -1.3516130447387695, + "logps/chosen": -8.641105651855469, + "logps/rejected": -16.82961082458496, + "loss": 0.3769, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9821319580078125, + "rewards/margins": 0.4155846834182739, + "rewards/rejected": 1.5665472745895386, + "step": 11912 + }, + { + "epoch": 1.93, + "learning_rate": 2.8866018107935878e-08, + "logits/chosen": -1.2518739700317383, + "logits/rejected": -1.3075406551361084, + "logps/chosen": -47.6163444519043, + "logps/rejected": -88.29241943359375, + "loss": 0.2993, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.730412721633911, + "rewards/margins": 0.20274853706359863, + "rewards/rejected": 2.5276641845703125, + "step": 11913 + }, + { + "epoch": 1.93, + "learning_rate": 2.8725171691605934e-08, + "logits/chosen": -1.249653697013855, + "logits/rejected": -1.2263072729110718, + "logps/chosen": -49.62464904785156, + "logps/rejected": -46.40032196044922, + "loss": 0.3713, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5407378673553467, + "rewards/margins": 0.7323691844940186, + "rewards/rejected": 2.808368682861328, + "step": 11914 + }, + { + "epoch": 1.93, + "learning_rate": 2.8584668743328325e-08, + "logits/chosen": -1.1806166172027588, + "logits/rejected": -1.1806166172027588, + "logps/chosen": -44.34596252441406, + "logps/rejected": -44.34596252441406, + "loss": 1.1075, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6072661876678467, + "rewards/margins": 0.0, + "rewards/rejected": 2.6072661876678467, + "step": 11915 + }, + { + "epoch": 1.93, + "learning_rate": 2.8444509272809728e-08, + "logits/chosen": -1.564250111579895, + "logits/rejected": -1.6213558912277222, + "logps/chosen": -64.03056335449219, + "logps/rejected": -46.813331604003906, + "loss": 0.7706, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1470437049865723, + "rewards/margins": 0.9284698963165283, + "rewards/rejected": 2.218573808670044, + "step": 11916 + }, + { + "epoch": 1.93, + "learning_rate": 2.8304693289734064e-08, + "logits/chosen": -1.0331438779830933, + "logits/rejected": -1.0745712518692017, + "logps/chosen": -64.20867919921875, + "logps/rejected": -92.05049133300781, + "loss": 0.3438, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.399688720703125, + "rewards/margins": 0.9714881181716919, + "rewards/rejected": 1.428200602531433, + "step": 11917 + }, + { + "epoch": 1.93, + "learning_rate": 2.816522080376194e-08, + "logits/chosen": -1.6284654140472412, + "logits/rejected": -1.566114068031311, + "logps/chosen": -95.66028594970703, + "logps/rejected": -40.49264907836914, + "loss": 0.5233, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7422332763671875, + "rewards/margins": 0.37187695503234863, + "rewards/rejected": 2.370356321334839, + "step": 11918 + }, + { + "epoch": 1.93, + "learning_rate": 2.802609182452842e-08, + "logits/chosen": -1.288779377937317, + "logits/rejected": -1.2535810470581055, + "logps/chosen": -31.313138961791992, + "logps/rejected": -27.938434600830078, + "loss": 0.5635, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.352813243865967, + "rewards/margins": 0.7009298801422119, + "rewards/rejected": 1.6518833637237549, + "step": 11919 + }, + { + "epoch": 1.93, + "learning_rate": 2.788730636164749e-08, + "logits/chosen": -1.274535894393921, + "logits/rejected": -1.199438214302063, + "logps/chosen": -139.03121948242188, + "logps/rejected": -25.746097564697266, + "loss": 0.5834, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.842492580413818, + "rewards/margins": 0.653994083404541, + "rewards/rejected": 5.188498497009277, + "step": 11920 + }, + { + "epoch": 1.93, + "learning_rate": 2.7748864424707034e-08, + "logits/chosen": -0.9781346321105957, + "logits/rejected": -0.9332790970802307, + "logps/chosen": -78.03636932373047, + "logps/rejected": -38.20967102050781, + "loss": 0.2842, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4534896612167358, + "rewards/margins": 0.9926658272743225, + "rewards/rejected": 0.46082383394241333, + "step": 11921 + }, + { + "epoch": 1.94, + "learning_rate": 2.7610766023271618e-08, + "logits/chosen": -1.314565658569336, + "logits/rejected": -1.2773292064666748, + "logps/chosen": -93.38301086425781, + "logps/rejected": -49.33618927001953, + "loss": 0.9909, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8994882106781006, + "rewards/margins": -0.7479109764099121, + "rewards/rejected": 3.6473991870880127, + "step": 11922 + }, + { + "epoch": 1.94, + "learning_rate": 2.7473011166883613e-08, + "logits/chosen": -1.2631454467773438, + "logits/rejected": -1.1129589080810547, + "logps/chosen": -36.86935806274414, + "logps/rejected": -15.347768783569336, + "loss": 0.2953, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.275022506713867, + "rewards/margins": 1.7396671772003174, + "rewards/rejected": 0.5353553891181946, + "step": 11923 + }, + { + "epoch": 1.94, + "learning_rate": 2.733559986505985e-08, + "logits/chosen": -1.545559287071228, + "logits/rejected": -1.534698486328125, + "logps/chosen": -36.18153762817383, + "logps/rejected": -32.60527038574219, + "loss": 0.608, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4785192012786865, + "rewards/margins": 0.3758687973022461, + "rewards/rejected": 2.1026504039764404, + "step": 11924 + }, + { + "epoch": 1.94, + "learning_rate": 2.7198532127294398e-08, + "logits/chosen": -1.350282907485962, + "logits/rejected": -1.4642661809921265, + "logps/chosen": -60.49681091308594, + "logps/rejected": -79.67167663574219, + "loss": 1.7681, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.9607315063476562, + "rewards/margins": -3.230947971343994, + "rewards/rejected": 6.19167947769165, + "step": 11925 + }, + { + "epoch": 1.94, + "learning_rate": 2.706180796305691e-08, + "logits/chosen": -1.3925377130508423, + "logits/rejected": -1.3925377130508423, + "logps/chosen": -75.65232849121094, + "logps/rejected": -75.65232849121094, + "loss": 0.6367, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.645822048187256, + "rewards/margins": 0.0, + "rewards/rejected": 4.645822048187256, + "step": 11926 + }, + { + "epoch": 1.94, + "learning_rate": 2.6925427381794822e-08, + "logits/chosen": -1.1419094800949097, + "logits/rejected": -1.0820963382720947, + "logps/chosen": -52.55377197265625, + "logps/rejected": -43.96906280517578, + "loss": 0.5217, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.9650039672851562, + "rewards/margins": -0.28734445571899414, + "rewards/rejected": 2.2523484230041504, + "step": 11927 + }, + { + "epoch": 1.94, + "learning_rate": 2.6789390392929493e-08, + "logits/chosen": -0.6191418170928955, + "logits/rejected": -0.6128215789794922, + "logps/chosen": -8.213247299194336, + "logps/rejected": -5.728484630584717, + "loss": 0.3443, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7853021621704102, + "rewards/margins": 0.3013932704925537, + "rewards/rejected": 0.48390889167785645, + "step": 11928 + }, + { + "epoch": 1.94, + "learning_rate": 2.6653697005860625e-08, + "logits/chosen": -0.9438913464546204, + "logits/rejected": -0.8069933652877808, + "logps/chosen": -87.92583465576172, + "logps/rejected": -20.24381446838379, + "loss": 0.4857, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.320171356201172, + "rewards/margins": 0.23469758033752441, + "rewards/rejected": 2.0854737758636475, + "step": 11929 + }, + { + "epoch": 1.94, + "learning_rate": 2.6518347229962382e-08, + "logits/chosen": -1.0317203998565674, + "logits/rejected": -1.0317203998565674, + "logps/chosen": -17.671768188476562, + "logps/rejected": -17.671768188476562, + "loss": 0.4499, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.40003204345703125, + "rewards/margins": 0.0, + "rewards/rejected": 0.40003204345703125, + "step": 11930 + }, + { + "epoch": 1.94, + "learning_rate": 2.6383341074587844e-08, + "logits/chosen": -1.4701071977615356, + "logits/rejected": -1.5100181102752686, + "logps/chosen": -32.02984619140625, + "logps/rejected": -43.58368682861328, + "loss": 0.389, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9390275478363037, + "rewards/margins": 0.1955254077911377, + "rewards/rejected": 2.743502140045166, + "step": 11931 + }, + { + "epoch": 1.94, + "learning_rate": 2.624867854906288e-08, + "logits/chosen": -1.3404972553253174, + "logits/rejected": -1.3289660215377808, + "logps/chosen": -74.33399963378906, + "logps/rejected": -83.12881469726562, + "loss": 0.1864, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.854336738586426, + "rewards/margins": 0.8019928932189941, + "rewards/rejected": 6.052343845367432, + "step": 11932 + }, + { + "epoch": 1.94, + "learning_rate": 2.6114359662692823e-08, + "logits/chosen": -1.4220449924468994, + "logits/rejected": -1.433698058128357, + "logps/chosen": -48.1922607421875, + "logps/rejected": -86.66697692871094, + "loss": 0.5233, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.518932342529297, + "rewards/margins": -0.40845489501953125, + "rewards/rejected": 2.927387237548828, + "step": 11933 + }, + { + "epoch": 1.94, + "learning_rate": 2.5980384424756366e-08, + "logits/chosen": -1.1995553970336914, + "logits/rejected": -1.3922677040100098, + "logps/chosen": -47.996646881103516, + "logps/rejected": -115.781494140625, + "loss": 1.5983, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.35363507270813, + "rewards/margins": -2.9468166828155518, + "rewards/rejected": 6.300451755523682, + "step": 11934 + }, + { + "epoch": 1.94, + "learning_rate": 2.5846752844511102e-08, + "logits/chosen": -1.5287036895751953, + "logits/rejected": -1.3940720558166504, + "logps/chosen": -102.19984436035156, + "logps/rejected": -72.75253295898438, + "loss": 0.0457, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.837977886199951, + "rewards/margins": 3.7811694145202637, + "rewards/rejected": 4.0568084716796875, + "step": 11935 + }, + { + "epoch": 1.94, + "learning_rate": 2.5713464931189647e-08, + "logits/chosen": -1.253324031829834, + "logits/rejected": -1.2929004430770874, + "logps/chosen": -63.491783142089844, + "logps/rejected": -108.37966918945312, + "loss": 1.058, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3763198852539062, + "rewards/margins": 0.37050628662109375, + "rewards/rejected": 2.0058135986328125, + "step": 11936 + }, + { + "epoch": 1.94, + "learning_rate": 2.5580520694000188e-08, + "logits/chosen": -0.9787992835044861, + "logits/rejected": -1.0144649744033813, + "logps/chosen": -1.4383482933044434, + "logps/rejected": -20.671878814697266, + "loss": 0.5266, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.24543534219264984, + "rewards/margins": -0.17977161705493927, + "rewards/rejected": 0.4252069592475891, + "step": 11937 + }, + { + "epoch": 1.94, + "learning_rate": 2.5447920142128712e-08, + "logits/chosen": -0.9285791516304016, + "logits/rejected": -0.9243517518043518, + "logps/chosen": -1.0311188697814941, + "logps/rejected": -14.59489631652832, + "loss": 0.466, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.4012579619884491, + "rewards/margins": -0.11116501688957214, + "rewards/rejected": 0.5124229788780212, + "step": 11938 + }, + { + "epoch": 1.94, + "learning_rate": 2.5315663284736225e-08, + "logits/chosen": -1.2074965238571167, + "logits/rejected": -1.1935325860977173, + "logps/chosen": -49.489837646484375, + "logps/rejected": -43.773529052734375, + "loss": 0.958, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1129196882247925, + "rewards/margins": 0.21233755350112915, + "rewards/rejected": 0.9005821347236633, + "step": 11939 + }, + { + "epoch": 1.94, + "learning_rate": 2.5183750130960415e-08, + "logits/chosen": -1.559964656829834, + "logits/rejected": -1.5277749300003052, + "logps/chosen": -83.1236572265625, + "logps/rejected": -52.12272644042969, + "loss": 0.1117, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.229422092437744, + "rewards/margins": 1.425912857055664, + "rewards/rejected": 0.8035091757774353, + "step": 11940 + }, + { + "epoch": 1.94, + "learning_rate": 2.5052180689915108e-08, + "logits/chosen": -1.143479585647583, + "logits/rejected": -1.3546310663223267, + "logps/chosen": -93.14214324951172, + "logps/rejected": -59.559730529785156, + "loss": 0.5256, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.567127227783203, + "rewards/margins": -0.408052921295166, + "rewards/rejected": 3.975180149078369, + "step": 11941 + }, + { + "epoch": 1.94, + "learning_rate": 2.49209549706908e-08, + "logits/chosen": -1.1772276163101196, + "logits/rejected": -1.1714463233947754, + "logps/chosen": -51.9638671875, + "logps/rejected": -73.53648376464844, + "loss": 0.7355, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.031407117843628, + "rewards/margins": -0.9324159622192383, + "rewards/rejected": 2.963823080062866, + "step": 11942 + }, + { + "epoch": 1.94, + "learning_rate": 2.4790072982354142e-08, + "logits/chosen": -1.2385066747665405, + "logits/rejected": -1.3160814046859741, + "logps/chosen": -68.06047058105469, + "logps/rejected": -71.4075927734375, + "loss": 0.5357, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.5687057971954346, + "rewards/margins": -0.5655884742736816, + "rewards/rejected": 3.134294271469116, + "step": 11943 + }, + { + "epoch": 1.94, + "learning_rate": 2.4659534733947333e-08, + "logits/chosen": -0.8752460479736328, + "logits/rejected": -0.8738982677459717, + "logps/chosen": -0.7219251394271851, + "logps/rejected": -13.895794868469238, + "loss": 0.9134, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.3012167513370514, + "rewards/margins": -0.5239453315734863, + "rewards/rejected": 0.8251620531082153, + "step": 11944 + }, + { + "epoch": 1.94, + "learning_rate": 2.452934023448983e-08, + "logits/chosen": -1.4014045000076294, + "logits/rejected": -1.4456852674484253, + "logps/chosen": -59.712581634521484, + "logps/rejected": -111.36264038085938, + "loss": 0.5025, + "rewards/accuracies": 0.0, + "rewards/chosen": 7.024833679199219, + "rewards/margins": -0.2602510452270508, + "rewards/rejected": 7.2850847244262695, + "step": 11945 + }, + { + "epoch": 1.94, + "learning_rate": 2.4399489492976103e-08, + "logits/chosen": -1.0996205806732178, + "logits/rejected": -1.0837650299072266, + "logps/chosen": -58.48145294189453, + "logps/rejected": -26.097230911254883, + "loss": 0.1411, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.104235887527466, + "rewards/margins": 1.248166799545288, + "rewards/rejected": 0.856069028377533, + "step": 11946 + }, + { + "epoch": 1.94, + "learning_rate": 2.4269982518378422e-08, + "logits/chosen": -1.266317367553711, + "logits/rejected": -1.1947216987609863, + "logps/chosen": -192.54428100585938, + "logps/rejected": -23.169677734375, + "loss": 0.0382, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.754367351531982, + "rewards/margins": 5.257994651794434, + "rewards/rejected": 2.496372938156128, + "step": 11947 + }, + { + "epoch": 1.94, + "learning_rate": 2.4140819319644072e-08, + "logits/chosen": -1.3227801322937012, + "logits/rejected": -1.162922978401184, + "logps/chosen": -85.79214477539062, + "logps/rejected": -56.551944732666016, + "loss": 1.4771, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.090617656707764, + "rewards/margins": 3.7233901023864746, + "rewards/rejected": 3.367227554321289, + "step": 11948 + }, + { + "epoch": 1.94, + "learning_rate": 2.4011999905697582e-08, + "logits/chosen": -1.494375467300415, + "logits/rejected": -1.5206248760223389, + "logps/chosen": -68.43873596191406, + "logps/rejected": -96.05874633789062, + "loss": 0.4715, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.79290771484375, + "rewards/margins": 0.602423906326294, + "rewards/rejected": 2.190483808517456, + "step": 11949 + }, + { + "epoch": 1.94, + "learning_rate": 2.388352428543794e-08, + "logits/chosen": -1.3552560806274414, + "logits/rejected": -1.3416167497634888, + "logps/chosen": -45.65856170654297, + "logps/rejected": -73.2791748046875, + "loss": 0.9759, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.8923370838165283, + "rewards/margins": -1.7623693943023682, + "rewards/rejected": 4.6547064781188965, + "step": 11950 + }, + { + "epoch": 1.94, + "learning_rate": 2.3755392467742498e-08, + "logits/chosen": -1.3423304557800293, + "logits/rejected": -1.3512271642684937, + "logps/chosen": -40.25725555419922, + "logps/rejected": -49.97910690307617, + "loss": 0.3784, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.95588755607605, + "rewards/margins": 0.6980135440826416, + "rewards/rejected": 2.257874011993408, + "step": 11951 + }, + { + "epoch": 1.94, + "learning_rate": 2.362760446146417e-08, + "logits/chosen": -0.9618549346923828, + "logits/rejected": -1.0214778184890747, + "logps/chosen": -68.52911376953125, + "logps/rejected": -74.99000549316406, + "loss": 2.1089, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.1215118169784546, + "rewards/margins": -2.7611160278320312, + "rewards/rejected": 3.8826279640197754, + "step": 11952 + }, + { + "epoch": 1.94, + "learning_rate": 2.3500160275430895e-08, + "logits/chosen": -1.5534247159957886, + "logits/rejected": -0.9140369892120361, + "logps/chosen": -75.25434875488281, + "logps/rejected": -83.83778381347656, + "loss": 0.311, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.9310760498046875, + "rewards/margins": 0.9013152122497559, + "rewards/rejected": 4.029760837554932, + "step": 11953 + }, + { + "epoch": 1.94, + "learning_rate": 2.3373059918448958e-08, + "logits/chosen": -1.3708511590957642, + "logits/rejected": -1.3224397897720337, + "logps/chosen": -117.3520736694336, + "logps/rejected": -88.24359893798828, + "loss": 0.1134, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9266107082366943, + "rewards/margins": 1.4401787519454956, + "rewards/rejected": 1.4864319562911987, + "step": 11954 + }, + { + "epoch": 1.94, + "learning_rate": 2.324630339929912e-08, + "logits/chosen": -1.2161526679992676, + "logits/rejected": -1.1730877161026, + "logps/chosen": -57.442710876464844, + "logps/rejected": -73.20569610595703, + "loss": 0.3939, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.237027883529663, + "rewards/margins": -0.00243377685546875, + "rewards/rejected": 2.239461660385132, + "step": 11955 + }, + { + "epoch": 1.94, + "learning_rate": 2.311989072673937e-08, + "logits/chosen": -1.3510076999664307, + "logits/rejected": -1.3932005167007446, + "logps/chosen": -104.97271728515625, + "logps/rejected": -62.35231018066406, + "loss": 0.1385, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.1145524978637695, + "rewards/margins": 2.5311546325683594, + "rewards/rejected": 4.58339786529541, + "step": 11956 + }, + { + "epoch": 1.94, + "learning_rate": 2.2993821909503278e-08, + "logits/chosen": -1.2353943586349487, + "logits/rejected": -1.2917560338974, + "logps/chosen": -26.901662826538086, + "logps/rejected": -76.2203140258789, + "loss": 0.7516, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.0096086263656616, + "rewards/margins": -1.1806105375289917, + "rewards/rejected": 2.1902191638946533, + "step": 11957 + }, + { + "epoch": 1.94, + "learning_rate": 2.28680969563011e-08, + "logits/chosen": -1.3076893091201782, + "logits/rejected": -1.230114221572876, + "logps/chosen": -70.63922119140625, + "logps/rejected": -33.513267517089844, + "loss": 0.5196, + "rewards/accuracies": 0.0, + "rewards/chosen": 4.38153600692749, + "rewards/margins": -0.5511283874511719, + "rewards/rejected": 4.932664394378662, + "step": 11958 + }, + { + "epoch": 1.94, + "learning_rate": 2.2742715875819778e-08, + "logits/chosen": -1.1428604125976562, + "logits/rejected": -1.0831900835037231, + "logps/chosen": -43.5709228515625, + "logps/rejected": -44.898536682128906, + "loss": 0.9175, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.6115611791610718, + "rewards/margins": -0.44322478771209717, + "rewards/rejected": 2.054785966873169, + "step": 11959 + }, + { + "epoch": 1.94, + "learning_rate": 2.2617678676721268e-08, + "logits/chosen": -1.6637115478515625, + "logits/rejected": -1.6498472690582275, + "logps/chosen": -88.75177001953125, + "logps/rejected": -68.06214141845703, + "loss": 0.726, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.9914276599884033, + "rewards/margins": -0.2040574550628662, + "rewards/rejected": 4.1954851150512695, + "step": 11960 + }, + { + "epoch": 1.94, + "learning_rate": 2.249298536764477e-08, + "logits/chosen": -1.3336639404296875, + "logits/rejected": -1.375652551651001, + "logps/chosen": -43.206581115722656, + "logps/rejected": -112.53864288330078, + "loss": 2.6932, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.4054107666015625, + "rewards/margins": -2.555828094482422, + "rewards/rejected": 5.961238861083984, + "step": 11961 + }, + { + "epoch": 1.94, + "learning_rate": 2.236863595720562e-08, + "logits/chosen": -1.273216724395752, + "logits/rejected": -1.3151541948318481, + "logps/chosen": -135.21188354492188, + "logps/rejected": -120.20484924316406, + "loss": 0.0389, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.524264812469482, + "rewards/margins": 2.561433792114258, + "rewards/rejected": 1.9628311395645142, + "step": 11962 + }, + { + "epoch": 1.94, + "learning_rate": 2.2244630453994165e-08, + "logits/chosen": -0.9564872980117798, + "logits/rejected": -0.9791176319122314, + "logps/chosen": -68.27162170410156, + "logps/rejected": -100.88694763183594, + "loss": 0.3872, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.429086446762085, + "rewards/margins": 1.6464265584945679, + "rewards/rejected": 1.782659888267517, + "step": 11963 + }, + { + "epoch": 1.94, + "learning_rate": 2.212096886657966e-08, + "logits/chosen": -1.424169898033142, + "logits/rejected": -1.2840189933776855, + "logps/chosen": -51.87244415283203, + "logps/rejected": -25.241943359375, + "loss": 0.272, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2182412147521973, + "rewards/margins": 3.412973403930664, + "rewards/rejected": -0.19473209977149963, + "step": 11964 + }, + { + "epoch": 1.94, + "learning_rate": 2.1997651203504722e-08, + "logits/chosen": -1.56109619140625, + "logits/rejected": -1.5443342924118042, + "logps/chosen": -70.53117370605469, + "logps/rejected": -76.33131408691406, + "loss": 0.4456, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.907872200012207, + "rewards/margins": -0.3394041061401367, + "rewards/rejected": 6.247276306152344, + "step": 11965 + }, + { + "epoch": 1.94, + "learning_rate": 2.1874677473289753e-08, + "logits/chosen": -1.2823970317840576, + "logits/rejected": -1.2747457027435303, + "logps/chosen": -39.38629913330078, + "logps/rejected": -51.22692108154297, + "loss": 1.9883, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0418758392333984, + "rewards/margins": 0.10853302478790283, + "rewards/rejected": 1.9333428144454956, + "step": 11966 + }, + { + "epoch": 1.94, + "learning_rate": 2.175204768443129e-08, + "logits/chosen": -1.3005220890045166, + "logits/rejected": -1.3385907411575317, + "logps/chosen": -58.94187927246094, + "logps/rejected": -107.7550048828125, + "loss": 1.0349, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.3721526861190796, + "rewards/margins": -1.0876039266586304, + "rewards/rejected": 2.45975661277771, + "step": 11967 + }, + { + "epoch": 1.94, + "learning_rate": 2.1629761845401442e-08, + "logits/chosen": -1.2556661367416382, + "logits/rejected": -1.2531342506408691, + "logps/chosen": -1.627548098564148, + "logps/rejected": -1.5726011991500854, + "loss": 0.3669, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4390568435192108, + "rewards/margins": 0.008132845163345337, + "rewards/rejected": 0.4309239983558655, + "step": 11968 + }, + { + "epoch": 1.94, + "learning_rate": 2.1507819964649568e-08, + "logits/chosen": -1.3823293447494507, + "logits/rejected": -1.2862682342529297, + "logps/chosen": -44.170955657958984, + "logps/rejected": -19.477136611938477, + "loss": 0.2154, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3666598796844482, + "rewards/margins": 0.8428037166595459, + "rewards/rejected": 1.5238561630249023, + "step": 11969 + }, + { + "epoch": 1.94, + "learning_rate": 2.1386222050600036e-08, + "logits/chosen": -1.3131487369537354, + "logits/rejected": -1.3444476127624512, + "logps/chosen": -78.14549255371094, + "logps/rejected": -122.59976196289062, + "loss": 1.29, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.0819015502929688, + "rewards/margins": -0.4421219825744629, + "rewards/rejected": 2.5240235328674316, + "step": 11970 + }, + { + "epoch": 1.94, + "learning_rate": 2.1264968111655015e-08, + "logits/chosen": -0.8533851504325867, + "logits/rejected": -0.8555752038955688, + "logps/chosen": -0.7242964506149292, + "logps/rejected": -13.65360164642334, + "loss": 0.3353, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35139980912208557, + "rewards/margins": 0.20320120453834534, + "rewards/rejected": 0.14819860458374023, + "step": 11971 + }, + { + "epoch": 1.94, + "learning_rate": 2.1144058156191138e-08, + "logits/chosen": -1.6136852502822876, + "logits/rejected": -1.723294734954834, + "logps/chosen": -88.90666961669922, + "logps/rejected": -137.86331176757812, + "loss": 0.7284, + "rewards/accuracies": 0.0, + "rewards/chosen": 5.959693431854248, + "rewards/margins": -1.0910038948059082, + "rewards/rejected": 7.050697326660156, + "step": 11972 + }, + { + "epoch": 1.94, + "learning_rate": 2.1023492192562277e-08, + "logits/chosen": -1.145395040512085, + "logits/rejected": -1.258946418762207, + "logps/chosen": -49.35881423950195, + "logps/rejected": -90.40772247314453, + "loss": 0.482, + "rewards/accuracies": 0.0, + "rewards/chosen": 1.4817135334014893, + "rewards/margins": -0.4805492162704468, + "rewards/rejected": 1.962262749671936, + "step": 11973 + }, + { + "epoch": 1.94, + "learning_rate": 2.0903270229098992e-08, + "logits/chosen": -1.382939338684082, + "logits/rejected": -1.2489421367645264, + "logps/chosen": -102.14982604980469, + "logps/rejected": -34.78037643432617, + "loss": 0.3544, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4787185192108154, + "rewards/margins": 1.126530647277832, + "rewards/rejected": 2.3521878719329834, + "step": 11974 + }, + { + "epoch": 1.94, + "learning_rate": 2.078339227410686e-08, + "logits/chosen": -0.9607126712799072, + "logits/rejected": -0.9266356825828552, + "logps/chosen": -96.63041687011719, + "logps/rejected": -50.76914978027344, + "loss": 0.0751, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7224724292755127, + "rewards/margins": 1.8288662433624268, + "rewards/rejected": 0.8936061859130859, + "step": 11975 + }, + { + "epoch": 1.94, + "learning_rate": 2.0663858335868704e-08, + "logits/chosen": -1.5345290899276733, + "logits/rejected": -1.6141432523727417, + "logps/chosen": -54.847713470458984, + "logps/rejected": -163.98976135253906, + "loss": 2.1991, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.2024388313293457, + "rewards/margins": -3.4655070304870605, + "rewards/rejected": 6.667945861816406, + "step": 11976 + }, + { + "epoch": 1.94, + "learning_rate": 2.054466842264291e-08, + "logits/chosen": -1.0795364379882812, + "logits/rejected": -1.0952316522598267, + "logps/chosen": -53.57542419433594, + "logps/rejected": -61.47120666503906, + "loss": 0.1917, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7625160217285156, + "rewards/margins": 0.8101158142089844, + "rewards/rejected": 1.9524002075195312, + "step": 11977 + }, + { + "epoch": 1.94, + "learning_rate": 2.042582254266401e-08, + "logits/chosen": -1.2852140665054321, + "logits/rejected": -1.2607285976409912, + "logps/chosen": -51.130130767822266, + "logps/rejected": -75.18772888183594, + "loss": 1.7778, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1560161113739014, + "rewards/margins": 1.5698626041412354, + "rewards/rejected": 0.5861534476280212, + "step": 11978 + }, + { + "epoch": 1.94, + "learning_rate": 2.0307320704144316e-08, + "logits/chosen": -1.3427276611328125, + "logits/rejected": -1.1125966310501099, + "logps/chosen": -115.53307342529297, + "logps/rejected": -87.1922378540039, + "loss": 0.0592, + "rewards/accuracies": 1.0, + "rewards/chosen": 7.9058756828308105, + "rewards/margins": 2.4248719215393066, + "rewards/rejected": 5.481003761291504, + "step": 11979 + }, + { + "epoch": 1.94, + "learning_rate": 2.0189162915270065e-08, + "logits/chosen": -1.293299674987793, + "logits/rejected": -1.2445435523986816, + "logps/chosen": -85.07232666015625, + "logps/rejected": -145.8856964111328, + "loss": 2.4007, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.858264923095703, + "rewards/margins": -4.534976959228516, + "rewards/rejected": 7.393241882324219, + "step": 11980 + }, + { + "epoch": 1.94, + "learning_rate": 2.007134918420528e-08, + "logits/chosen": -1.3839967250823975, + "logits/rejected": -1.4258042573928833, + "logps/chosen": -156.6507110595703, + "logps/rejected": -172.05819702148438, + "loss": 0.6778, + "rewards/accuracies": 0.0, + "rewards/chosen": 8.930708885192871, + "rewards/margins": -1.0555553436279297, + "rewards/rejected": 9.9862642288208, + "step": 11981 + }, + { + "epoch": 1.94, + "learning_rate": 1.9953879519090115e-08, + "logits/chosen": -1.4722998142242432, + "logits/rejected": -1.5048552751541138, + "logps/chosen": -59.49737548828125, + "logps/rejected": -80.52842712402344, + "loss": 0.6329, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.3354203701019287, + "rewards/margins": -0.9319722652435303, + "rewards/rejected": 4.267392635345459, + "step": 11982 + }, + { + "epoch": 1.94, + "learning_rate": 1.9836753928039742e-08, + "logits/chosen": -1.2740615606307983, + "logits/rejected": -1.2753010988235474, + "logps/chosen": -52.150943756103516, + "logps/rejected": -45.704410552978516, + "loss": 0.4072, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.553812026977539, + "rewards/margins": 0.15158379077911377, + "rewards/rejected": 1.4022282361984253, + "step": 11983 + }, + { + "epoch": 1.95, + "learning_rate": 1.971997241914714e-08, + "logits/chosen": -1.3384008407592773, + "logits/rejected": -1.3228154182434082, + "logps/chosen": -17.598114013671875, + "logps/rejected": -37.391075134277344, + "loss": 0.1424, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3165359497070312, + "rewards/margins": 1.208792805671692, + "rewards/rejected": 1.1077431440353394, + "step": 11984 + }, + { + "epoch": 1.95, + "learning_rate": 1.9603535000480845e-08, + "logits/chosen": -1.0774762630462646, + "logits/rejected": -0.8767428398132324, + "logps/chosen": -46.197208404541016, + "logps/rejected": -74.88163757324219, + "loss": 1.2552, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5795414447784424, + "rewards/margins": 0.5452625751495361, + "rewards/rejected": 2.0342788696289062, + "step": 11985 + }, + { + "epoch": 1.95, + "learning_rate": 1.9487441680084983e-08, + "logits/chosen": -1.3690437078475952, + "logits/rejected": -1.2301284074783325, + "logps/chosen": -162.13299560546875, + "logps/rejected": -45.599098205566406, + "loss": 0.1493, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.671856880187988, + "rewards/margins": 3.248037815093994, + "rewards/rejected": 2.423819065093994, + "step": 11986 + }, + { + "epoch": 1.95, + "learning_rate": 1.937169246598092e-08, + "logits/chosen": -1.7166117429733276, + "logits/rejected": -1.7768429517745972, + "logps/chosen": -130.3933868408203, + "logps/rejected": -111.54405212402344, + "loss": 0.0495, + "rewards/accuracies": 1.0, + "rewards/chosen": 6.685246467590332, + "rewards/margins": 6.240723609924316, + "rewards/rejected": 0.4445228576660156, + "step": 11987 + }, + { + "epoch": 1.95, + "learning_rate": 1.925628736616614e-08, + "logits/chosen": -1.2506496906280518, + "logits/rejected": -1.2555471658706665, + "logps/chosen": -74.49358367919922, + "logps/rejected": -131.1499786376953, + "loss": 0.1661, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2398993968963623, + "rewards/margins": 1.2089393138885498, + "rewards/rejected": 2.0309600830078125, + "step": 11988 + }, + { + "epoch": 1.95, + "learning_rate": 1.9141226388613156e-08, + "logits/chosen": -1.064614176750183, + "logits/rejected": -1.0670727491378784, + "logps/chosen": -40.98163604736328, + "logps/rejected": -46.113059997558594, + "loss": 2.2045, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.498024344444275, + "rewards/margins": 0.03560972213745117, + "rewards/rejected": 1.4624146223068237, + "step": 11989 + }, + { + "epoch": 1.95, + "learning_rate": 1.9026509541272276e-08, + "logits/chosen": -1.6515599489212036, + "logits/rejected": -1.4880454540252686, + "logps/chosen": -113.751220703125, + "logps/rejected": -55.29668426513672, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 8.618498802185059, + "rewards/margins": 4.936097145080566, + "rewards/rejected": 3.682401418685913, + "step": 11990 + }, + { + "epoch": 1.95, + "learning_rate": 1.8912136832069384e-08, + "logits/chosen": -1.3080928325653076, + "logits/rejected": -1.3328443765640259, + "logps/chosen": -74.68250274658203, + "logps/rejected": -82.07676696777344, + "loss": 0.2232, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7373993396759033, + "rewards/margins": 0.730501651763916, + "rewards/rejected": 2.0068976879119873, + "step": 11991 + }, + { + "epoch": 1.95, + "learning_rate": 1.8798108268905934e-08, + "logits/chosen": -1.1350480318069458, + "logits/rejected": -1.2116187810897827, + "logps/chosen": -60.143333435058594, + "logps/rejected": -136.35049438476562, + "loss": 1.3885, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.1356942653656006, + "rewards/margins": -2.7086784839630127, + "rewards/rejected": 4.844372749328613, + "step": 11992 + }, + { + "epoch": 1.95, + "learning_rate": 1.8684423859661182e-08, + "logits/chosen": -1.2228862047195435, + "logits/rejected": -1.3652335405349731, + "logps/chosen": -75.31480407714844, + "logps/rejected": -101.81614685058594, + "loss": 1.7142, + "rewards/accuracies": 0.0, + "rewards/chosen": 3.054325819015503, + "rewards/margins": -3.1365573406219482, + "rewards/rejected": 6.190883159637451, + "step": 11993 + }, + { + "epoch": 1.95, + "learning_rate": 1.8571083612188845e-08, + "logits/chosen": -1.1007921695709229, + "logits/rejected": -1.0861613750457764, + "logps/chosen": -6.137171268463135, + "logps/rejected": -15.955050468444824, + "loss": 0.3038, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8385130167007446, + "rewards/margins": 0.676440954208374, + "rewards/rejected": 1.1620720624923706, + "step": 11994 + }, + { + "epoch": 1.95, + "learning_rate": 1.845808753431988e-08, + "logits/chosen": -1.4487106800079346, + "logits/rejected": -1.4719899892807007, + "logps/chosen": -66.85892486572266, + "logps/rejected": -78.16093444824219, + "loss": 1.1197, + "rewards/accuracies": 0.0, + "rewards/chosen": 2.6425750255584717, + "rewards/margins": -1.5103118419647217, + "rewards/rejected": 4.152886867523193, + "step": 11995 + }, + { + "epoch": 1.95, + "learning_rate": 1.834543563386082e-08, + "logits/chosen": -1.5006886720657349, + "logits/rejected": -1.513986349105835, + "logps/chosen": -211.73634338378906, + "logps/rejected": -84.92008972167969, + "loss": 0.0366, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.777522563934326, + "rewards/margins": 2.6188387870788574, + "rewards/rejected": 2.1586837768554688, + "step": 11996 + }, + { + "epoch": 1.95, + "learning_rate": 1.8233127918595994e-08, + "logits/chosen": -0.8780903816223145, + "logits/rejected": -0.8552095890045166, + "logps/chosen": -13.622811317443848, + "logps/rejected": -1.1991206407546997, + "loss": 0.5225, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.40204212069511414, + "rewards/margins": -0.06012237071990967, + "rewards/rejected": 0.4621644914150238, + "step": 11997 + }, + { + "epoch": 1.95, + "learning_rate": 1.8121164396283643e-08, + "logits/chosen": -1.2350208759307861, + "logits/rejected": -1.0695457458496094, + "logps/chosen": -49.888614654541016, + "logps/rejected": -26.22658348083496, + "loss": 0.8163, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.161945819854736, + "rewards/margins": 3.2685680389404297, + "rewards/rejected": 0.8933779001235962, + "step": 11998 + }, + { + "epoch": 1.95, + "learning_rate": 1.800954507466035e-08, + "logits/chosen": -1.4980601072311401, + "logits/rejected": -1.3699467182159424, + "logps/chosen": -105.24908447265625, + "logps/rejected": -41.342254638671875, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.739694118499756, + "rewards/margins": 3.488145351409912, + "rewards/rejected": 2.2515487670898438, + "step": 11999 + }, + { + "epoch": 1.95, + "learning_rate": 1.7898269961437177e-08, + "logits/chosen": -1.5802429914474487, + "logits/rejected": -1.310302734375, + "logps/chosen": -117.92749786376953, + "logps/rejected": -19.253334045410156, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": 5.163814544677734, + "rewards/margins": 4.320212364196777, + "rewards/rejected": 0.843602180480957, + "step": 12000 + } + ], + "logging_steps": 1.0, + "max_steps": 12322, + "num_train_epochs": 2, + "save_steps": 2000, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}