diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,21 +1,21 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.9993235625704623, + "epoch": 0.9935483870967742, "eval_steps": 500, - "global_step": 277, + "global_step": 77, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0036076662908680946, - "grad_norm": 3.5840242555590267, - "learning_rate": 1.7857142857142856e-08, - "logits/chosen": -3.02018141746521, - "logits/rejected": -2.956803560256958, - "logps/chosen": -56.959434509277344, - "logps/rejected": -49.77225875854492, + "epoch": 0.012903225806451613, + "grad_norm": 7.575236641343013, + "learning_rate": 6.25e-08, + "logits/chosen": -3.0188064575195312, + "logits/rejected": -3.0444469451904297, + "logps/chosen": -24.05819320678711, + "logps/rejected": -34.35090637207031, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -24,13 +24,13 @@ "step": 1 }, { - "epoch": 0.007215332581736189, - "grad_norm": 3.4852511270398137, - "learning_rate": 3.571428571428571e-08, - "logits/chosen": -3.141793966293335, - "logits/rejected": -3.155113458633423, - "logps/chosen": -44.37836837768555, - "logps/rejected": -44.664100646972656, + "epoch": 0.025806451612903226, + "grad_norm": 5.98360652154207, + "learning_rate": 1.25e-07, + "logits/chosen": -2.8478431701660156, + "logits/rejected": -2.816833972930908, + "logps/chosen": -20.381498336791992, + "logps/rejected": -17.743824005126953, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -39,4142 +39,1142 @@ "step": 2 }, { - "epoch": 0.010822998872604284, - "grad_norm": 3.551907402841069, - "learning_rate": 5.3571428571428564e-08, - "logits/chosen": -3.0334765911102295, - "logits/rejected": -3.007319688796997, - "logps/chosen": -50.289154052734375, - "logps/rejected": -51.01716613769531, + "epoch": 0.03870967741935484, + "grad_norm": 8.209690481232373, + "learning_rate": 1.875e-07, + "logits/chosen": -3.1529746055603027, + "logits/rejected": -3.1297154426574707, + "logps/chosen": -29.355775833129883, + "logps/rejected": -27.056119918823242, "loss": 0.6931, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.00042927381582558155, - "rewards/margins": 0.0004788838850799948, - "rewards/rejected": -4.9610109272180125e-05, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0009147524833679199, + "rewards/margins": 0.0013640093384310603, + "rewards/rejected": -0.00044925688416697085, "step": 3 }, { - "epoch": 0.014430665163472379, - "grad_norm": 3.5362621061340294, - "learning_rate": 7.142857142857142e-08, - "logits/chosen": -3.083339214324951, - "logits/rejected": -3.02933669090271, - "logps/chosen": -40.84933853149414, - "logps/rejected": -39.18031692504883, - "loss": 0.6932, - "rewards/accuracies": 0.375, - "rewards/chosen": 0.000369690649677068, - "rewards/margins": 0.0004772090178448707, - "rewards/rejected": -0.00010751838999567553, + "epoch": 0.05161290322580645, + "grad_norm": 6.674375366491978, + "learning_rate": 2.5e-07, + "logits/chosen": -3.0608866214752197, + "logits/rejected": -3.0937676429748535, + "logps/chosen": -33.94017791748047, + "logps/rejected": -42.64073181152344, + "loss": 0.6931, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.00032928824657574296, + "rewards/margins": 0.00031276524532586336, + "rewards/rejected": 1.65230012498796e-05, "step": 4 }, { - "epoch": 0.018038331454340473, - "grad_norm": 3.513411696995216, - "learning_rate": 8.928571428571429e-08, - "logits/chosen": -3.099221706390381, - "logits/rejected": -3.053905487060547, - "logps/chosen": -46.57722854614258, - "logps/rejected": -48.32011413574219, - "loss": 0.6929, - "rewards/accuracies": 0.1875, - "rewards/chosen": 4.309175710659474e-05, - "rewards/margins": -0.0003382217837497592, - "rewards/rejected": 0.0003813135845120996, + "epoch": 0.06451612903225806, + "grad_norm": 7.666243296505927, + "learning_rate": 3.1249999999999997e-07, + "logits/chosen": -2.932521104812622, + "logits/rejected": -2.905606746673584, + "logps/chosen": -30.47471809387207, + "logps/rejected": -31.02899932861328, + "loss": 0.6934, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.00036995764821767807, + "rewards/margins": 0.0002594107063487172, + "rewards/rejected": -0.0006293683545663953, "step": 5 }, { - "epoch": 0.02164599774520857, - "grad_norm": 3.4655447807929836, - "learning_rate": 1.0714285714285713e-07, - "logits/chosen": -3.1053647994995117, - "logits/rejected": -3.019660234451294, - "logps/chosen": -59.371002197265625, - "logps/rejected": -56.88490676879883, - "loss": 0.693, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.0005562889273278415, - "rewards/margins": 0.0007294797105714679, - "rewards/rejected": -0.0001731908123474568, + "epoch": 0.07741935483870968, + "grad_norm": 8.49764667909534, + "learning_rate": 3.75e-07, + "logits/chosen": -3.0920748710632324, + "logits/rejected": -3.1023545265197754, + "logps/chosen": -57.721893310546875, + "logps/rejected": -62.54054260253906, + "loss": 0.6932, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.001343829557299614, + "rewards/margins": -0.0011225318303331733, + "rewards/rejected": -0.00022129775607027113, "step": 6 }, { - "epoch": 0.025253664036076665, - "grad_norm": 3.401955154766394, - "learning_rate": 1.25e-07, - "logits/chosen": -3.1133270263671875, - "logits/rejected": -3.0611650943756104, - "logps/chosen": -67.82024383544922, - "logps/rejected": -64.8301010131836, - "loss": 0.6932, - "rewards/accuracies": 0.4375, - "rewards/chosen": -2.9102553526172414e-05, - "rewards/margins": -0.0005619716830551624, - "rewards/rejected": 0.000532869016751647, + "epoch": 0.09032258064516129, + "grad_norm": 7.33711327379137, + "learning_rate": 4.375e-07, + "logits/chosen": -2.827237129211426, + "logits/rejected": -2.816389560699463, + "logps/chosen": -48.21922302246094, + "logps/rejected": -40.31886291503906, + "loss": 0.6931, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.008381588384509087, + "rewards/margins": -0.002356385812163353, + "rewards/rejected": -0.006025202106684446, "step": 7 }, { - "epoch": 0.028861330326944757, - "grad_norm": 3.1555450539214775, - "learning_rate": 1.4285714285714285e-07, - "logits/chosen": -2.85391902923584, - "logits/rejected": -2.870007276535034, - "logps/chosen": -47.19465637207031, - "logps/rejected": -42.87815856933594, - "loss": 0.6932, + "epoch": 0.1032258064516129, + "grad_norm": 4.927805558730517, + "learning_rate": 5e-07, + "logits/chosen": -3.0250465869903564, + "logits/rejected": -2.9766347408294678, + "logps/chosen": -35.06133270263672, + "logps/rejected": -39.101741790771484, + "loss": 0.6927, "rewards/accuracies": 0.5, - "rewards/chosen": 0.0011590981157496572, - "rewards/margins": 0.00014693137200083584, - "rewards/rejected": 0.001012166729196906, + "rewards/chosen": 0.0022047751117497683, + "rewards/margins": 0.004272125195711851, + "rewards/rejected": -0.0020673503167927265, "step": 8 }, { - "epoch": 0.03246899661781285, - "grad_norm": 3.614839697039073, - "learning_rate": 1.6071428571428573e-07, - "logits/chosen": -2.92150616645813, - "logits/rejected": -2.8782413005828857, - "logps/chosen": -37.58573913574219, - "logps/rejected": -37.860939025878906, - "loss": 0.693, - "rewards/accuracies": 0.25, - "rewards/chosen": 0.000555627339053899, - "rewards/margins": 0.0004235732776578516, - "rewards/rejected": 0.00013205409049987793, + "epoch": 0.11612903225806452, + "grad_norm": 10.205695541523633, + "learning_rate": 4.997409184116819e-07, + "logits/chosen": -3.0799849033355713, + "logits/rejected": -3.0568881034851074, + "logps/chosen": -52.149391174316406, + "logps/rejected": -42.9133186340332, + "loss": 0.6923, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.001999429427087307, + "rewards/margins": 0.0028107434045523405, + "rewards/rejected": -0.0008113139774650335, "step": 9 }, { - "epoch": 0.036076662908680945, - "grad_norm": 3.5118876812096094, - "learning_rate": 1.7857142857142858e-07, - "logits/chosen": -3.227530002593994, - "logits/rejected": -3.158536195755005, - "logps/chosen": -64.64949035644531, - "logps/rejected": -61.340675354003906, - "loss": 0.693, + "epoch": 0.12903225806451613, + "grad_norm": 7.6501883418829415, + "learning_rate": 4.989642106328828e-07, + "logits/chosen": -3.0393779277801514, + "logits/rejected": -3.0457043647766113, + "logps/chosen": -38.716957092285156, + "logps/rejected": -37.4216194152832, + "loss": 0.6909, "rewards/accuracies": 0.25, - "rewards/chosen": 0.0003387999313417822, - "rewards/margins": -0.0008412337047047913, - "rewards/rejected": 0.0011800335487350821, + "rewards/chosen": 0.0031322999857366085, + "rewards/margins": 0.004622948355972767, + "rewards/rejected": -0.0014906482538208365, "step": 10 }, { - "epoch": 0.03968432919954904, - "grad_norm": 3.5833022687868565, - "learning_rate": 1.964285714285714e-07, - "logits/chosen": -3.207714319229126, - "logits/rejected": -3.1596803665161133, - "logps/chosen": -55.09148406982422, - "logps/rejected": -55.0782585144043, - "loss": 0.693, - "rewards/accuracies": 0.375, - "rewards/chosen": 0.0026482604444026947, - "rewards/margins": 0.0005400502705015242, - "rewards/rejected": 0.0021082102321088314, + "epoch": 0.14193548387096774, + "grad_norm": 5.299300171900462, + "learning_rate": 4.976714865090826e-07, + "logits/chosen": -3.161602258682251, + "logits/rejected": -3.1717371940612793, + "logps/chosen": -63.28357696533203, + "logps/rejected": -70.27680969238281, + "loss": 0.6893, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.003206870285794139, + "rewards/margins": 0.004664942622184753, + "rewards/rejected": -0.0014580729184672236, "step": 11 }, { - "epoch": 0.04329199549041714, - "grad_norm": 3.3002299207434147, - "learning_rate": 2.1428571428571426e-07, - "logits/chosen": -3.1364617347717285, - "logits/rejected": -3.03536319732666, - "logps/chosen": -59.547698974609375, - "logps/rejected": -57.14613342285156, - "loss": 0.693, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.003593004308640957, - "rewards/margins": 0.0015400201082229614, - "rewards/rejected": 0.0020529842004179955, + "epoch": 0.15483870967741936, + "grad_norm": 6.014913040201032, + "learning_rate": 4.958654254084355e-07, + "logits/chosen": -3.0326099395751953, + "logits/rejected": -3.014777898788452, + "logps/chosen": -59.73595428466797, + "logps/rejected": -54.794410705566406, + "loss": 0.6915, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0073243663646280766, + "rewards/margins": 0.003999748267233372, + "rewards/rejected": 0.003324618097394705, "step": 12 }, { - "epoch": 0.04689966178128523, - "grad_norm": 3.7024666557117416, - "learning_rate": 2.3214285714285714e-07, - "logits/chosen": -2.9659042358398438, - "logits/rejected": -3.026294708251953, - "logps/chosen": -38.168941497802734, - "logps/rejected": -40.436302185058594, - "loss": 0.693, + "epoch": 0.16774193548387098, + "grad_norm": 5.582237671195135, + "learning_rate": 4.935497706683698e-07, + "logits/chosen": -3.088393211364746, + "logits/rejected": -3.1142313480377197, + "logps/chosen": -52.511295318603516, + "logps/rejected": -61.057098388671875, + "loss": 0.6872, "rewards/accuracies": 0.4375, - "rewards/chosen": 0.003420760855078697, - "rewards/margins": 0.0009218340856023133, - "rewards/rejected": 0.0024989263620227575, + "rewards/chosen": -0.0028830207884311676, + "rewards/margins": 0.038106679916381836, + "rewards/rejected": -0.040989700704813004, "step": 13 }, { - "epoch": 0.05050732807215333, - "grad_norm": 3.5398946215981617, - "learning_rate": 2.5e-07, - "logits/chosen": -3.0174214839935303, - "logits/rejected": -2.966845750808716, - "logps/chosen": -48.355037689208984, - "logps/rejected": -43.75670623779297, - "loss": 0.6931, - "rewards/accuracies": 0.4375, - "rewards/chosen": 0.0056186579167842865, - "rewards/margins": 0.00017049553571268916, - "rewards/rejected": 0.00544816255569458, + "epoch": 0.18064516129032257, + "grad_norm": 5.712829645203721, + "learning_rate": 4.907293218369498e-07, + "logits/chosen": -2.8892948627471924, + "logits/rejected": -2.9255869388580322, + "logps/chosen": -47.063507080078125, + "logps/rejected": -59.78974533081055, + "loss": 0.6862, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0006488842191174626, + "rewards/margins": 0.03463263809680939, + "rewards/rejected": -0.033983759582042694, "step": 14 }, { - "epoch": 0.05411499436302142, - "grad_norm": 3.697915228945678, - "learning_rate": 2.6785714285714284e-07, - "logits/chosen": -3.0737075805664062, - "logits/rejected": -3.0794742107391357, - "logps/chosen": -58.38935852050781, - "logps/rejected": -57.72342300415039, - "loss": 0.6929, - "rewards/accuracies": 0.25, - "rewards/chosen": 0.006344526074826717, - "rewards/margins": -0.0004283534362912178, - "rewards/rejected": 0.006772879045456648, + "epoch": 0.1935483870967742, + "grad_norm": 6.024414620349713, + "learning_rate": 4.874099247250798e-07, + "logits/chosen": -2.9845387935638428, + "logits/rejected": -3.003157138824463, + "logps/chosen": -61.63920974731445, + "logps/rejected": -77.2118148803711, + "loss": 0.6873, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.015748100355267525, + "rewards/margins": 0.006991543807089329, + "rewards/rejected": -0.022739645093679428, "step": 15 }, { - "epoch": 0.057722660653889514, - "grad_norm": 3.391085315127927, - "learning_rate": 2.857142857142857e-07, - "logits/chosen": -3.1120781898498535, - "logits/rejected": -3.0887746810913086, - "logps/chosen": -38.1907958984375, - "logps/rejected": -37.75780487060547, - "loss": 0.6927, - "rewards/accuracies": 0.1875, - "rewards/chosen": 0.006180965807288885, - "rewards/margins": 0.0014092850033193827, - "rewards/rejected": 0.004771680571138859, + "epoch": 0.2064516129032258, + "grad_norm": 6.792340069349894, + "learning_rate": 4.835984592901677e-07, + "logits/chosen": -3.0437138080596924, + "logits/rejected": -3.0553507804870605, + "logps/chosen": -47.824859619140625, + "logps/rejected": -57.92134475708008, + "loss": 0.6799, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.006595752667635679, + "rewards/margins": 0.035280268639326096, + "rewards/rejected": -0.041876018047332764, "step": 16 }, { - "epoch": 0.06133032694475761, - "grad_norm": 3.555933344998629, - "learning_rate": 3.0357142857142855e-07, - "logits/chosen": -3.0864977836608887, - "logits/rejected": -3.0205554962158203, - "logps/chosen": -50.95547103881836, - "logps/rejected": -45.228240966796875, - "loss": 0.6928, - "rewards/accuracies": 0.4375, - "rewards/chosen": 0.012755894102156162, - "rewards/margins": 0.00120101822540164, - "rewards/rejected": 0.011554877273738384, + "epoch": 0.21935483870967742, + "grad_norm": 11.168844097664572, + "learning_rate": 4.793028253763632e-07, + "logits/chosen": -3.0245566368103027, + "logits/rejected": -2.989858865737915, + "logps/chosen": -42.81298065185547, + "logps/rejected": -52.179176330566406, + "loss": 0.6768, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03772101551294327, + "rewards/margins": 0.07718317210674286, + "rewards/rejected": -0.11490418016910553, "step": 17 }, { - "epoch": 0.0649379932356257, - "grad_norm": 3.6340860283059886, - "learning_rate": 3.2142857142857145e-07, - "logits/chosen": -3.099447727203369, - "logits/rejected": -2.9933624267578125, - "logps/chosen": -66.3836669921875, - "logps/rejected": -66.29476165771484, - "loss": 0.6927, - "rewards/accuracies": 0.4375, - "rewards/chosen": 0.014439787715673447, - "rewards/margins": 0.000907671288587153, - "rewards/rejected": 0.013532115146517754, + "epoch": 0.23225806451612904, + "grad_norm": 7.414924383918636, + "learning_rate": 4.74531926340924e-07, + "logits/chosen": -2.971475839614868, + "logits/rejected": -2.945223569869995, + "logps/chosen": -43.70143127441406, + "logps/rejected": -33.612754821777344, + "loss": 0.6875, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.03854432329535484, + "rewards/margins": -0.005393250845372677, + "rewards/rejected": -0.03315107151865959, "step": 18 }, { - "epoch": 0.0685456595264938, - "grad_norm": 3.8128041248799662, - "learning_rate": 3.392857142857143e-07, - "logits/chosen": -3.2197351455688477, - "logits/rejected": -3.213644027709961, - "logps/chosen": -91.82564544677734, - "logps/rejected": -89.4195785522461, - "loss": 0.6926, - "rewards/accuracies": 0.375, - "rewards/chosen": 0.024685291573405266, - "rewards/margins": 0.000819602282717824, - "rewards/rejected": 0.02386569045484066, + "epoch": 0.24516129032258063, + "grad_norm": 10.414586267170323, + "learning_rate": 4.692956506006486e-07, + "logits/chosen": -2.9716796875, + "logits/rejected": -2.9322829246520996, + "logps/chosen": -62.92927551269531, + "logps/rejected": -59.554969787597656, + "loss": 0.6734, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0010191131150349975, + "rewards/margins": 0.037698645144701004, + "rewards/rejected": -0.038717761635780334, "step": 19 }, { - "epoch": 0.07215332581736189, - "grad_norm": 3.360311139693669, - "learning_rate": 3.5714285714285716e-07, - "logits/chosen": -2.9687275886535645, - "logits/rejected": -3.0069329738616943, - "logps/chosen": -49.66991424560547, - "logps/rejected": -56.89038848876953, - "loss": 0.6922, - "rewards/accuracies": 0.375, - "rewards/chosen": 0.016231724992394447, - "rewards/margins": 0.0010469770058989525, - "rewards/rejected": 0.01518474705517292, + "epoch": 0.25806451612903225, + "grad_norm": 8.133295798312702, + "learning_rate": 4.6360485113662214e-07, + "logits/chosen": -2.8298916816711426, + "logits/rejected": -2.906747579574585, + "logps/chosen": -26.855010986328125, + "logps/rejected": -53.25407028198242, + "loss": 0.6727, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.004282762296497822, + "rewards/margins": 0.04214238375425339, + "rewards/rejected": -0.04642514884471893, "step": 20 }, { - "epoch": 0.07576099210823, - "grad_norm": 3.6738935090154463, - "learning_rate": 3.75e-07, - "logits/chosen": -3.188748359680176, - "logits/rejected": -3.1373486518859863, - "logps/chosen": -55.108917236328125, - "logps/rejected": -53.68857955932617, - "loss": 0.6924, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.03164692968130112, - "rewards/margins": 0.002948709297925234, - "rewards/rejected": 0.02869821898639202, + "epoch": 0.2709677419354839, + "grad_norm": 7.911827600658339, + "learning_rate": 4.574713229997563e-07, + "logits/chosen": -3.1790623664855957, + "logits/rejected": -3.183187246322632, + "logps/chosen": -62.52846908569336, + "logps/rejected": -64.2408218383789, + "loss": 0.6712, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.019579507410526276, + "rewards/margins": 6.686896085739136e-06, + "rewards/rejected": -0.019586196169257164, "step": 21 }, { - "epoch": 0.07936865839909808, - "grad_norm": 3.633505174054657, - "learning_rate": 3.928571428571428e-07, - "logits/chosen": -3.18550181388855, - "logits/rejected": -3.1714069843292236, - "logps/chosen": -64.8235855102539, - "logps/rejected": -63.128482818603516, - "loss": 0.6927, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.03558368235826492, - "rewards/margins": 0.0030938356649130583, - "rewards/rejected": 0.032489847391843796, + "epoch": 0.2838709677419355, + "grad_norm": 6.233493817899055, + "learning_rate": 4.5090777886374453e-07, + "logits/chosen": -3.0413153171539307, + "logits/rejected": -3.0401761531829834, + "logps/chosen": -32.636592864990234, + "logps/rejected": -33.258331298828125, + "loss": 0.6712, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.006448335479944944, + "rewards/margins": 0.016460755839943886, + "rewards/rejected": -0.022909093648195267, "step": 22 }, { - "epoch": 0.08297632468996617, - "grad_norm": 3.3216483648140946, - "learning_rate": 4.1071428571428566e-07, - "logits/chosen": -3.0630507469177246, - "logits/rejected": -3.0171515941619873, - "logps/chosen": -59.85310745239258, - "logps/rejected": -51.64715576171875, - "loss": 0.6922, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.03555324673652649, - "rewards/margins": 0.0034288412425667048, - "rewards/rejected": 0.03212440758943558, + "epoch": 0.2967741935483871, + "grad_norm": 12.20764827602206, + "learning_rate": 4.4392782267610495e-07, + "logits/chosen": -2.8663036823272705, + "logits/rejected": -2.8864498138427734, + "logps/chosen": -53.24276351928711, + "logps/rejected": -58.46373748779297, + "loss": 0.6649, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.09484248608350754, + "rewards/margins": 0.05482190474867821, + "rewards/rejected": -0.14966437220573425, "step": 23 }, { - "epoch": 0.08658399098083427, - "grad_norm": 3.28607502832243, - "learning_rate": 4.285714285714285e-07, - "logits/chosen": -3.062216281890869, - "logits/rejected": -3.065988779067993, - "logps/chosen": -58.483497619628906, - "logps/rejected": -57.545345306396484, - "loss": 0.692, - "rewards/accuracies": 0.375, - "rewards/chosen": 0.04048089310526848, - "rewards/margins": 0.0023373623844236135, - "rewards/rejected": 0.03814353048801422, + "epoch": 0.3096774193548387, + "grad_norm": 9.11035362020615, + "learning_rate": 4.3654592146192137e-07, + "logits/chosen": -3.043889284133911, + "logits/rejected": -3.04427433013916, + "logps/chosen": -50.45281219482422, + "logps/rejected": -62.33794403076172, + "loss": 0.6614, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.06691670417785645, + "rewards/margins": 0.06718842685222626, + "rewards/rejected": -0.1341051459312439, "step": 24 }, { - "epoch": 0.09019165727170236, - "grad_norm": 3.152336326362568, - "learning_rate": 4.464285714285714e-07, - "logits/chosen": -3.0892107486724854, - "logits/rejected": -3.1102676391601562, - "logps/chosen": -16.93307876586914, - "logps/rejected": -19.273767471313477, - "loss": 0.6922, - "rewards/accuracies": 0.0, - "rewards/chosen": 0.015432961285114288, - "rewards/margins": -0.0015947434585541487, - "rewards/rejected": 0.017027704045176506, + "epoch": 0.3225806451612903, + "grad_norm": 7.38691150020557, + "learning_rate": 4.2877737533872484e-07, + "logits/chosen": -3.020115375518799, + "logits/rejected": -2.949554204940796, + "logps/chosen": -90.56169128417969, + "logps/rejected": -88.24325561523438, + "loss": 0.6629, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10975000262260437, + "rewards/margins": 0.09208998084068298, + "rewards/rejected": -0.20183998346328735, "step": 25 }, { - "epoch": 0.09379932356257047, - "grad_norm": 3.7240418322690507, - "learning_rate": 4.6428571428571427e-07, - "logits/chosen": -3.0490899085998535, - "logits/rejected": -3.0427658557891846, - "logps/chosen": -21.992151260375977, - "logps/rejected": -21.333309173583984, - "loss": 0.6919, + "epoch": 0.33548387096774196, + "grad_norm": 5.575537840637267, + "learning_rate": 4.206382858046635e-07, + "logits/chosen": -3.0138120651245117, + "logits/rejected": -3.0246829986572266, + "logps/chosen": -63.48017120361328, + "logps/rejected": -64.13935089111328, + "loss": 0.6774, "rewards/accuracies": 0.375, - "rewards/chosen": 0.019902564585208893, - "rewards/margins": 0.0022603285033255816, - "rewards/rejected": 0.01764223724603653, + "rewards/chosen": -0.08101020753383636, + "rewards/margins": 0.03786313533782959, + "rewards/rejected": -0.11887334287166595, "step": 26 }, { - "epoch": 0.09740698985343856, - "grad_norm": 3.538743053204808, - "learning_rate": 4.821428571428571e-07, - "logits/chosen": -2.98097825050354, - "logits/rejected": -2.990886688232422, - "logps/chosen": -24.547744750976562, - "logps/rejected": -24.993146896362305, - "loss": 0.6917, - "rewards/accuracies": 0.375, - "rewards/chosen": 0.02400978095829487, - "rewards/margins": -0.0004754224210046232, - "rewards/rejected": 0.024485204368829727, + "epoch": 0.34838709677419355, + "grad_norm": 6.166923188160189, + "learning_rate": 4.12145522365689e-07, + "logits/chosen": -2.909912347793579, + "logits/rejected": -2.9610202312469482, + "logps/chosen": -33.47990417480469, + "logps/rejected": -65.81147003173828, + "loss": 0.6581, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.14108583331108093, + "rewards/margins": 0.14831525087356567, + "rewards/rejected": -0.2894010841846466, "step": 27 }, { - "epoch": 0.10101465614430666, - "grad_norm": 3.7127327075576986, - "learning_rate": 5e-07, - "logits/chosen": -3.04250431060791, - "logits/rejected": -3.0370893478393555, - "logps/chosen": -51.725685119628906, - "logps/rejected": -49.70272445678711, - "loss": 0.6914, - "rewards/accuracies": 0.1875, - "rewards/chosen": 0.03617345541715622, - "rewards/margins": -0.0012786828447133303, - "rewards/rejected": 0.03745213523507118, + "epoch": 0.36129032258064514, + "grad_norm": 6.548237975271229, + "learning_rate": 4.0331668757092905e-07, + "logits/chosen": -2.959547519683838, + "logits/rejected": -2.954468250274658, + "logps/chosen": -48.109375, + "logps/rejected": -64.71925354003906, + "loss": 0.662, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.20003138482570648, + "rewards/margins": 0.12624813616275787, + "rewards/rejected": -0.32627955079078674, "step": 28 }, { - "epoch": 0.10462232243517475, - "grad_norm": 3.8435490397284826, - "learning_rate": 4.99980102188921e-07, - "logits/chosen": -3.15837025642395, - "logits/rejected": -3.0846993923187256, - "logps/chosen": -38.1151008605957, - "logps/rejected": -36.88887023925781, - "loss": 0.6919, - "rewards/accuracies": 0.375, - "rewards/chosen": 0.031231267377734184, - "rewards/margins": 0.00354637298732996, - "rewards/rejected": 0.0276848953217268, + "epoch": 0.3741935483870968, + "grad_norm": 8.441769285768048, + "learning_rate": 3.941700805287168e-07, + "logits/chosen": -2.763150215148926, + "logits/rejected": -2.7778449058532715, + "logps/chosen": -47.057498931884766, + "logps/rejected": -56.587127685546875, + "loss": 0.6655, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.12557458877563477, + "rewards/margins": 0.02159186080098152, + "rewards/rejected": -0.147166445851326, "step": 29 }, { - "epoch": 0.10822998872604284, - "grad_norm": 3.5574836624589694, - "learning_rate": 4.999204119230669e-07, - "logits/chosen": -3.068660259246826, - "logits/rejected": -2.9993085861206055, - "logps/chosen": -51.469879150390625, - "logps/rejected": -51.89442443847656, - "loss": 0.6905, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.0365854874253273, - "rewards/margins": 0.006971403025090694, - "rewards/rejected": 0.02961408533155918, + "epoch": 0.3870967741935484, + "grad_norm": 6.820259306929618, + "learning_rate": 3.847246589788939e-07, + "logits/chosen": -2.9482781887054443, + "logits/rejected": -2.943040370941162, + "logps/chosen": -51.7378044128418, + "logps/rejected": -68.4146728515625, + "loss": 0.6487, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.16159993410110474, + "rewards/margins": 0.12931199371814728, + "rewards/rejected": -0.2909119427204132, "step": 30 }, { - "epoch": 0.11183765501691094, - "grad_norm": 3.6048192019986782, - "learning_rate": 4.998209387040828e-07, - "logits/chosen": -2.9828033447265625, - "logits/rejected": -2.928224563598633, - "logps/chosen": -58.3504638671875, - "logps/rejected": -59.8613166809082, - "loss": 0.6925, - "rewards/accuracies": 0.4375, - "rewards/chosen": 0.03084871545433998, - "rewards/margins": 0.007869547232985497, - "rewards/rejected": 0.022979168221354485, + "epoch": 0.4, + "grad_norm": 7.812680952947978, + "learning_rate": 3.75e-07, + "logits/chosen": -2.7655723094940186, + "logits/rejected": -2.7568373680114746, + "logps/chosen": -71.60513305664062, + "logps/rejected": -76.70703125, + "loss": 0.649, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.2342558652162552, + "rewards/margins": 0.0854680985212326, + "rewards/rejected": -0.3197239935398102, "step": 31 }, { - "epoch": 0.11544532130777903, - "grad_norm": 3.778645567479808, - "learning_rate": 4.996816983663634e-07, - "logits/chosen": -3.0680296421051025, - "logits/rejected": -3.0426552295684814, - "logps/chosen": -58.639949798583984, - "logps/rejected": -55.833797454833984, - "loss": 0.6905, - "rewards/accuracies": 0.4375, - "rewards/chosen": 0.034869663417339325, - "rewards/margins": 0.006170907057821751, - "rewards/rejected": 0.028698759153485298, + "epoch": 0.4129032258064516, + "grad_norm": 7.397143004044, + "learning_rate": 3.65016259432788e-07, + "logits/chosen": -2.9698352813720703, + "logits/rejected": -3.0043089389801025, + "logps/chosen": -41.41299057006836, + "logps/rejected": -50.932334899902344, + "loss": 0.6388, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.23619022965431213, + "rewards/margins": 0.04527804255485535, + "rewards/rejected": -0.2814682722091675, "step": 32 }, { - "epoch": 0.11905298759864713, - "grad_norm": 3.5136789808694546, - "learning_rate": 4.995027130745321e-07, - "logits/chosen": -3.146360397338867, - "logits/rejected": -3.134718894958496, - "logps/chosen": -41.02317810058594, - "logps/rejected": -39.83603286743164, - "loss": 0.6903, - "rewards/accuracies": 0.1875, - "rewards/chosen": 0.022047480568289757, - "rewards/margins": -0.0010091331787407398, - "rewards/rejected": 0.02305661514401436, + "epoch": 0.4258064516129032, + "grad_norm": 7.152484571426207, + "learning_rate": 3.54794130104166e-07, + "logits/chosen": -2.823517084121704, + "logits/rejected": -2.8529646396636963, + "logps/chosen": -16.425308227539062, + "logps/rejected": -24.822494506835938, + "loss": 0.6421, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.0828515961766243, + "rewards/margins": 0.04831594228744507, + "rewards/rejected": -0.13116753101348877, "step": 33 }, { - "epoch": 0.12266065388951522, - "grad_norm": 3.5456372383058445, - "learning_rate": 4.99284011319913e-07, - "logits/chosen": -3.067211151123047, - "logits/rejected": -3.0412845611572266, - "logps/chosen": -44.38189697265625, - "logps/rejected": -42.0210075378418, - "loss": 0.6912, - "rewards/accuracies": 0.3125, - "rewards/chosen": 0.013639697805047035, - "rewards/margins": 0.008699356578290462, - "rewards/rejected": 0.004940340295433998, + "epoch": 0.43870967741935485, + "grad_norm": 9.358494780867913, + "learning_rate": 3.4435479893815355e-07, + "logits/chosen": -2.8296639919281006, + "logits/rejected": -2.8361520767211914, + "logps/chosen": -86.12203979492188, + "logps/rejected": -95.99158477783203, + "loss": 0.6386, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.26781708002090454, + "rewards/margins": 0.14883765578269958, + "rewards/rejected": -0.41665470600128174, "step": 34 }, { - "epoch": 0.1262683201803833, - "grad_norm": 3.7175368775540782, - "learning_rate": 4.990256279159957e-07, - "logits/chosen": -3.0800342559814453, - "logits/rejected": -3.0359151363372803, - "logps/chosen": -40.98062515258789, - "logps/rejected": -41.68905258178711, - "loss": 0.6904, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.004652009811252356, - "rewards/margins": -0.0004617827944457531, - "rewards/rejected": -0.0041902270168066025, + "epoch": 0.45161290322580644, + "grad_norm": 8.786574832386057, + "learning_rate": 3.337199030427465e-07, + "logits/chosen": -2.9809043407440186, + "logits/rejected": -2.9782485961914062, + "logps/chosen": -70.9310302734375, + "logps/rejected": -77.85838317871094, + "loss": 0.6339, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.2812136113643646, + "rewards/margins": 0.06157858297228813, + "rewards/rejected": -0.3427921533584595, "step": 35 }, { - "epoch": 0.1298759864712514, - "grad_norm": 3.428989674416389, - "learning_rate": 4.987276039928936e-07, - "logits/chosen": -3.023162841796875, - "logits/rejected": -3.0007073879241943, - "logps/chosen": -30.415803909301758, - "logps/rejected": -27.819746017456055, - "loss": 0.6907, + "epoch": 0.4645161290322581, + "grad_norm": 6.3872463929573495, + "learning_rate": 3.229114848637062e-07, + "logits/chosen": -2.9001638889312744, + "logits/rejected": -2.8978068828582764, + "logps/chosen": -53.35871124267578, + "logps/rejected": -61.33634567260742, + "loss": 0.6308, "rewards/accuracies": 0.3125, - "rewards/chosen": -0.013717405498027802, - "rewards/margins": 0.0052181570790708065, - "rewards/rejected": -0.018935564905405045, + "rewards/chosen": -0.24349886178970337, + "rewards/margins": 0.06033254787325859, + "rewards/rejected": -0.30383142828941345, "step": 36 }, { - "epoch": 0.13348365276211951, - "grad_norm": 3.8174353757069315, - "learning_rate": 4.983899869907962e-07, - "logits/chosen": -3.1304450035095215, - "logits/rejected": -3.1041676998138428, - "logps/chosen": -30.272798538208008, - "logps/rejected": -30.132299423217773, - "loss": 0.689, + "epoch": 0.4774193548387097, + "grad_norm": 9.892537592398023, + "learning_rate": 3.11951946498225e-07, + "logits/chosen": -2.742429256439209, + "logits/rejected": -2.762636184692383, + "logps/chosen": -51.067054748535156, + "logps/rejected": -58.35332489013672, + "loss": 0.6331, "rewards/accuracies": 0.3125, - "rewards/chosen": -0.014267665334045887, - "rewards/margins": 0.0075728558003902435, - "rewards/rejected": -0.021840520203113556, + "rewards/chosen": -0.12977036833763123, + "rewards/margins": 0.07739689201116562, + "rewards/rejected": -0.20716726779937744, "step": 37 }, { - "epoch": 0.1370913190529876, - "grad_norm": 3.757696716590805, - "learning_rate": 4.980128306524183e-07, - "logits/chosen": -3.1125388145446777, - "logits/rejected": -3.0397329330444336, - "logps/chosen": -28.54355239868164, - "logps/rejected": -31.549545288085938, - "loss": 0.6878, + "epoch": 0.49032258064516127, + "grad_norm": 8.185078211550065, + "learning_rate": 3.008640032631585e-07, + "logits/chosen": -2.911105155944824, + "logits/rejected": -2.9227683544158936, + "logps/chosen": -37.1446647644043, + "logps/rejected": -68.1653823852539, + "loss": 0.6473, "rewards/accuracies": 0.25, - "rewards/chosen": 0.001858408097177744, - "rewards/margins": 0.007613640744239092, - "rewards/rejected": -0.005755233578383923, + "rewards/chosen": -0.11132928729057312, + "rewards/margins": 0.14660148322582245, + "rewards/rejected": -0.2579307556152344, "step": 38 }, { - "epoch": 0.1406989853438557, - "grad_norm": 4.0152543112432895, - "learning_rate": 4.975961950144444e-07, - "logits/chosen": -2.833340883255005, - "logits/rejected": -2.8634095191955566, - "logps/chosen": -71.63900756835938, - "logps/rejected": -69.79136657714844, - "loss": 0.6898, + "epoch": 0.5032258064516129, + "grad_norm": 9.483445735835293, + "learning_rate": 2.8967063661406284e-07, + "logits/chosen": -3.0798282623291016, + "logits/rejected": -3.039661407470703, + "logps/chosen": -75.90811157226562, + "logps/rejected": -76.0742416381836, + "loss": 0.6333, "rewards/accuracies": 0.3125, - "rewards/chosen": -0.037096284329891205, - "rewards/margins": -0.002715187380090356, - "rewards/rejected": -0.03438109904527664, + "rewards/chosen": -0.044046178460121155, + "rewards/margins": 0.11025506258010864, + "rewards/rejected": -0.154301255941391, "step": 39 }, { - "epoch": 0.14430665163472378, - "grad_norm": 4.500971168555326, - "learning_rate": 4.971401463979721e-07, - "logits/chosen": -3.0739340782165527, - "logits/rejected": -3.0391714572906494, - "logps/chosen": -61.3797492980957, - "logps/rejected": -58.503292083740234, - "loss": 0.6865, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.01357153058052063, - "rewards/margins": 0.029109418392181396, - "rewards/rejected": -0.042680948972702026, + "epoch": 0.5161290322580645, + "grad_norm": 10.173499498197904, + "learning_rate": 2.783950465126187e-07, + "logits/chosen": -2.8641061782836914, + "logits/rejected": -2.880579710006714, + "logps/chosen": -59.221946716308594, + "logps/rejected": -90.15663146972656, + "loss": 0.6433, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.16509050130844116, + "rewards/margins": 0.26559263467788696, + "rewards/rejected": -0.4306831359863281, "step": 40 }, { - "epoch": 0.14791431792559187, - "grad_norm": 4.538967423876311, - "learning_rate": 4.966447573979552e-07, - "logits/chosen": -3.0744223594665527, - "logits/rejected": -3.057842493057251, - "logps/chosen": -38.64921569824219, - "logps/rejected": -37.562931060791016, - "loss": 0.6858, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.049748875200748444, - "rewards/margins": 0.010249648243188858, - "rewards/rejected": -0.0599985234439373, + "epoch": 0.5290322580645161, + "grad_norm": 8.808202753666528, + "learning_rate": 2.6706060334116775e-07, + "logits/chosen": -3.0445690155029297, + "logits/rejected": -3.074784755706787, + "logps/chosen": -68.35140991210938, + "logps/rejected": -104.00436401367188, + "loss": 0.6333, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3045073449611664, + "rewards/margins": 0.229690819978714, + "rewards/rejected": -0.5341981649398804, "step": 41 }, { - "epoch": 0.15152198421646, - "grad_norm": 3.9223710631744084, - "learning_rate": 4.961101068716476e-07, - "logits/chosen": -2.9480621814727783, - "logits/rejected": -2.9769458770751953, - "logps/chosen": -54.56187438964844, - "logps/rejected": -56.06974792480469, - "loss": 0.6876, + "epoch": 0.5419354838709678, + "grad_norm": 8.13273017045566, + "learning_rate": 2.556907994640264e-07, + "logits/chosen": -2.9060964584350586, + "logits/rejected": -2.90242338180542, + "logps/chosen": -91.3266830444336, + "logps/rejected": -118.18492126464844, + "loss": 0.6499, "rewards/accuracies": 0.5, - "rewards/chosen": -0.07341207563877106, - "rewards/margins": -0.0054923165589571, - "rewards/rejected": -0.0679197609424591, + "rewards/chosen": -0.23070470988750458, + "rewards/margins": 0.24628427624702454, + "rewards/rejected": -0.4769890308380127, "step": 42 }, { - "epoch": 0.15512965050732808, - "grad_norm": 3.9618481991909995, - "learning_rate": 4.955362799260506e-07, - "logits/chosen": -3.1202292442321777, - "logits/rejected": -3.105976104736328, - "logps/chosen": -43.32845687866211, - "logps/rejected": -42.27537155151367, - "loss": 0.6883, - "rewards/accuracies": 0.25, - "rewards/chosen": -0.028534067794680595, - "rewards/margins": 0.0005733096040785313, - "rewards/rejected": -0.02910737879574299, + "epoch": 0.5548387096774193, + "grad_norm": 8.743594644815907, + "learning_rate": 2.4430920053597355e-07, + "logits/chosen": -2.8366618156433105, + "logits/rejected": -2.7787930965423584, + "logps/chosen": -81.77251434326172, + "logps/rejected": -92.00411987304688, + "loss": 0.6421, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2051384150981903, + "rewards/margins": 0.1954495906829834, + "rewards/rejected": -0.4005880057811737, "step": 43 }, { - "epoch": 0.15873731679819617, - "grad_norm": 4.097820832663001, - "learning_rate": 4.949233679043654e-07, - "logits/chosen": -3.0720224380493164, - "logits/rejected": -3.0059666633605957, - "logps/chosen": -46.022666931152344, - "logps/rejected": -44.198524475097656, - "loss": 0.6882, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.04602278396487236, - "rewards/margins": 0.011173712089657784, - "rewards/rejected": -0.057196490466594696, + "epoch": 0.567741935483871, + "grad_norm": 7.640503263166919, + "learning_rate": 2.3293939665883228e-07, + "logits/chosen": -2.8904852867126465, + "logits/rejected": -2.877133369445801, + "logps/chosen": -81.6563949584961, + "logps/rejected": -94.8319320678711, + "loss": 0.6258, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.30271658301353455, + "rewards/margins": 0.1767008900642395, + "rewards/rejected": -0.47941750288009644, "step": 44 }, { - "epoch": 0.16234498308906425, - "grad_norm": 4.619746862244029, - "learning_rate": 4.942714683714531e-07, - "logits/chosen": -3.007413387298584, - "logits/rejected": -3.0036489963531494, - "logps/chosen": -55.64915084838867, - "logps/rejected": -56.559940338134766, - "loss": 0.6849, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.058474987745285034, - "rewards/margins": 0.015302611514925957, - "rewards/rejected": -0.07377760112285614, + "epoch": 0.5806451612903226, + "grad_norm": 7.729498181809134, + "learning_rate": 2.2160495348738124e-07, + "logits/chosen": -3.036078929901123, + "logits/rejected": -3.0348596572875977, + "logps/chosen": -64.10650634765625, + "logps/rejected": -89.60687255859375, + "loss": 0.6304, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.21719691157341003, + "rewards/margins": 0.24271026253700256, + "rewards/rejected": -0.4599071741104126, "step": 45 }, { - "epoch": 0.16595264937993234, - "grad_norm": 4.517640994061133, - "learning_rate": 4.935806850983033e-07, - "logits/chosen": -3.094602346420288, - "logits/rejected": -3.0135269165039062, - "logps/chosen": -74.85617065429688, - "logps/rejected": -79.79491424560547, - "loss": 0.6837, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.06277385354042053, - "rewards/margins": 0.04777727648615837, - "rewards/rejected": -0.11055111885070801, + "epoch": 0.5935483870967742, + "grad_norm": 8.226196925104558, + "learning_rate": 2.1032936338593717e-07, + "logits/chosen": -2.9982056617736816, + "logits/rejected": -3.010178327560425, + "logps/chosen": -75.5604476928711, + "logps/rejected": -85.93553161621094, + "loss": 0.6261, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.426933228969574, + "rewards/margins": -0.0191495418548584, + "rewards/rejected": -0.4077836871147156, "step": 46 }, { - "epoch": 0.16956031567080046, - "grad_norm": 4.3874832917927105, - "learning_rate": 4.928511280455168e-07, - "logits/chosen": -3.0445709228515625, - "logits/rejected": -3.032961130142212, - "logps/chosen": -39.556121826171875, - "logps/rejected": -41.15797805786133, - "loss": 0.684, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.07098544389009476, - "rewards/margins": 0.008738975040614605, - "rewards/rejected": -0.07972440868616104, + "epoch": 0.6064516129032258, + "grad_norm": 10.912368208375094, + "learning_rate": 1.9913599673684159e-07, + "logits/chosen": -3.070939540863037, + "logits/rejected": -3.0626533031463623, + "logps/chosen": -75.2443618774414, + "logps/rejected": -106.09898376464844, + "loss": 0.6116, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.18821322917938232, + "rewards/margins": 0.24638797342777252, + "rewards/rejected": -0.43460121750831604, "step": 47 }, { - "epoch": 0.17316798196166855, - "grad_norm": 4.169413386902496, - "learning_rate": 4.92082913345801e-07, - "logits/chosen": -2.850205183029175, - "logits/rejected": -2.8822288513183594, - "logps/chosen": -39.982269287109375, - "logps/rejected": -40.610504150390625, - "loss": 0.6827, - "rewards/accuracies": 0.25, - "rewards/chosen": -0.017116393893957138, - "rewards/margins": 0.002710124012082815, - "rewards/rejected": -0.01982651837170124, + "epoch": 0.6193548387096774, + "grad_norm": 10.218819822781606, + "learning_rate": 1.8804805350177506e-07, + "logits/chosen": -3.0395772457122803, + "logits/rejected": -3.069377899169922, + "logps/chosen": -74.85322570800781, + "logps/rejected": -102.59559631347656, + "loss": 0.5944, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.2960664629936218, + "rewards/margins": 0.21642781794071198, + "rewards/rejected": -0.5124942660331726, "step": 48 }, { - "epoch": 0.17677564825253664, - "grad_norm": 4.706380001454301, - "learning_rate": 4.912761632854832e-07, - "logits/chosen": -3.0092577934265137, - "logits/rejected": -2.931588649749756, - "logps/chosen": -75.39279174804688, - "logps/rejected": -74.2584228515625, - "loss": 0.6902, + "epoch": 0.632258064516129, + "grad_norm": 11.388294826021683, + "learning_rate": 1.7708851513629373e-07, + "logits/chosen": -2.810640811920166, + "logits/rejected": -2.8224596977233887, + "logps/chosen": -66.05618286132812, + "logps/rejected": -98.07613372802734, + "loss": 0.6453, "rewards/accuracies": 0.4375, - "rewards/chosen": -0.0948907881975174, - "rewards/margins": 0.002338982652872801, - "rewards/rejected": -0.09722977131605148, + "rewards/chosen": -0.34373006224632263, + "rewards/margins": 0.2323249876499176, + "rewards/rejected": -0.5760550498962402, "step": 49 }, { - "epoch": 0.18038331454340473, - "grad_norm": 4.518855935375837, - "learning_rate": 4.904310062850462e-07, - "logits/chosen": -3.133390188217163, - "logits/rejected": -3.106928825378418, - "logps/chosen": -65.8951187133789, - "logps/rejected": -63.197330474853516, - "loss": 0.6888, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.06748752295970917, - "rewards/margins": 0.020350340753793716, - "rewards/rejected": -0.08783785998821259, + "epoch": 0.6451612903225806, + "grad_norm": 13.864844054736514, + "learning_rate": 1.6628009695725346e-07, + "logits/chosen": -2.8773510456085205, + "logits/rejected": -2.8751275539398193, + "logps/chosen": -66.28228759765625, + "logps/rejected": -67.61710357666016, + "loss": 0.6199, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.16454900801181793, + "rewards/margins": 0.060019105672836304, + "rewards/rejected": -0.22456809878349304, "step": 50 }, { - "epoch": 0.18399098083427282, - "grad_norm": 4.520782362991715, - "learning_rate": 4.895475768786842e-07, - "logits/chosen": -3.0380477905273438, - "logits/rejected": -3.0168204307556152, - "logps/chosen": -49.39640808105469, - "logps/rejected": -49.188743591308594, - "loss": 0.6851, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.05433456599712372, - "rewards/margins": 0.012480136007070541, - "rewards/rejected": -0.06681470572948456, + "epoch": 0.6580645161290323, + "grad_norm": 11.09099423463829, + "learning_rate": 1.5564520106184643e-07, + "logits/chosen": -2.8890538215637207, + "logits/rejected": -2.8882861137390137, + "logps/chosen": -97.7024917602539, + "logps/rejected": -117.82125854492188, + "loss": 0.5991, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3737015128135681, + "rewards/margins": 0.14311935007572174, + "rewards/rejected": -0.5168208479881287, "step": 51 }, { - "epoch": 0.18759864712514093, - "grad_norm": 4.73187632341397, - "learning_rate": 4.886260156928887e-07, - "logits/chosen": -3.1190249919891357, - "logits/rejected": -3.0345208644866943, - "logps/chosen": -55.41284942626953, - "logps/rejected": -54.91582489013672, - "loss": 0.6816, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.07112475484609604, - "rewards/margins": 0.03103570081293583, - "rewards/rejected": -0.10216045379638672, + "epoch": 0.6709677419354839, + "grad_norm": 14.02314336885856, + "learning_rate": 1.4520586989583405e-07, + "logits/chosen": -3.058253049850464, + "logits/rejected": -3.0794386863708496, + "logps/chosen": -86.7257308959961, + "logps/rejected": -100.26595306396484, + "loss": 0.6199, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.21201437711715698, + "rewards/margins": -0.08268441259860992, + "rewards/rejected": -0.12932996451854706, "step": 52 }, { - "epoch": 0.19120631341600902, - "grad_norm": 5.253374777975737, - "learning_rate": 4.876664694240629e-07, - "logits/chosen": -3.0187482833862305, - "logits/rejected": -2.9662864208221436, - "logps/chosen": -57.75083923339844, - "logps/rejected": -54.512786865234375, - "loss": 0.6891, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.11051340401172638, - "rewards/margins": -0.007197379134595394, - "rewards/rejected": -0.10331601649522781, + "epoch": 0.6838709677419355, + "grad_norm": 15.2231921450545, + "learning_rate": 1.3498374056721196e-07, + "logits/chosen": -3.1192705631256104, + "logits/rejected": -3.145230770111084, + "logps/chosen": -45.069618225097656, + "logps/rejected": -71.2268295288086, + "loss": 0.6339, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.13725833594799042, + "rewards/margins": 0.12775808572769165, + "rewards/rejected": -0.26501643657684326, "step": 53 }, { - "epoch": 0.1948139797068771, - "grad_norm": 4.964608250951011, - "learning_rate": 4.866690908151697e-07, - "logits/chosen": -3.001500368118286, - "logits/rejected": -2.9302666187286377, - "logps/chosen": -70.85618591308594, - "logps/rejected": -75.05422973632812, - "loss": 0.6837, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.14847774803638458, - "rewards/margins": 0.04103374481201172, - "rewards/rejected": -0.1895114779472351, + "epoch": 0.6967741935483871, + "grad_norm": 10.43878298749505, + "learning_rate": 1.2500000000000005e-07, + "logits/chosen": -2.923105239868164, + "logits/rejected": -2.929281234741211, + "logps/chosen": -44.89529800415039, + "logps/rejected": -39.17671203613281, + "loss": 0.612, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.10784972459077835, + "rewards/margins": -0.016192324459552765, + "rewards/rejected": -0.09165740013122559, "step": 54 }, { - "epoch": 0.1984216459977452, - "grad_norm": 5.074181522586497, - "learning_rate": 4.856340386314181e-07, - "logits/chosen": -3.134683132171631, - "logits/rejected": -3.1000521183013916, - "logps/chosen": -77.0101318359375, - "logps/rejected": -71.63951873779297, - "loss": 0.6844, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.09386000037193298, - "rewards/margins": 0.0251320693641901, - "rewards/rejected": -0.11899206787347794, + "epoch": 0.7096774193548387, + "grad_norm": 12.476212432569776, + "learning_rate": 1.1527534102110611e-07, + "logits/chosen": -2.9852142333984375, + "logits/rejected": -3.0071215629577637, + "logps/chosen": -69.94696807861328, + "logps/rejected": -109.17233276367188, + "loss": 0.6082, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.21303395926952362, + "rewards/margins": 0.3135417103767395, + "rewards/rejected": -0.5265756845474243, "step": 55 }, { - "epoch": 0.20202931228861332, - "grad_norm": 5.008037783902186, - "learning_rate": 4.845614776349907e-07, - "logits/chosen": -3.141080141067505, - "logits/rejected": -3.06501841545105, - "logps/chosen": -81.1248779296875, - "logps/rejected": -85.57904815673828, - "loss": 0.6797, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.15610180795192719, - "rewards/margins": 0.049608003348112106, - "rewards/rejected": -0.2057098150253296, + "epoch": 0.7225806451612903, + "grad_norm": 7.382374495620578, + "learning_rate": 1.0582991947128323e-07, + "logits/chosen": -2.8991634845733643, + "logits/rejected": -2.997525453567505, + "logps/chosen": -60.843292236328125, + "logps/rejected": -86.19700622558594, + "loss": 0.6219, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.23299351334571838, + "rewards/margins": 0.20443448424339294, + "rewards/rejected": -0.43742799758911133, "step": 56 }, { - "epoch": 0.2056369785794814, - "grad_norm": 4.749875826450028, - "learning_rate": 4.834515785588161e-07, - "logits/chosen": -2.9905920028686523, - "logits/rejected": -3.0484495162963867, - "logps/chosen": -83.18892669677734, - "logps/rejected": -90.42606353759766, - "loss": 0.6915, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.19652573764324188, - "rewards/margins": 0.03644590824842453, - "rewards/rejected": -0.232971653342247, + "epoch": 0.7354838709677419, + "grad_norm": 7.692419552247762, + "learning_rate": 9.668331242907088e-08, + "logits/chosen": -3.0765204429626465, + "logits/rejected": -3.0679996013641357, + "logps/chosen": -65.09187316894531, + "logps/rejected": -85.44721984863281, + "loss": 0.6402, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.20165708661079407, + "rewards/margins": 0.18024533987045288, + "rewards/rejected": -0.38190245628356934, "step": 57 }, { - "epoch": 0.2092446448703495, - "grad_norm": 5.284694239272722, - "learning_rate": 4.823045180793913e-07, - "logits/chosen": -3.1047616004943848, - "logits/rejected": -3.0558719635009766, - "logps/chosen": -73.41265106201172, - "logps/rejected": -73.64460754394531, - "loss": 0.6816, - "rewards/accuracies": 0.25, - "rewards/chosen": -0.25620439648628235, - "rewards/margins": 0.0007482077926397324, - "rewards/rejected": -0.2569526135921478, + "epoch": 0.7483870967741936, + "grad_norm": 11.554834969612196, + "learning_rate": 8.785447763431101e-08, + "logits/chosen": -3.120288848876953, + "logits/rejected": -3.10617995262146, + "logps/chosen": -78.844970703125, + "logps/rejected": -106.19613647460938, + "loss": 0.612, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.2569999694824219, + "rewards/margins": 0.2700415253639221, + "rewards/rejected": -0.527041494846344, "step": 58 }, { - "epoch": 0.21285231116121758, - "grad_norm": 5.278096245783588, - "learning_rate": 4.81120478788658e-07, - "logits/chosen": -3.023015022277832, - "logits/rejected": -2.992161989212036, - "logps/chosen": -75.61831665039062, - "logps/rejected": -67.40644836425781, - "loss": 0.6825, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.315682590007782, - "rewards/margins": -0.00714217871427536, - "rewards/rejected": -0.308540403842926, + "epoch": 0.7612903225806451, + "grad_norm": 7.28020470137263, + "learning_rate": 7.936171419533652e-08, + "logits/chosen": -3.1455440521240234, + "logits/rejected": -3.1471190452575684, + "logps/chosen": -51.242835998535156, + "logps/rejected": -78.44454956054688, + "loss": 0.6152, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.046268824487924576, + "rewards/margins": 0.16216275095939636, + "rewards/rejected": -0.20843157172203064, "step": 59 }, { - "epoch": 0.21645997745208567, - "grad_norm": 5.76171627111517, - "learning_rate": 4.798996491649373e-07, - "logits/chosen": -3.1804051399230957, - "logits/rejected": -3.0553033351898193, - "logps/chosen": -85.3018798828125, - "logps/rejected": -90.02981567382812, - "loss": 0.6843, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.24002744257450104, - "rewards/margins": 0.05709411948919296, - "rewards/rejected": -0.2971215546131134, + "epoch": 0.7741935483870968, + "grad_norm": 7.0851502720539346, + "learning_rate": 7.122262466127513e-08, + "logits/chosen": -2.983321189880371, + "logits/rejected": -2.9774532318115234, + "logps/chosen": -85.1767807006836, + "logps/rejected": -111.3412857055664, + "loss": 0.6049, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4200911223888397, + "rewards/margins": 0.23350517451763153, + "rewards/rejected": -0.6535962820053101, "step": 60 }, { - "epoch": 0.2200676437429538, - "grad_norm": 5.492615379722156, - "learning_rate": 4.786422235429269e-07, - "logits/chosen": -3.010590076446533, - "logits/rejected": -2.954204559326172, - "logps/chosen": -80.36260986328125, - "logps/rejected": -80.47151947021484, - "loss": 0.6764, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.2392284870147705, - "rewards/margins": 0.004341253079473972, - "rewards/rejected": -0.2435697317123413, + "epoch": 0.7870967741935484, + "grad_norm": 10.713993216138782, + "learning_rate": 6.345407853807863e-08, + "logits/chosen": -2.8526759147644043, + "logits/rejected": -2.85900616645813, + "logps/chosen": -63.035057067871094, + "logps/rejected": -72.75942993164062, + "loss": 0.6374, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.2579822540283203, + "rewards/margins": 0.08359092473983765, + "rewards/rejected": -0.34157317876815796, "step": 61 }, { - "epoch": 0.22367531003382188, - "grad_norm": 5.859405971188221, - "learning_rate": 4.773484020827662e-07, - "logits/chosen": -3.037508487701416, - "logits/rejected": -3.059109687805176, - "logps/chosen": -58.52042770385742, - "logps/rejected": -62.00938415527344, - "loss": 0.6785, + "epoch": 0.8, + "grad_norm": 10.03605388575717, + "learning_rate": 5.607217732389502e-08, + "logits/chosen": -3.0021588802337646, + "logits/rejected": -3.0087029933929443, + "logps/chosen": -30.55845832824707, + "logps/rejected": -53.643280029296875, + "loss": 0.587, "rewards/accuracies": 0.3125, - "rewards/chosen": -0.1636224389076233, - "rewards/margins": 0.021534007042646408, - "rewards/rejected": -0.18515644967556, + "rewards/chosen": -0.10973939299583435, + "rewards/margins": 0.20178647339344025, + "rewards/rejected": -0.3115258812904358, "step": 62 }, { - "epoch": 0.22728297632468997, - "grad_norm": 6.40699603698559, - "learning_rate": 4.7601839073817564e-07, - "logits/chosen": -2.966259002685547, - "logits/rejected": -2.9369587898254395, - "logps/chosen": -74.35282897949219, - "logps/rejected": -76.79000091552734, - "loss": 0.6774, + "epoch": 0.8129032258064516, + "grad_norm": 6.2735761784405675, + "learning_rate": 4.909222113625544e-08, + "logits/chosen": -3.1388683319091797, + "logits/rejected": -3.0751585960388184, + "logps/chosen": -81.34771728515625, + "logps/rejected": -89.97032165527344, + "loss": 0.5877, "rewards/accuracies": 0.5625, - "rewards/chosen": -0.24077413976192474, - "rewards/margins": 0.07140924036502838, - "rewards/rejected": -0.31218335032463074, + "rewards/chosen": -0.19216133654117584, + "rewards/margins": 0.20118926465511322, + "rewards/rejected": -0.3933505713939667, "step": 63 }, { - "epoch": 0.23089064261555806, - "grad_norm": 5.677450933082558, - "learning_rate": 4.7465240122367054e-07, - "logits/chosen": -3.090595245361328, - "logits/rejected": -3.0663249492645264, - "logps/chosen": -78.77523803710938, - "logps/rejected": -78.72602081298828, - "loss": 0.6756, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.06116873770952225, - "rewards/margins": 0.02271055057644844, - "rewards/rejected": -0.08387928456068039, + "epoch": 0.8258064516129032, + "grad_norm": 11.716263562699654, + "learning_rate": 4.2528677000243737e-08, + "logits/chosen": -2.9592177867889404, + "logits/rejected": -2.9626688957214355, + "logps/chosen": -117.65725708007812, + "logps/rejected": -126.92681884765625, + "loss": 0.6193, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5141925811767578, + "rewards/margins": 0.1016259714961052, + "rewards/rejected": -0.6158185005187988, "step": 64 }, { - "epoch": 0.23449830890642615, - "grad_norm": 5.417970377682496, - "learning_rate": 4.732506509808614e-07, - "logits/chosen": -3.076111078262329, - "logits/rejected": -3.0489518642425537, - "logps/chosen": -66.13645935058594, - "logps/rejected": -60.44820785522461, - "loss": 0.6815, - "rewards/accuracies": 0.25, - "rewards/chosen": -0.1020764410495758, - "rewards/margins": 0.020927922800183296, - "rewards/rejected": -0.12300436198711395, + "epoch": 0.8387096774193549, + "grad_norm": 9.893509990517233, + "learning_rate": 3.6395148863377854e-08, + "logits/chosen": -2.8350701332092285, + "logits/rejected": -2.797614812850952, + "logps/chosen": -38.36604309082031, + "logps/rejected": -57.85443115234375, + "loss": 0.6155, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.16758887469768524, + "rewards/margins": 0.24942214787006378, + "rewards/rejected": -0.417011022567749, "step": 65 }, { - "epoch": 0.23810597519729426, - "grad_norm": 5.308860192306935, - "learning_rate": 4.7181336314384034e-07, - "logits/chosen": -2.9755256175994873, - "logits/rejected": -2.940260171890259, - "logps/chosen": -43.73695755004883, - "logps/rejected": -49.114418029785156, - "loss": 0.6696, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.08726543188095093, - "rewards/margins": 0.08850964903831482, - "rewards/rejected": -0.17577508091926575, + "epoch": 0.8516129032258064, + "grad_norm": 11.654525901199872, + "learning_rate": 3.0704349399351435e-08, + "logits/chosen": -2.892805814743042, + "logits/rejected": -2.926443099975586, + "logps/chosen": -63.805030822753906, + "logps/rejected": -109.76573181152344, + "loss": 0.6189, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.33767932653427124, + "rewards/margins": 0.3409982919692993, + "rewards/rejected": -0.6786776781082153, "step": 66 }, { - "epoch": 0.24171364148816235, - "grad_norm": 4.844400046155317, - "learning_rate": 4.703407665036622e-07, - "logits/chosen": -3.0616796016693115, - "logits/rejected": -3.0430803298950195, - "logps/chosen": -70.15087127685547, - "logps/rejected": -73.33533477783203, - "loss": 0.6828, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.10511312633752823, - "rewards/margins": 0.049545902758836746, - "rewards/rejected": -0.15465903282165527, + "epoch": 0.864516129032258, + "grad_norm": 9.135989557153602, + "learning_rate": 2.5468073659075996e-08, + "logits/chosen": -3.0026183128356934, + "logits/rejected": -2.9746150970458984, + "logps/chosen": -101.80465698242188, + "logps/rejected": -122.83572387695312, + "loss": 0.5887, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.3310484290122986, + "rewards/margins": 0.2203977406024933, + "rewards/rejected": -0.5514461994171143, "step": 67 }, { - "epoch": 0.24532130777903044, - "grad_norm": 9.056121595816778, - "learning_rate": 4.688330954719247e-07, - "logits/chosen": -2.9804787635803223, - "logits/rejected": -3.0142626762390137, - "logps/chosen": -58.71462631225586, - "logps/rejected": -59.18278121948242, - "loss": 0.6845, + "epoch": 0.8774193548387097, + "grad_norm": 9.564598187916838, + "learning_rate": 2.069717462363679e-08, + "logits/chosen": -3.0436861515045166, + "logits/rejected": -3.017408847808838, + "logps/chosen": -63.420555114746094, + "logps/rejected": -79.03422546386719, + "loss": 0.6229, "rewards/accuracies": 0.25, - "rewards/chosen": -0.09285889565944672, - "rewards/margins": -0.022356411442160606, - "rewards/rejected": -0.07050248235464096, + "rewards/chosen": -0.33887577056884766, + "rewards/margins": 0.19294096529483795, + "rewards/rejected": -0.5318167209625244, "step": 68 }, { - "epoch": 0.24892897406989853, - "grad_norm": 5.427412443476703, - "learning_rate": 4.6729059004345493e-07, - "logits/chosen": -3.039491891860962, - "logits/rejected": -2.978717803955078, - "logps/chosen": -45.64339065551758, - "logps/rejected": -49.950653076171875, - "loss": 0.6838, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.08918777108192444, - "rewards/margins": 0.02250598557293415, - "rewards/rejected": -0.11169375479221344, + "epoch": 0.8903225806451613, + "grad_norm": 8.446220058864208, + "learning_rate": 1.640154070983224e-08, + "logits/chosen": -2.9233558177948, + "logits/rejected": -2.9490303993225098, + "logps/chosen": -106.49418640136719, + "logps/rejected": -155.04872131347656, + "loss": 0.5833, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4763030707836151, + "rewards/margins": 0.36830809712409973, + "rewards/rejected": -0.8446111679077148, "step": 69 }, { - "epoch": 0.2525366403607666, - "grad_norm": 5.632699782018657, - "learning_rate": 4.657134957581057e-07, - "logits/chosen": -3.097475051879883, - "logits/rejected": -3.0688934326171875, - "logps/chosen": -61.59363555908203, - "logps/rejected": -61.80335235595703, - "loss": 0.6766, - "rewards/accuracies": 0.1875, - "rewards/chosen": -0.10563363879919052, - "rewards/margins": 0.013648715801537037, - "rewards/rejected": -0.11928234994411469, + "epoch": 0.9032258064516129, + "grad_norm": 11.736911118541979, + "learning_rate": 1.2590075274920203e-08, + "logits/chosen": -3.033332109451294, + "logits/rejected": -2.9890236854553223, + "logps/chosen": -72.19229125976562, + "logps/rejected": -91.49864959716797, + "loss": 0.643, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.18326060473918915, + "rewards/margins": 0.19467103481292725, + "rewards/rejected": -0.3779316544532776, "step": 70 }, { - "epoch": 0.25614430665163473, - "grad_norm": 6.237795334490199, - "learning_rate": 4.6410206366167006e-07, - "logits/chosen": -2.8893134593963623, - "logits/rejected": -2.8665285110473633, - "logps/chosen": -79.49488067626953, - "logps/rejected": -87.27457427978516, - "loss": 0.6639, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.14098061621189117, - "rewards/margins": 0.0641748383641243, - "rewards/rejected": -0.20515546202659607, + "epoch": 0.9161290322580645, + "grad_norm": 8.22057742391122, + "learning_rate": 9.270678163050217e-09, + "logits/chosen": -2.92671537399292, + "logits/rejected": -2.944516658782959, + "logps/chosen": -81.09933471679688, + "logps/rejected": -119.98873138427734, + "loss": 0.5992, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.39163094758987427, + "rewards/margins": 0.3819514513015747, + "rewards/rejected": -0.7735823392868042, "step": 71 }, { - "epoch": 0.2597519729425028, - "grad_norm": 6.540061367889613, - "learning_rate": 4.6245655026591933e-07, - "logits/chosen": -3.0924839973449707, - "logits/rejected": -3.0280892848968506, - "logps/chosen": -91.73502349853516, - "logps/rejected": -107.45960998535156, - "loss": 0.6616, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.06672094017267227, - "rewards/margins": 0.1718476265668869, - "rewards/rejected": -0.23856858909130096, + "epoch": 0.9290322580645162, + "grad_norm": 8.437281968122917, + "learning_rate": 6.450229331630253e-09, + "logits/chosen": -2.8446879386901855, + "logits/rejected": -2.8200840950012207, + "logps/chosen": -79.77139282226562, + "logps/rejected": -117.7776107788086, + "loss": 0.6254, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.25027692317962646, + "rewards/margins": 0.39766794443130493, + "rewards/rejected": -0.6479449272155762, "step": 72 }, { - "epoch": 0.2633596392333709, - "grad_norm": 6.491558061892799, - "learning_rate": 4.607772175077711e-07, - "logits/chosen": -3.1811912059783936, - "logits/rejected": -3.1132688522338867, - "logps/chosen": -54.56733703613281, - "logps/rejected": -55.28483963012695, - "loss": 0.6743, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.10147538781166077, - "rewards/margins": 0.05391587316989899, - "rewards/rejected": -0.15539124608039856, + "epoch": 0.9419354838709677, + "grad_norm": 10.325301780650673, + "learning_rate": 4.1345745915644935e-09, + "logits/chosen": -3.0622751712799072, + "logits/rejected": -3.030594825744629, + "logps/chosen": -88.93402862548828, + "logps/rejected": -100.32481384277344, + "loss": 0.6015, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.345296174287796, + "rewards/margins": 0.1704256534576416, + "rewards/rejected": -0.51572185754776, "step": 73 }, { - "epoch": 0.26696730552423903, - "grad_norm": 6.2815579420003, - "learning_rate": 4.5906433270759293e-07, - "logits/chosen": -3.025987148284912, - "logits/rejected": -3.0501232147216797, - "logps/chosen": -73.81613159179688, - "logps/rejected": -75.47962188720703, - "loss": 0.6787, + "epoch": 0.9548387096774194, + "grad_norm": 15.320595517735114, + "learning_rate": 2.328513490917311e-09, + "logits/chosen": -2.8475613594055176, + "logits/rejected": -2.8002405166625977, + "logps/chosen": -102.33820343017578, + "logps/rejected": -135.93002319335938, + "loss": 0.5982, "rewards/accuracies": 0.4375, - "rewards/chosen": -0.27066299319267273, - "rewards/margins": 0.035506900399923325, - "rewards/rejected": -0.30616986751556396, + "rewards/chosen": -0.46599456667900085, + "rewards/margins": 0.20195163786411285, + "rewards/rejected": -0.6679461598396301, "step": 74 }, { - "epoch": 0.2705749718151071, - "grad_norm": 6.543371842247222, - "learning_rate": 4.5731816852665014e-07, - "logits/chosen": -2.980196714401245, - "logits/rejected": -2.945509195327759, - "logps/chosen": -79.3486328125, - "logps/rejected": -76.48310089111328, - "loss": 0.656, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.2180095911026001, - "rewards/margins": 0.003370078280568123, - "rewards/rejected": -0.22137968242168427, + "epoch": 0.967741935483871, + "grad_norm": 12.878552628840982, + "learning_rate": 1.035789367117179e-09, + "logits/chosen": -2.999194622039795, + "logits/rejected": -2.9982104301452637, + "logps/chosen": -38.21251678466797, + "logps/rejected": -50.88334274291992, + "loss": 0.6391, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.12685494124889374, + "rewards/margins": 0.12716560065746307, + "rewards/rejected": -0.2540205419063568, "step": 75 }, { - "epoch": 0.2741826381059752, - "grad_norm": 7.356479611388927, - "learning_rate": 4.555390029237025e-07, - "logits/chosen": -3.117149591445923, - "logits/rejected": -3.0874693393707275, - "logps/chosen": -52.4029541015625, - "logps/rejected": -52.73406982421875, - "loss": 0.6812, + "epoch": 0.9806451612903225, + "grad_norm": 8.7924927116735, + "learning_rate": 2.5908158831811077e-10, + "logits/chosen": -2.9358251094818115, + "logits/rejected": -2.925405979156494, + "logps/chosen": -51.06131362915039, + "logps/rejected": -58.798614501953125, + "loss": 0.6059, "rewards/accuracies": 0.25, - "rewards/chosen": -0.13237334787845612, - "rewards/margins": 0.02146833762526512, - "rewards/rejected": -0.15384167432785034, + "rewards/chosen": -0.2657967209815979, + "rewards/margins": 0.02999306283891201, + "rewards/rejected": -0.29578977823257446, "step": 76 }, { - "epoch": 0.27779030439684327, - "grad_norm": 7.410938365308492, - "learning_rate": 4.5372711911075846e-07, - "logits/chosen": -2.9389328956604004, - "logits/rejected": -2.9145894050598145, - "logps/chosen": -87.16389465332031, - "logps/rejected": -87.20050048828125, - "loss": 0.6788, + "epoch": 0.9935483870967742, + "grad_norm": 11.703381693919436, + "learning_rate": 0.0, + "logits/chosen": -2.8389170169830322, + "logits/rejected": -2.8045477867126465, + "logps/chosen": -82.48759460449219, + "logps/rejected": -92.22787475585938, + "loss": 0.6105, "rewards/accuracies": 0.375, - "rewards/chosen": -0.32804983854293823, - "rewards/margins": 0.006163815036416054, - "rewards/rejected": -0.33421361446380615, + "rewards/chosen": -0.343311071395874, + "rewards/margins": 0.20035406947135925, + "rewards/rejected": -0.5436651706695557, "step": 77 }, { - "epoch": 0.2813979706877114, - "grad_norm": 6.712219727973192, - "learning_rate": 4.5188280550799245e-07, - "logits/chosen": -3.127492904663086, - "logits/rejected": -3.109748601913452, - "logps/chosen": -100.02323150634766, - "logps/rejected": -107.56802368164062, - "loss": 0.6793, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.3727189898490906, - "rewards/margins": 0.08715255558490753, - "rewards/rejected": -0.4598715305328369, - "step": 78 - }, - { - "epoch": 0.2850056369785795, - "grad_norm": 10.56186082375539, - "learning_rate": 4.500063556978336e-07, - "logits/chosen": -3.0038719177246094, - "logits/rejected": -2.954413414001465, - "logps/chosen": -95.82234954833984, - "logps/rejected": -87.31886291503906, - "loss": 0.6806, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.4763154685497284, - "rewards/margins": 0.0035571642220020294, - "rewards/rejected": -0.47987261414527893, - "step": 79 - }, - { - "epoch": 0.28861330326944756, - "grad_norm": 7.503356290121524, - "learning_rate": 4.480980683782325e-07, - "logits/chosen": -2.9749221801757812, - "logits/rejected": -2.9812374114990234, - "logps/chosen": -103.54981231689453, - "logps/rejected": -104.48420715332031, - "loss": 0.664, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.456723153591156, - "rewards/margins": 0.03572231903672218, - "rewards/rejected": -0.49244555830955505, - "step": 80 - }, - { - "epoch": 0.2922209695603157, - "grad_norm": 7.830297865869621, - "learning_rate": 4.4615824731511363e-07, - "logits/chosen": -2.6993398666381836, - "logits/rejected": -2.694842576980591, - "logps/chosen": -67.55276489257812, - "logps/rejected": -66.96586608886719, - "loss": 0.6779, - "rewards/accuracies": 0.25, - "rewards/chosen": -0.3337884843349457, - "rewards/margins": -0.0012375228106975555, - "rewards/rejected": -0.33255094289779663, - "step": 81 - }, - { - "epoch": 0.29582863585118374, - "grad_norm": 7.336729630446761, - "learning_rate": 4.4418720129402136e-07, - "logits/chosen": -2.8738903999328613, - "logits/rejected": -2.8631696701049805, - "logps/chosen": -56.175960540771484, - "logps/rejected": -59.59237289428711, - "loss": 0.6749, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.33598408102989197, - "rewards/margins": 0.054874226450920105, - "rewards/rejected": -0.3908582925796509, - "step": 82 - }, - { - "epoch": 0.29943630214205186, - "grad_norm": 6.973702754924946, - "learning_rate": 4.4218524407096656e-07, - "logits/chosen": -2.9357500076293945, - "logits/rejected": -2.920872688293457, - "logps/chosen": -59.18834686279297, - "logps/rejected": -67.08983612060547, - "loss": 0.6604, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.26184171438217163, - "rewards/margins": 0.07276714593172073, - "rewards/rejected": -0.33460885286331177, - "step": 83 - }, - { - "epoch": 0.30304396843292, - "grad_norm": 8.155016596381941, - "learning_rate": 4.4015269432248213e-07, - "logits/chosen": -3.07014536857605, - "logits/rejected": -3.0590834617614746, - "logps/chosen": -114.27590942382812, - "logps/rejected": -126.73323059082031, - "loss": 0.6687, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.3542146682739258, - "rewards/margins": 0.13344313204288483, - "rewards/rejected": -0.4876577854156494, - "step": 84 - }, - { - "epoch": 0.30665163472378804, - "grad_norm": 7.85490303862283, - "learning_rate": 4.380898755948953e-07, - "logits/chosen": -3.0937111377716064, - "logits/rejected": -3.0751538276672363, - "logps/chosen": -76.29114532470703, - "logps/rejected": -79.89305114746094, - "loss": 0.6642, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.1871543824672699, - "rewards/margins": 0.07309264689683914, - "rewards/rejected": -0.26024705171585083, - "step": 85 - }, - { - "epoch": 0.31025930101465615, - "grad_norm": 7.168624243086863, - "learning_rate": 4.359971162528247e-07, - "logits/chosen": -2.9832687377929688, - "logits/rejected": -2.9627914428710938, - "logps/chosen": -40.997859954833984, - "logps/rejected": -43.25674819946289, - "loss": 0.6549, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.14379273355007172, - "rewards/margins": 0.03346078842878342, - "rewards/rejected": -0.17725351452827454, - "step": 86 - }, - { - "epoch": 0.3138669673055242, - "grad_norm": 8.329101082733047, - "learning_rate": 4.338747494269105e-07, - "logits/chosen": -2.938364267349243, - "logits/rejected": -2.9644434452056885, - "logps/chosen": -50.71893310546875, - "logps/rejected": -52.61332321166992, - "loss": 0.6697, - "rewards/accuracies": 0.1875, - "rewards/chosen": -0.18301628530025482, - "rewards/margins": -0.016642173752188683, - "rewards/rejected": -0.1663741022348404, - "step": 87 - }, - { - "epoch": 0.31747463359639233, - "grad_norm": 8.785725364488357, - "learning_rate": 4.317231129607859e-07, - "logits/chosen": -3.051335334777832, - "logits/rejected": -2.973188638687134, - "logps/chosen": -71.78001403808594, - "logps/rejected": -79.94705200195312, - "loss": 0.6649, - "rewards/accuracies": 0.25, - "rewards/chosen": -0.28318947553634644, - "rewards/margins": 0.12954573333263397, - "rewards/rejected": -0.4127352237701416, - "step": 88 - }, - { - "epoch": 0.32108229988726045, - "grad_norm": 9.466398883860359, - "learning_rate": 4.295425493572982e-07, - "logits/chosen": -2.9624252319335938, - "logits/rejected": -2.977269172668457, - "logps/chosen": -80.85143280029297, - "logps/rejected": -85.35714721679688, - "loss": 0.6511, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.2776328921318054, - "rewards/margins": 0.055954016745090485, - "rewards/rejected": -0.3335869014263153, - "step": 89 - }, - { - "epoch": 0.3246899661781285, - "grad_norm": 8.43255488774325, - "learning_rate": 4.2733340572398835e-07, - "logits/chosen": -2.987386703491211, - "logits/rejected": -2.9460248947143555, - "logps/chosen": -58.54926300048828, - "logps/rejected": -59.32145690917969, - "loss": 0.6589, - "rewards/accuracies": 0.1875, - "rewards/chosen": -0.19937515258789062, - "rewards/margins": 0.03408390283584595, - "rewards/rejected": -0.23345905542373657, - "step": 90 - }, - { - "epoch": 0.3282976324689966, - "grad_norm": 8.787587026321777, - "learning_rate": 4.250960337178377e-07, - "logits/chosen": -2.9402103424072266, - "logits/rejected": -2.978181838989258, - "logps/chosen": -65.70238494873047, - "logps/rejected": -70.6248779296875, - "loss": 0.6428, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.36394602060317993, - "rewards/margins": 0.03319575637578964, - "rewards/rejected": -0.3971417248249054, - "step": 91 - }, - { - "epoch": 0.3319052987598647, - "grad_norm": 10.794527233456202, - "learning_rate": 4.228307894892902e-07, - "logits/chosen": -3.0047225952148438, - "logits/rejected": -2.878885269165039, - "logps/chosen": -145.8394317626953, - "logps/rejected": -148.85208129882812, - "loss": 0.6387, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.6903162598609924, - "rewards/margins": 0.10916871577501297, - "rewards/rejected": -0.7994850277900696, - "step": 92 - }, - { - "epoch": 0.3355129650507328, - "grad_norm": 10.307780184662892, - "learning_rate": 4.205380336255594e-07, - "logits/chosen": -2.987762451171875, - "logits/rejected": -2.9657390117645264, - "logps/chosen": -65.02996063232422, - "logps/rejected": -75.74144744873047, - "loss": 0.6515, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.2750554382801056, - "rewards/margins": 0.11421419680118561, - "rewards/rejected": -0.3892696499824524, - "step": 93 - }, - { - "epoch": 0.3391206313416009, - "grad_norm": 9.921397519916923, - "learning_rate": 4.182181310932297e-07, - "logits/chosen": -3.0359809398651123, - "logits/rejected": -3.0303328037261963, - "logps/chosen": -71.20989227294922, - "logps/rejected": -76.57341766357422, - "loss": 0.6763, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.32689860463142395, - "rewards/margins": 0.08039885014295578, - "rewards/rejected": -0.4072974622249603, - "step": 94 - }, - { - "epoch": 0.342728297632469, - "grad_norm": 12.007682083945625, - "learning_rate": 4.158714511801594e-07, - "logits/chosen": -3.062500238418579, - "logits/rejected": -2.9682044982910156, - "logps/chosen": -150.4534149169922, - "logps/rejected": -169.4901123046875, - "loss": 0.662, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.8838521242141724, - "rewards/margins": 0.20915710926055908, - "rewards/rejected": -1.0930092334747314, - "step": 95 - }, - { - "epoch": 0.3463359639233371, - "grad_norm": 15.556480936118733, - "learning_rate": 4.134983674366973e-07, - "logits/chosen": -2.8261094093322754, - "logits/rejected": -2.8071787357330322, - "logps/chosen": -106.62940216064453, - "logps/rejected": -132.07650756835938, - "loss": 0.6301, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.5544123649597168, - "rewards/margins": 0.2631354331970215, - "rewards/rejected": -0.8175479173660278, - "step": 96 - }, - { - "epoch": 0.34994363021420516, - "grad_norm": 15.73303830602391, - "learning_rate": 4.110992576162192e-07, - "logits/chosen": -2.9027836322784424, - "logits/rejected": -2.886002540588379, - "logps/chosen": -70.5209732055664, - "logps/rejected": -79.7185287475586, - "loss": 0.6602, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.28252050280570984, - "rewards/margins": 0.12111782282590866, - "rewards/rejected": -0.4036383628845215, - "step": 97 - }, - { - "epoch": 0.3535512965050733, - "grad_norm": 15.397086015885561, - "learning_rate": 4.08674503614997e-07, - "logits/chosen": -2.889780044555664, - "logits/rejected": -2.92999267578125, - "logps/chosen": -119.18766021728516, - "logps/rejected": -141.76385498046875, - "loss": 0.6397, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.7728019952774048, - "rewards/margins": 0.19432534277439117, - "rewards/rejected": -0.9671273827552795, - "step": 98 - }, - { - "epoch": 0.3571589627959414, - "grad_norm": 17.22132335094817, - "learning_rate": 4.062244914114068e-07, - "logits/chosen": -2.9713516235351562, - "logits/rejected": -2.968160390853882, - "logps/chosen": -102.42787170410156, - "logps/rejected": -107.99383544921875, - "loss": 0.6251, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.5578393340110779, - "rewards/margins": 0.07280861586332321, - "rewards/rejected": -0.6306478977203369, - "step": 99 - }, - { - "epoch": 0.36076662908680945, - "grad_norm": 14.117772430240619, - "learning_rate": 4.037496110044884e-07, - "logits/chosen": -3.1235404014587402, - "logits/rejected": -3.050529956817627, - "logps/chosen": -94.09088897705078, - "logps/rejected": -126.543212890625, - "loss": 0.6354, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.42357906699180603, - "rewards/margins": 0.29684561491012573, - "rewards/rejected": -0.7204247117042542, - "step": 100 - }, - { - "epoch": 0.36437429537767757, - "grad_norm": 16.786777865237188, - "learning_rate": 4.0125025635186405e-07, - "logits/chosen": -3.0158231258392334, - "logits/rejected": -3.0204453468322754, - "logps/chosen": -116.05175018310547, - "logps/rejected": -128.13600158691406, - "loss": 0.6606, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.4721571207046509, - "rewards/margins": 0.1276375651359558, - "rewards/rejected": -0.5997946262359619, - "step": 101 - }, - { - "epoch": 0.36798196166854563, - "grad_norm": 14.579924870854553, - "learning_rate": 3.9872682530702726e-07, - "logits/chosen": -2.9454610347747803, - "logits/rejected": -2.97355055809021, - "logps/chosen": -66.14591217041016, - "logps/rejected": -84.91130828857422, - "loss": 0.6486, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.33762845396995544, - "rewards/margins": 0.18099550902843475, - "rewards/rejected": -0.5186240077018738, - "step": 102 - }, - { - "epoch": 0.37158962795941375, - "grad_norm": 23.289287453811884, - "learning_rate": 3.961797195560118e-07, - "logits/chosen": -3.0300021171569824, - "logits/rejected": -2.9782848358154297, - "logps/chosen": -102.82548522949219, - "logps/rejected": -124.77222442626953, - "loss": 0.6703, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.45521754026412964, - "rewards/margins": 0.21448729932308197, - "rewards/rejected": -0.6697048544883728, - "step": 103 - }, - { - "epoch": 0.37519729425028187, - "grad_norm": 20.79755798043576, - "learning_rate": 3.9360934455344973e-07, - "logits/chosen": -2.8588948249816895, - "logits/rejected": -2.916116237640381, - "logps/chosen": -121.17371368408203, - "logps/rejected": -123.46075439453125, - "loss": 0.6622, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.5858777761459351, - "rewards/margins": -0.007589234039187431, - "rewards/rejected": -0.5782886147499084, - "step": 104 - }, - { - "epoch": 0.3788049605411499, - "grad_norm": 15.690072842495221, - "learning_rate": 3.9101610945803083e-07, - "logits/chosen": -3.0194122791290283, - "logits/rejected": -2.9381792545318604, - "logps/chosen": -148.06057739257812, - "logps/rejected": -180.38914489746094, - "loss": 0.6395, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.7008622884750366, - "rewards/margins": 0.3252592086791992, - "rewards/rejected": -1.0261214971542358, - "step": 105 - }, - { - "epoch": 0.38241262683201804, - "grad_norm": 15.15128129147874, - "learning_rate": 3.8840042706737107e-07, - "logits/chosen": -2.777494192123413, - "logits/rejected": -2.7841367721557617, - "logps/chosen": -85.67420196533203, - "logps/rejected": -102.62860107421875, - "loss": 0.6814, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.41453635692596436, - "rewards/margins": 0.16942985355854034, - "rewards/rejected": -0.5839661955833435, - "step": 106 - }, - { - "epoch": 0.38602029312288616, - "grad_norm": 12.724021180124803, - "learning_rate": 3.857627137523031e-07, - "logits/chosen": -2.927671432495117, - "logits/rejected": -2.8235867023468018, - "logps/chosen": -110.81070709228516, - "logps/rejected": -146.58822631835938, - "loss": 0.663, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.5489624738693237, - "rewards/margins": 0.367707222700119, - "rewards/rejected": -0.9166697263717651, - "step": 107 - }, - { - "epoch": 0.3896279594137542, - "grad_norm": 14.267449569584038, - "learning_rate": 3.831033893905964e-07, - "logits/chosen": -2.7638843059539795, - "logits/rejected": -2.702754259109497, - "logps/chosen": -95.52771759033203, - "logps/rejected": -92.77847290039062, - "loss": 0.6751, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.4860032796859741, - "rewards/margins": 0.035816505551338196, - "rewards/rejected": -0.5218197703361511, - "step": 108 - }, - { - "epoch": 0.39323562570462234, - "grad_norm": 12.61405259795272, - "learning_rate": 3.804228773001211e-07, - "logits/chosen": -2.939096212387085, - "logits/rejected": -2.8745901584625244, - "logps/chosen": -105.72749328613281, - "logps/rejected": -125.56944274902344, - "loss": 0.6634, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.45705926418304443, - "rewards/margins": 0.2192366123199463, - "rewards/rejected": -0.6762958765029907, - "step": 109 - }, - { - "epoch": 0.3968432919954904, - "grad_norm": 13.878350385096809, - "learning_rate": 3.7772160417146257e-07, - "logits/chosen": -2.974313974380493, - "logits/rejected": -2.8819642066955566, - "logps/chosen": -117.89132690429688, - "logps/rejected": -130.11300659179688, - "loss": 0.6674, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.6227794885635376, - "rewards/margins": 0.1266952008008957, - "rewards/rejected": -0.7494745850563049, - "step": 110 - }, - { - "epoch": 0.4004509582863585, - "grad_norm": 13.393327401919608, - "learning_rate": 3.75e-07, - "logits/chosen": -3.0776989459991455, - "logits/rejected": -3.0629327297210693, - "logps/chosen": -135.2939453125, - "logps/rejected": -170.3146209716797, - "loss": 0.6281, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.6238767504692078, - "rewards/margins": 0.3313390016555786, - "rewards/rejected": -0.9552158713340759, - "step": 111 - }, - { - "epoch": 0.40405862457722663, - "grad_norm": 13.237834231226294, - "learning_rate": 3.722584980174583e-07, - "logits/chosen": -2.971565008163452, - "logits/rejected": -2.9340901374816895, - "logps/chosen": -75.61985778808594, - "logps/rejected": -80.21390533447266, - "loss": 0.6762, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.2932888865470886, - "rewards/margins": 0.06129308044910431, - "rewards/rejected": -0.3545819818973541, - "step": 112 - }, - { - "epoch": 0.4076662908680947, - "grad_norm": 25.56737619838537, - "learning_rate": 3.6949753462294574e-07, - "logits/chosen": -3.0656042098999023, - "logits/rejected": -3.032794713973999, - "logps/chosen": -94.67741394042969, - "logps/rejected": -108.54988861083984, - "loss": 0.6395, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.44096264243125916, - "rewards/margins": 0.1457306444644928, - "rewards/rejected": -0.586693286895752, - "step": 113 - }, - { - "epoch": 0.4112739571589628, - "grad_norm": 13.473034803634468, - "learning_rate": 3.667175493134864e-07, - "logits/chosen": -3.0270774364471436, - "logits/rejected": -3.027925491333008, - "logps/chosen": -115.76703643798828, - "logps/rejected": -135.0316162109375, - "loss": 0.641, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.5511752367019653, - "rewards/margins": 0.15001626312732697, - "rewards/rejected": -0.7011915445327759, - "step": 114 - }, - { - "epoch": 0.41488162344983087, - "grad_norm": 10.757296070280612, - "learning_rate": 3.639189846140604e-07, - "logits/chosen": -2.7936909198760986, - "logits/rejected": -2.7856245040893555, - "logps/chosen": -78.51675415039062, - "logps/rejected": -104.93350982666016, - "loss": 0.6324, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.3310379385948181, - "rewards/margins": 0.2887319326400757, - "rewards/rejected": -0.6197698712348938, - "step": 115 - }, - { - "epoch": 0.418489289740699, - "grad_norm": 12.552978884799218, - "learning_rate": 3.61102286007161e-07, - "logits/chosen": -2.7983784675598145, - "logits/rejected": -2.7699077129364014, - "logps/chosen": -103.65501403808594, - "logps/rejected": -117.89797973632812, - "loss": 0.6386, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.5203580260276794, - "rewards/margins": 0.17265889048576355, - "rewards/rejected": -0.6930170059204102, - "step": 116 - }, - { - "epoch": 0.4220969560315671, - "grad_norm": 10.96410231920964, - "learning_rate": 3.582679018618822e-07, - "logits/chosen": -2.515772819519043, - "logits/rejected": -2.5191540718078613, - "logps/chosen": -111.01329803466797, - "logps/rejected": -129.30508422851562, - "loss": 0.6272, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.5996973514556885, - "rewards/margins": 0.20019006729125977, - "rewards/rejected": -0.7998873591423035, - "step": 117 - }, - { - "epoch": 0.42570462232243517, - "grad_norm": 21.055071910739645, - "learning_rate": 3.55416283362546e-07, - "logits/chosen": -3.005600929260254, - "logits/rejected": -2.9773454666137695, - "logps/chosen": -140.0772247314453, - "logps/rejected": -150.26556396484375, - "loss": 0.682, - "rewards/accuracies": 0.25, - "rewards/chosen": -0.8993721008300781, - "rewards/margins": 0.10675337165594101, - "rewards/rejected": -1.0061254501342773, - "step": 118 - }, - { - "epoch": 0.4293122886133033, - "grad_norm": 12.944971672563094, - "learning_rate": 3.525478844368818e-07, - "logits/chosen": -2.974605083465576, - "logits/rejected": -2.9158968925476074, - "logps/chosen": -105.04194641113281, - "logps/rejected": -114.68635559082031, - "loss": 0.6569, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.6415022611618042, - "rewards/margins": 0.12235216796398163, - "rewards/rejected": -0.7638545036315918, - "step": 119 - }, - { - "epoch": 0.43291995490417134, - "grad_norm": 14.767772539789915, - "learning_rate": 3.49663161683769e-07, - "logits/chosen": -2.9451308250427246, - "logits/rejected": -2.8823275566101074, - "logps/chosen": -114.45256805419922, - "logps/rejected": -127.80436706542969, - "loss": 0.6411, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.6361675262451172, - "rewards/margins": 0.15846434235572815, - "rewards/rejected": -0.794631838798523, - "step": 120 - }, - { - "epoch": 0.43652762119503946, - "grad_norm": 13.067383722887056, - "learning_rate": 3.467625743005543e-07, - "logits/chosen": -2.9372665882110596, - "logits/rejected": -2.8659605979919434, - "logps/chosen": -157.3434295654297, - "logps/rejected": -167.2296600341797, - "loss": 0.6554, - "rewards/accuracies": 0.375, - "rewards/chosen": -1.0550791025161743, - "rewards/margins": 0.09769081324338913, - "rewards/rejected": -1.152769923210144, - "step": 121 - }, - { - "epoch": 0.4401352874859076, - "grad_norm": 22.88669592245508, - "learning_rate": 3.43846584009956e-07, - "logits/chosen": -2.800259590148926, - "logits/rejected": -2.793133497238159, - "logps/chosen": -161.44972229003906, - "logps/rejected": -176.51881408691406, - "loss": 0.6657, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.9232314825057983, - "rewards/margins": 0.13970746099948883, - "rewards/rejected": -1.0629390478134155, - "step": 122 - }, - { - "epoch": 0.44374295377677564, - "grad_norm": 19.90415160315299, - "learning_rate": 3.4091565498656534e-07, - "logits/chosen": -2.9406120777130127, - "logits/rejected": -2.8403947353363037, - "logps/chosen": -129.9938201904297, - "logps/rejected": -170.3994598388672, - "loss": 0.6771, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.6933571100234985, - "rewards/margins": 0.4637371599674225, - "rewards/rejected": -1.1570942401885986, - "step": 123 - }, - { - "epoch": 0.44735062006764376, - "grad_norm": 16.289712391920492, - "learning_rate": 3.3797025378295826e-07, - "logits/chosen": -2.60408616065979, - "logits/rejected": -2.607609510421753, - "logps/chosen": -110.75968170166016, - "logps/rejected": -148.51022338867188, - "loss": 0.6183, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.6847554445266724, - "rewards/margins": 0.3520004153251648, - "rewards/rejected": -1.0367558002471924, - "step": 124 - }, - { - "epoch": 0.4509582863585118, - "grad_norm": 21.99029182209784, - "learning_rate": 3.350108492554284e-07, - "logits/chosen": -2.9422686100006104, - "logits/rejected": -2.9620985984802246, - "logps/chosen": -157.82647705078125, - "logps/rejected": -186.3019256591797, - "loss": 0.6407, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.7670837044715881, - "rewards/margins": 0.2764733135700226, - "rewards/rejected": -1.043557047843933, - "step": 125 - }, - { - "epoch": 0.45456595264937993, - "grad_norm": 14.866743628532555, - "learning_rate": 3.320379124893533e-07, - "logits/chosen": -2.8409082889556885, - "logits/rejected": -2.789806365966797, - "logps/chosen": -155.82882690429688, - "logps/rejected": -164.99630737304688, - "loss": 0.6269, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.9507042169570923, - "rewards/margins": 0.08565385639667511, - "rewards/rejected": -1.0363579988479614, - "step": 126 - }, - { - "epoch": 0.45817361894024805, - "grad_norm": 14.323658528910146, - "learning_rate": 3.29051916724206e-07, - "logits/chosen": -2.9115352630615234, - "logits/rejected": -2.8558108806610107, - "logps/chosen": -79.81935119628906, - "logps/rejected": -88.44755554199219, - "loss": 0.637, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.4422595202922821, - "rewards/margins": 0.09828019142150879, - "rewards/rejected": -0.5405396819114685, - "step": 127 - }, - { - "epoch": 0.4617812852311161, - "grad_norm": 16.724223779580853, - "learning_rate": 3.2605333727822334e-07, - "logits/chosen": -2.7589972019195557, - "logits/rejected": -2.8023018836975098, - "logps/chosen": -139.31642150878906, - "logps/rejected": -141.2412567138672, - "loss": 0.6424, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.9360347986221313, - "rewards/margins": 0.0007151514291763306, - "rewards/rejected": -0.9367499351501465, - "step": 128 - }, - { - "epoch": 0.46538895152198423, - "grad_norm": 15.299047854836443, - "learning_rate": 3.230426514727439e-07, - "logits/chosen": -2.873906135559082, - "logits/rejected": -2.7736876010894775, - "logps/chosen": -112.08553314208984, - "logps/rejected": -114.5158462524414, - "loss": 0.6546, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.702272355556488, - "rewards/margins": 0.058246057480573654, - "rewards/rejected": -0.7605184316635132, - "step": 129 - }, - { - "epoch": 0.4689966178128523, - "grad_norm": 13.522411703015429, - "learning_rate": 3.200203385562268e-07, - "logits/chosen": -2.9093048572540283, - "logits/rejected": -2.7951672077178955, - "logps/chosen": -66.85855865478516, - "logps/rejected": -85.58715057373047, - "loss": 0.6385, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.3744560480117798, - "rewards/margins": 0.23534807562828064, - "rewards/rejected": -0.609804093837738, - "step": 130 - }, - { - "epoch": 0.4726042841037204, - "grad_norm": 18.021679614914405, - "learning_rate": 3.169868796279633e-07, - "logits/chosen": -2.798247814178467, - "logits/rejected": -2.675122022628784, - "logps/chosen": -151.86956787109375, - "logps/rejected": -192.33258056640625, - "loss": 0.6492, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.84661865234375, - "rewards/margins": 0.44758865237236023, - "rewards/rejected": -1.2942073345184326, - "step": 131 - }, - { - "epoch": 0.4762119503945885, - "grad_norm": 14.974953775133617, - "learning_rate": 3.1394275756149503e-07, - "logits/chosen": -2.955599308013916, - "logits/rejected": -2.9406235218048096, - "logps/chosen": -80.21049499511719, - "logps/rejected": -90.9979248046875, - "loss": 0.625, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.4655758738517761, - "rewards/margins": 0.127346932888031, - "rewards/rejected": -0.5929228067398071, - "step": 132 - }, - { - "epoch": 0.4798196166854566, - "grad_norm": 26.339961425073355, - "learning_rate": 3.1088845692774795e-07, - "logits/chosen": -2.9236092567443848, - "logits/rejected": -2.9097344875335693, - "logps/chosen": -59.16986083984375, - "logps/rejected": -70.45954895019531, - "loss": 0.6129, - "rewards/accuracies": 0.25, - "rewards/chosen": -0.40091997385025024, - "rewards/margins": 0.12422917783260345, - "rewards/rejected": -0.5251491665840149, - "step": 133 - }, - { - "epoch": 0.4834272829763247, - "grad_norm": 30.606445898377437, - "learning_rate": 3.0782446391789827e-07, - "logits/chosen": -2.7647979259490967, - "logits/rejected": -2.688325881958008, - "logps/chosen": -117.26891326904297, - "logps/rejected": -140.15887451171875, - "loss": 0.6636, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.7809666395187378, - "rewards/margins": 0.24227336049079895, - "rewards/rejected": -1.0232399702072144, - "step": 134 - }, - { - "epoch": 0.48703494926719276, - "grad_norm": 16.30788635535694, - "learning_rate": 3.0475126626597826e-07, - "logits/chosen": -2.8486061096191406, - "logits/rejected": -2.817986011505127, - "logps/chosen": -113.62992095947266, - "logps/rejected": -132.82833862304688, - "loss": 0.6373, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.731618344783783, - "rewards/margins": 0.1986570656299591, - "rewards/rejected": -0.9302754402160645, - "step": 135 - }, - { - "epoch": 0.4906426155580609, - "grad_norm": 15.97612287122739, - "learning_rate": 3.016693531712382e-07, - "logits/chosen": -2.817702531814575, - "logits/rejected": -2.792722702026367, - "logps/chosen": -133.21063232421875, - "logps/rejected": -165.58197021484375, - "loss": 0.6191, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.7569137811660767, - "rewards/margins": 0.34298646450042725, - "rewards/rejected": -1.0999003648757935, - "step": 136 - }, - { - "epoch": 0.494250281848929, - "grad_norm": 16.137454656229274, - "learning_rate": 2.985792152202744e-07, - "logits/chosen": -2.5360145568847656, - "logits/rejected": -2.4910378456115723, - "logps/chosen": -123.48370361328125, - "logps/rejected": -148.6038055419922, - "loss": 0.6334, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.8421890735626221, - "rewards/margins": 0.2367614507675171, - "rewards/rejected": -1.0789506435394287, - "step": 137 - }, - { - "epoch": 0.49785794813979706, - "grad_norm": 15.898855094955541, - "learning_rate": 2.95481344308936e-07, - "logits/chosen": -2.73331618309021, - "logits/rejected": -2.6632349491119385, - "logps/chosen": -170.72422790527344, - "logps/rejected": -196.53546142578125, - "loss": 0.6488, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.0920462608337402, - "rewards/margins": 0.3004887104034424, - "rewards/rejected": -1.3925349712371826, - "step": 138 - }, - { - "epoch": 0.5014656144306652, - "grad_norm": 17.22424304423365, - "learning_rate": 2.9237623356402417e-07, - "logits/chosen": -2.593909502029419, - "logits/rejected": -2.6516079902648926, - "logps/chosen": -136.3877716064453, - "logps/rejected": -172.6061248779297, - "loss": 0.6359, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.857252299785614, - "rewards/margins": 0.3476211130619049, - "rewards/rejected": -1.2048733234405518, - "step": 139 - }, - { - "epoch": 0.5050732807215332, - "grad_norm": 22.789502748606264, - "learning_rate": 2.8926437726479476e-07, - "logits/chosen": -2.636845827102661, - "logits/rejected": -2.605362892150879, - "logps/chosen": -129.49395751953125, - "logps/rejected": -153.09373474121094, - "loss": 0.6374, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.9271912574768066, - "rewards/margins": 0.2521345317363739, - "rewards/rejected": -1.1793256998062134, - "step": 140 - }, - { - "epoch": 0.5086809470124014, - "grad_norm": 20.456149556877257, - "learning_rate": 2.861462707642777e-07, - "logits/chosen": -2.6641764640808105, - "logits/rejected": -2.6901206970214844, - "logps/chosen": -138.69837951660156, - "logps/rejected": -136.23948669433594, - "loss": 0.6367, - "rewards/accuracies": 0.3125, - "rewards/chosen": -1.0070006847381592, - "rewards/margins": -0.06307016313076019, - "rewards/rejected": -0.9439306259155273, - "step": 141 - }, - { - "epoch": 0.5122886133032695, - "grad_norm": 15.925649763802328, - "learning_rate": 2.8302241041042564e-07, - "logits/chosen": -2.853618860244751, - "logits/rejected": -2.8236286640167236, - "logps/chosen": -121.82012176513672, - "logps/rejected": -137.94114685058594, - "loss": 0.6667, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.8957615494728088, - "rewards/margins": 0.13905447721481323, - "rewards/rejected": -1.034816026687622, - "step": 142 - }, - { - "epoch": 0.5158962795941375, - "grad_norm": 19.082414620098294, - "learning_rate": 2.798932934671037e-07, - "logits/chosen": -2.8687548637390137, - "logits/rejected": -2.817361831665039, - "logps/chosen": -163.8360595703125, - "logps/rejected": -178.6797637939453, - "loss": 0.6829, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.0216882228851318, - "rewards/margins": 0.16729450225830078, - "rewards/rejected": -1.1889827251434326, - "step": 143 - }, - { - "epoch": 0.5195039458850056, - "grad_norm": 20.0504031523297, - "learning_rate": 2.767594180349343e-07, - "logits/chosen": -2.709780216217041, - "logits/rejected": -2.675569772720337, - "logps/chosen": -104.50929260253906, - "logps/rejected": -115.60841369628906, - "loss": 0.6569, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.7327451705932617, - "rewards/margins": 0.10645922273397446, - "rewards/rejected": -0.8392043709754944, - "step": 144 - }, - { - "epoch": 0.5231116121758738, - "grad_norm": 14.711077395875332, - "learning_rate": 2.736212829720078e-07, - "logits/chosen": -2.709390163421631, - "logits/rejected": -2.6369526386260986, - "logps/chosen": -128.56617736816406, - "logps/rejected": -132.992431640625, - "loss": 0.6338, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.9590910077095032, - "rewards/margins": 0.059610866010189056, - "rewards/rejected": -1.0187017917633057, - "step": 145 - }, - { - "epoch": 0.5267192784667418, - "grad_norm": 21.43629378734133, - "learning_rate": 2.704793878144731e-07, - "logits/chosen": -2.4441423416137695, - "logits/rejected": -2.386596441268921, - "logps/chosen": -150.14593505859375, - "logps/rejected": -159.36843872070312, - "loss": 0.6433, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.8938876986503601, - "rewards/margins": 0.17310473322868347, - "rewards/rejected": -1.0669924020767212, - "step": 146 - }, - { - "epoch": 0.5303269447576099, - "grad_norm": 27.674921224150413, - "learning_rate": 2.673342326970204e-07, - "logits/chosen": -2.7814602851867676, - "logits/rejected": -2.733147144317627, - "logps/chosen": -120.16597747802734, - "logps/rejected": -122.85480499267578, - "loss": 0.6609, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.8024088740348816, - "rewards/margins": 0.040904901921749115, - "rewards/rejected": -0.8433138132095337, - "step": 147 - }, - { - "epoch": 0.5339346110484781, - "grad_norm": 22.86077453810124, - "learning_rate": 2.641863182732685e-07, - "logits/chosen": -2.552476406097412, - "logits/rejected": -2.5251212120056152, - "logps/chosen": -127.68795776367188, - "logps/rejected": -128.28073120117188, - "loss": 0.6456, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.8695493936538696, - "rewards/margins": 0.016688432544469833, - "rewards/rejected": -0.8862378001213074, - "step": 148 - }, - { - "epoch": 0.5375422773393461, - "grad_norm": 15.757449035863608, - "learning_rate": 2.610361456360693e-07, - "logits/chosen": -2.936483860015869, - "logits/rejected": -2.8791444301605225, - "logps/chosen": -91.83504486083984, - "logps/rejected": -118.82925415039062, - "loss": 0.617, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.5295834541320801, - "rewards/margins": 0.31060242652893066, - "rewards/rejected": -0.8401858806610107, - "step": 149 - }, - { - "epoch": 0.5411499436302142, - "grad_norm": 19.634128294510784, - "learning_rate": 2.5788421623774285e-07, - "logits/chosen": -2.6922764778137207, - "logits/rejected": -2.688413143157959, - "logps/chosen": -169.74029541015625, - "logps/rejected": -193.23045349121094, - "loss": 0.6408, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.0591926574707031, - "rewards/margins": 0.20889857411384583, - "rewards/rejected": -1.2680913209915161, - "step": 150 - }, - { - "epoch": 0.5447576099210824, - "grad_norm": 20.83164592323932, - "learning_rate": 2.5473103181025476e-07, - "logits/chosen": -2.7575480937957764, - "logits/rejected": -2.69429874420166, - "logps/chosen": -120.32164001464844, - "logps/rejected": -146.59634399414062, - "loss": 0.6426, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.7543135285377502, - "rewards/margins": 0.2804763913154602, - "rewards/rejected": -1.034789800643921, - "step": 151 - }, - { - "epoch": 0.5483652762119504, - "grad_norm": 21.115430939613688, - "learning_rate": 2.5157709428534933e-07, - "logits/chosen": -2.792976140975952, - "logits/rejected": -2.751317024230957, - "logps/chosen": -145.10353088378906, - "logps/rejected": -134.7813262939453, - "loss": 0.7016, - "rewards/accuracies": 0.25, - "rewards/chosen": -1.0921827554702759, - "rewards/margins": -0.10177533328533173, - "rewards/rejected": -0.990407407283783, - "step": 152 - }, - { - "epoch": 0.5519729425028185, - "grad_norm": 18.993334517762637, - "learning_rate": 2.4842290571465064e-07, - "logits/chosen": -2.766651153564453, - "logits/rejected": -2.7081239223480225, - "logps/chosen": -131.57028198242188, - "logps/rejected": -174.12741088867188, - "loss": 0.6328, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.9192937016487122, - "rewards/margins": 0.4600033760070801, - "rewards/rejected": -1.3792970180511475, - "step": 153 - }, - { - "epoch": 0.5555806087936865, - "grad_norm": 18.318300753376395, - "learning_rate": 2.452689681897453e-07, - "logits/chosen": -2.5747904777526855, - "logits/rejected": -2.4591739177703857, - "logps/chosen": -159.27610778808594, - "logps/rejected": -170.09893798828125, - "loss": 0.6642, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.0223138332366943, - "rewards/margins": 0.16343066096305847, - "rewards/rejected": -1.1857444047927856, - "step": 154 - }, - { - "epoch": 0.5591882750845547, - "grad_norm": 16.90777215730717, - "learning_rate": 2.4211578376225713e-07, - "logits/chosen": -2.582458972930908, - "logits/rejected": -2.6354870796203613, - "logps/chosen": -132.24697875976562, - "logps/rejected": -165.60267639160156, - "loss": 0.6437, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.8551180362701416, - "rewards/margins": 0.3145350217819214, - "rewards/rejected": -1.1696531772613525, - "step": 155 - }, - { - "epoch": 0.5627959413754228, - "grad_norm": 18.538019662879023, - "learning_rate": 2.389638543639307e-07, - "logits/chosen": -2.658515453338623, - "logits/rejected": -2.651068925857544, - "logps/chosen": -136.00575256347656, - "logps/rejected": -127.08906555175781, - "loss": 0.6557, - "rewards/accuracies": 0.25, - "rewards/chosen": -0.9305965304374695, - "rewards/margins": -0.07156933844089508, - "rewards/rejected": -0.8590272068977356, - "step": 156 - }, - { - "epoch": 0.5664036076662908, - "grad_norm": 16.372357623236674, - "learning_rate": 2.3581368172673148e-07, - "logits/chosen": -2.785402297973633, - "logits/rejected": -2.7543561458587646, - "logps/chosen": -166.28115844726562, - "logps/rejected": -179.12625122070312, - "loss": 0.606, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.2056031227111816, - "rewards/margins": 0.13404881954193115, - "rewards/rejected": -1.3396518230438232, - "step": 157 - }, - { - "epoch": 0.570011273957159, - "grad_norm": 19.24848393063558, - "learning_rate": 2.3266576730297953e-07, - "logits/chosen": -2.8945670127868652, - "logits/rejected": -2.8988423347473145, - "logps/chosen": -134.99729919433594, - "logps/rejected": -138.91622924804688, - "loss": 0.6499, - "rewards/accuracies": 0.25, - "rewards/chosen": -0.8855177164077759, - "rewards/margins": 0.028278954327106476, - "rewards/rejected": -0.9137967228889465, - "step": 158 - }, - { - "epoch": 0.5736189402480271, - "grad_norm": 14.012890168700507, - "learning_rate": 2.295206121855269e-07, - "logits/chosen": -2.74426007270813, - "logits/rejected": -2.7604598999023438, - "logps/chosen": -125.30564880371094, - "logps/rejected": -128.6156463623047, - "loss": 0.6348, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.8884296417236328, - "rewards/margins": 0.0012431517243385315, - "rewards/rejected": -0.8896728157997131, - "step": 159 - }, - { - "epoch": 0.5772266065388951, - "grad_norm": 17.775755544160994, - "learning_rate": 2.2637871702799219e-07, - "logits/chosen": -2.7324259281158447, - "logits/rejected": -2.7266507148742676, - "logps/chosen": -150.2353973388672, - "logps/rejected": -178.04507446289062, - "loss": 0.6525, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.8990907669067383, - "rewards/margins": 0.23634961247444153, - "rewards/rejected": -1.1354403495788574, - "step": 160 - }, - { - "epoch": 0.5808342728297633, - "grad_norm": 15.257142021926509, - "learning_rate": 2.232405819650657e-07, - "logits/chosen": -2.856113910675049, - "logits/rejected": -2.8076114654541016, - "logps/chosen": -110.28036499023438, - "logps/rejected": -127.90596771240234, - "loss": 0.612, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.7447718381881714, - "rewards/margins": 0.19078752398490906, - "rewards/rejected": -0.9355593919754028, - "step": 161 - }, - { - "epoch": 0.5844419391206314, - "grad_norm": 15.891526989983937, - "learning_rate": 2.2010670653289624e-07, - "logits/chosen": -2.648637294769287, - "logits/rejected": -2.609950542449951, - "logps/chosen": -108.57567596435547, - "logps/rejected": -128.6502685546875, - "loss": 0.6401, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.6480116844177246, - "rewards/margins": 0.24210825562477112, - "rewards/rejected": -0.8901200294494629, - "step": 162 - }, - { - "epoch": 0.5880496054114994, - "grad_norm": 21.591300223582838, - "learning_rate": 2.1697758958957447e-07, - "logits/chosen": -2.7000112533569336, - "logits/rejected": -2.663194179534912, - "logps/chosen": -200.93020629882812, - "logps/rejected": -198.53639221191406, - "loss": 0.6942, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.4486756324768066, - "rewards/margins": -0.03041556477546692, - "rewards/rejected": -1.418260097503662, - "step": 163 - }, - { - "epoch": 0.5916572717023675, - "grad_norm": 16.646502851432825, - "learning_rate": 2.138537292357223e-07, - "logits/chosen": -2.543466329574585, - "logits/rejected": -2.5985028743743896, - "logps/chosen": -157.02590942382812, - "logps/rejected": -201.055419921875, - "loss": 0.6205, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.9844037294387817, - "rewards/margins": 0.38901880383491516, - "rewards/rejected": -1.373422622680664, - "step": 164 - }, - { - "epoch": 0.5952649379932357, - "grad_norm": 17.23370314987821, - "learning_rate": 2.1073562273520535e-07, - "logits/chosen": -2.610887289047241, - "logits/rejected": -2.5929317474365234, - "logps/chosen": -102.35009765625, - "logps/rejected": -117.8133544921875, - "loss": 0.6426, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.7128652334213257, - "rewards/margins": 0.14537093043327332, - "rewards/rejected": -0.8582361340522766, - "step": 165 - }, - { - "epoch": 0.5988726042841037, - "grad_norm": 17.116284979185036, - "learning_rate": 2.076237664359758e-07, - "logits/chosen": -2.7106292247772217, - "logits/rejected": -2.718566417694092, - "logps/chosen": -138.78997802734375, - "logps/rejected": -143.2956085205078, - "loss": 0.612, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.9055193662643433, - "rewards/margins": 0.08886927366256714, - "rewards/rejected": -0.9943886399269104, - "step": 166 - }, - { - "epoch": 0.6024802705749718, - "grad_norm": 14.459910177063847, - "learning_rate": 2.0451865569106403e-07, - "logits/chosen": -2.6689453125, - "logits/rejected": -2.6547491550445557, - "logps/chosen": -124.56228637695312, - "logps/rejected": -138.3570556640625, - "loss": 0.631, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.7005306482315063, - "rewards/margins": 0.19185298681259155, - "rewards/rejected": -0.8923837542533875, - "step": 167 - }, - { - "epoch": 0.60608793686584, - "grad_norm": 16.812557037174834, - "learning_rate": 2.0142078477972556e-07, - "logits/chosen": -2.8302969932556152, - "logits/rejected": -2.8322956562042236, - "logps/chosen": -124.77357482910156, - "logps/rejected": -132.1130828857422, - "loss": 0.6668, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.8619922399520874, - "rewards/margins": 0.07053224742412567, - "rewards/rejected": -0.9325243830680847, - "step": 168 - }, - { - "epoch": 0.609695603156708, - "grad_norm": 15.745023679638336, - "learning_rate": 1.9833064682876175e-07, - "logits/chosen": -2.7202577590942383, - "logits/rejected": -2.7093000411987305, - "logps/chosen": -42.6649284362793, - "logps/rejected": -45.50575256347656, - "loss": 0.6199, - "rewards/accuracies": 0.1875, - "rewards/chosen": -0.2206006944179535, - "rewards/margins": 0.03538605198264122, - "rewards/rejected": -0.255986750125885, - "step": 169 - }, - { - "epoch": 0.6133032694475761, - "grad_norm": 14.748417634471243, - "learning_rate": 1.9524873373402175e-07, - "logits/chosen": -2.693784236907959, - "logits/rejected": -2.6833336353302, - "logps/chosen": -177.14149475097656, - "logps/rejected": -215.44683837890625, - "loss": 0.6364, - "rewards/accuracies": 0.4375, - "rewards/chosen": -1.156082272529602, - "rewards/margins": 0.36871713399887085, - "rewards/rejected": -1.5247993469238281, - "step": 170 - }, - { - "epoch": 0.6169109357384442, - "grad_norm": 18.867432006124634, - "learning_rate": 1.9217553608210174e-07, - "logits/chosen": -2.7561516761779785, - "logits/rejected": -2.769016981124878, - "logps/chosen": -118.07001495361328, - "logps/rejected": -142.2090301513672, - "loss": 0.6155, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.5728927254676819, - "rewards/margins": 0.26933857798576355, - "rewards/rejected": -0.8422313332557678, - "step": 171 - }, - { - "epoch": 0.6205186020293123, - "grad_norm": 14.704944547157945, - "learning_rate": 1.8911154307225203e-07, - "logits/chosen": -2.7598936557769775, - "logits/rejected": -2.69793438911438, - "logps/chosen": -159.75970458984375, - "logps/rejected": -171.54270935058594, - "loss": 0.653, - "rewards/accuracies": 0.4375, - "rewards/chosen": -1.0255920886993408, - "rewards/margins": 0.15820060670375824, - "rewards/rejected": -1.1837927103042603, - "step": 172 - }, - { - "epoch": 0.6241262683201804, - "grad_norm": 16.96682619023045, - "learning_rate": 1.8605724243850497e-07, - "logits/chosen": -2.800889015197754, - "logits/rejected": -2.7683513164520264, - "logps/chosen": -136.80233764648438, - "logps/rejected": -148.33009338378906, - "loss": 0.6618, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.7636388540267944, - "rewards/margins": 0.15408092737197876, - "rewards/rejected": -0.9177197217941284, - "step": 173 - }, - { - "epoch": 0.6277339346110484, - "grad_norm": 16.030630528718454, - "learning_rate": 1.8301312037203676e-07, - "logits/chosen": -2.6804215908050537, - "logits/rejected": -2.673224925994873, - "logps/chosen": -162.06027221679688, - "logps/rejected": -176.3635711669922, - "loss": 0.6389, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.1072931289672852, - "rewards/margins": 0.1627993881702423, - "rewards/rejected": -1.2700927257537842, - "step": 174 - }, - { - "epoch": 0.6313416009019166, - "grad_norm": 19.218957932347394, - "learning_rate": 1.7997966144377326e-07, - "logits/chosen": -2.629088878631592, - "logits/rejected": -2.6365842819213867, - "logps/chosen": -117.45922088623047, - "logps/rejected": -135.8382110595703, - "loss": 0.631, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.7468096613883972, - "rewards/margins": 0.2035558521747589, - "rewards/rejected": -0.9503654837608337, - "step": 175 - }, - { - "epoch": 0.6349492671927847, - "grad_norm": 18.619219772237013, - "learning_rate": 1.7695734852725618e-07, - "logits/chosen": -2.5348024368286133, - "logits/rejected": -2.5094666481018066, - "logps/chosen": -161.67816162109375, - "logps/rejected": -179.298583984375, - "loss": 0.6368, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.8872625231742859, - "rewards/margins": 0.22884149849414825, - "rewards/rejected": -1.116104006767273, - "step": 176 - }, - { - "epoch": 0.6385569334836527, - "grad_norm": 20.502583173520957, - "learning_rate": 1.739466627217767e-07, - "logits/chosen": -2.6239590644836426, - "logits/rejected": -2.582123279571533, - "logps/chosen": -161.62527465820312, - "logps/rejected": -154.91091918945312, - "loss": 0.6463, - "rewards/accuracies": 0.375, - "rewards/chosen": -1.0002014636993408, - "rewards/margins": -0.0447964072227478, - "rewards/rejected": -0.955405056476593, - "step": 177 - }, - { - "epoch": 0.6421645997745209, - "grad_norm": 16.533244963827425, - "learning_rate": 1.70948083275794e-07, - "logits/chosen": -2.5065081119537354, - "logits/rejected": -2.548151969909668, - "logps/chosen": -144.95687866210938, - "logps/rejected": -175.65626525878906, - "loss": 0.6337, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.9515472054481506, - "rewards/margins": 0.31802898645401, - "rewards/rejected": -1.2695761919021606, - "step": 178 - }, - { - "epoch": 0.645772266065389, - "grad_norm": 19.66556000147453, - "learning_rate": 1.6796208751064665e-07, - "logits/chosen": -2.5851457118988037, - "logits/rejected": -2.573211908340454, - "logps/chosen": -156.71408081054688, - "logps/rejected": -165.16729736328125, - "loss": 0.6571, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.9640285968780518, - "rewards/margins": 0.12776385247707367, - "rewards/rejected": -1.0917924642562866, - "step": 179 - }, - { - "epoch": 0.649379932356257, - "grad_norm": 17.95397127731106, - "learning_rate": 1.6498915074457156e-07, - "logits/chosen": -2.607480764389038, - "logits/rejected": -2.626681327819824, - "logps/chosen": -100.76728820800781, - "logps/rejected": -130.57501220703125, - "loss": 0.632, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.5932424664497375, - "rewards/margins": 0.2794119417667389, - "rewards/rejected": -0.8726544380187988, - "step": 180 - }, - { - "epoch": 0.6529875986471252, - "grad_norm": 14.91981410932362, - "learning_rate": 1.6202974621704174e-07, - "logits/chosen": -2.6122026443481445, - "logits/rejected": -2.5755481719970703, - "logps/chosen": -110.2283935546875, - "logps/rejected": -103.84386444091797, - "loss": 0.6415, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.6347527503967285, - "rewards/margins": -0.03415530547499657, - "rewards/rejected": -0.6005973815917969, - "step": 181 - }, - { - "epoch": 0.6565952649379933, - "grad_norm": 16.613685474158963, - "learning_rate": 1.590843450134347e-07, - "logits/chosen": -2.4995620250701904, - "logits/rejected": -2.41062593460083, - "logps/chosen": -167.216064453125, - "logps/rejected": -153.84239196777344, - "loss": 0.6335, - "rewards/accuracies": 0.3125, - "rewards/chosen": -1.1623873710632324, - "rewards/margins": -0.10739407688379288, - "rewards/rejected": -1.0549933910369873, - "step": 182 - }, - { - "epoch": 0.6602029312288613, - "grad_norm": 14.10351328216283, - "learning_rate": 1.5615341599004406e-07, - "logits/chosen": -2.621835231781006, - "logits/rejected": -2.604492425918579, - "logps/chosen": -166.58204650878906, - "logps/rejected": -190.07391357421875, - "loss": 0.6412, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.1296504735946655, - "rewards/margins": 0.2191045582294464, - "rewards/rejected": -1.348755121231079, - "step": 183 - }, - { - "epoch": 0.6638105975197294, - "grad_norm": 17.43519872387487, - "learning_rate": 1.532374256994457e-07, - "logits/chosen": -2.6489250659942627, - "logits/rejected": -2.602325439453125, - "logps/chosen": -141.79489135742188, - "logps/rejected": -144.03334045410156, - "loss": 0.6695, - "rewards/accuracies": 0.1875, - "rewards/chosen": -1.0018857717514038, - "rewards/margins": -0.0007977448403835297, - "rewards/rejected": -1.00108802318573, - "step": 184 - }, - { - "epoch": 0.6674182638105975, - "grad_norm": 14.699144802164259, - "learning_rate": 1.5033683831623112e-07, - "logits/chosen": -2.6636416912078857, - "logits/rejected": -2.6072072982788086, - "logps/chosen": -152.4785614013672, - "logps/rejected": -162.30181884765625, - "loss": 0.595, - "rewards/accuracies": 0.4375, - "rewards/chosen": -1.0577099323272705, - "rewards/margins": 0.10988734662532806, - "rewards/rejected": -1.1675972938537598, - "step": 185 - }, - { - "epoch": 0.6710259301014656, - "grad_norm": 18.30677848015039, - "learning_rate": 1.474521155631182e-07, - "logits/chosen": -2.7483725547790527, - "logits/rejected": -2.748386859893799, - "logps/chosen": -160.15005493164062, - "logps/rejected": -161.2864227294922, - "loss": 0.6469, - "rewards/accuracies": 0.3125, - "rewards/chosen": -1.039442539215088, - "rewards/margins": 0.013212975114583969, - "rewards/rejected": -1.0526554584503174, - "step": 186 - }, - { - "epoch": 0.6746335963923337, - "grad_norm": 19.49798242848129, - "learning_rate": 1.44583716637454e-07, - "logits/chosen": -2.750894069671631, - "logits/rejected": -2.6982154846191406, - "logps/chosen": -116.81553649902344, - "logps/rejected": -141.81300354003906, - "loss": 0.6535, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.6832581758499146, - "rewards/margins": 0.27629202604293823, - "rewards/rejected": -0.959550142288208, - "step": 187 - }, - { - "epoch": 0.6782412626832018, - "grad_norm": 18.530071298153135, - "learning_rate": 1.4173209813811788e-07, - "logits/chosen": -2.6951780319213867, - "logits/rejected": -2.6223602294921875, - "logps/chosen": -177.3026123046875, - "logps/rejected": -198.16806030273438, - "loss": 0.6425, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.2028027772903442, - "rewards/margins": 0.2099728137254715, - "rewards/rejected": -1.4127756357192993, - "step": 188 - }, - { - "epoch": 0.6818489289740699, - "grad_norm": 16.14866291471168, - "learning_rate": 1.38897713992839e-07, - "logits/chosen": -2.649348258972168, - "logits/rejected": -2.6460623741149902, - "logps/chosen": -122.51019287109375, - "logps/rejected": -137.5120849609375, - "loss": 0.6321, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.7333808541297913, - "rewards/margins": 0.10139094293117523, - "rewards/rejected": -0.8347718119621277, - "step": 189 - }, - { - "epoch": 0.685456595264938, - "grad_norm": 16.218725356471335, - "learning_rate": 1.3608101538593964e-07, - "logits/chosen": -2.7609314918518066, - "logits/rejected": -2.739086389541626, - "logps/chosen": -118.09423828125, - "logps/rejected": -130.329833984375, - "loss": 0.6582, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.8001685738563538, - "rewards/margins": 0.08103892207145691, - "rewards/rejected": -0.8812074661254883, - "step": 190 - }, - { - "epoch": 0.6890642615558061, - "grad_norm": 17.444458875032904, - "learning_rate": 1.332824506865135e-07, - "logits/chosen": -2.5050034523010254, - "logits/rejected": -2.4796395301818848, - "logps/chosen": -81.19308471679688, - "logps/rejected": -113.7338638305664, - "loss": 0.6303, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.515100359916687, - "rewards/margins": 0.3201243281364441, - "rewards/rejected": -0.8352246284484863, - "step": 191 - }, - { - "epoch": 0.6926719278466742, - "grad_norm": 21.980670974545852, - "learning_rate": 1.3050246537705423e-07, - "logits/chosen": -2.709773302078247, - "logits/rejected": -2.7297189235687256, - "logps/chosen": -113.28019714355469, - "logps/rejected": -121.93257141113281, - "loss": 0.6444, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.7695672512054443, - "rewards/margins": 0.04824068397283554, - "rewards/rejected": -0.8178079724311829, - "step": 192 - }, - { - "epoch": 0.6962795941375423, - "grad_norm": 14.086726282530078, - "learning_rate": 1.277415019825417e-07, - "logits/chosen": -2.631265878677368, - "logits/rejected": -2.575739622116089, - "logps/chosen": -125.69048309326172, - "logps/rejected": -146.4845428466797, - "loss": 0.6252, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.7491203546524048, - "rewards/margins": 0.20852434635162354, - "rewards/rejected": -0.9576447606086731, - "step": 193 - }, - { - "epoch": 0.6998872604284103, - "grad_norm": 18.291229132407672, - "learning_rate": 1.2500000000000005e-07, - "logits/chosen": -2.688297748565674, - "logits/rejected": -2.669114828109741, - "logps/chosen": -211.00732421875, - "logps/rejected": -209.12322998046875, - "loss": 0.6664, - "rewards/accuracies": 0.3125, - "rewards/chosen": -1.2566479444503784, - "rewards/margins": -0.033594757318496704, - "rewards/rejected": -1.223053216934204, - "step": 194 - }, - { - "epoch": 0.7034949267192785, - "grad_norm": 16.764116725877088, - "learning_rate": 1.2227839582853736e-07, - "logits/chosen": -2.751734972000122, - "logits/rejected": -2.7538325786590576, - "logps/chosen": -143.16390991210938, - "logps/rejected": -158.14288330078125, - "loss": 0.6393, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.9713834524154663, - "rewards/margins": 0.13335493206977844, - "rewards/rejected": -1.104738473892212, - "step": 195 - }, - { - "epoch": 0.7071025930101466, - "grad_norm": 16.375730614897645, - "learning_rate": 1.1957712269987887e-07, - "logits/chosen": -2.6568965911865234, - "logits/rejected": -2.6080009937286377, - "logps/chosen": -104.1981430053711, - "logps/rejected": -132.65667724609375, - "loss": 0.6256, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.6839835047721863, - "rewards/margins": 0.2929723262786865, - "rewards/rejected": -0.9769558310508728, - "step": 196 - }, - { - "epoch": 0.7107102593010146, - "grad_norm": 19.373193635262318, - "learning_rate": 1.1689661060940364e-07, - "logits/chosen": -2.569329261779785, - "logits/rejected": -2.517493724822998, - "logps/chosen": -184.40109252929688, - "logps/rejected": -214.02529907226562, - "loss": 0.6608, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.1185375452041626, - "rewards/margins": 0.3387353718280792, - "rewards/rejected": -1.457273006439209, - "step": 197 - }, - { - "epoch": 0.7143179255918828, - "grad_norm": 20.209863962145814, - "learning_rate": 1.1423728624769693e-07, - "logits/chosen": -2.8044545650482178, - "logits/rejected": -2.805450916290283, - "logps/chosen": -196.05953979492188, - "logps/rejected": -184.72727966308594, - "loss": 0.654, - "rewards/accuracies": 0.25, - "rewards/chosen": -1.3288071155548096, - "rewards/margins": -0.11929549276828766, - "rewards/rejected": -1.209511637687683, - "step": 198 - }, - { - "epoch": 0.7179255918827508, - "grad_norm": 15.494808937210042, - "learning_rate": 1.1159957293262887e-07, - "logits/chosen": -2.8339953422546387, - "logits/rejected": -2.703178882598877, - "logps/chosen": -163.5222930908203, - "logps/rejected": -216.82159423828125, - "loss": 0.6078, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.9383840560913086, - "rewards/margins": 0.5548679232597351, - "rewards/rejected": -1.4932520389556885, - "step": 199 - }, - { - "epoch": 0.7215332581736189, - "grad_norm": 23.81561610370924, - "learning_rate": 1.089838905419691e-07, - "logits/chosen": -2.550008535385132, - "logits/rejected": -2.5492167472839355, - "logps/chosen": -98.1673355102539, - "logps/rejected": -111.06562805175781, - "loss": 0.6396, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.5753649473190308, - "rewards/margins": 0.11871910840272903, - "rewards/rejected": -0.6940840482711792, - "step": 200 - }, - { - "epoch": 0.7251409244644871, - "grad_norm": 18.125603194316525, - "learning_rate": 1.0639065544655033e-07, - "logits/chosen": -2.7185845375061035, - "logits/rejected": -2.732865810394287, - "logps/chosen": -155.1719512939453, - "logps/rejected": -184.75363159179688, - "loss": 0.6255, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.913672924041748, - "rewards/margins": 0.2742753028869629, - "rewards/rejected": -1.187948226928711, - "step": 201 - }, - { - "epoch": 0.7287485907553551, - "grad_norm": 16.35458157572819, - "learning_rate": 1.0382028044398822e-07, - "logits/chosen": -2.6049351692199707, - "logits/rejected": -2.4998717308044434, - "logps/chosen": -131.6762237548828, - "logps/rejected": -168.1844024658203, - "loss": 0.6238, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.8637382984161377, - "rewards/margins": 0.3832642436027527, - "rewards/rejected": -1.2470024824142456, - "step": 202 - }, - { - "epoch": 0.7323562570462232, - "grad_norm": 22.55968095781708, - "learning_rate": 1.0127317469297275e-07, - "logits/chosen": -2.64062762260437, - "logits/rejected": -2.626605987548828, - "logps/chosen": -204.2333526611328, - "logps/rejected": -211.51834106445312, - "loss": 0.6623, - "rewards/accuracies": 0.4375, - "rewards/chosen": -1.457507610321045, - "rewards/margins": 0.10570463538169861, - "rewards/rejected": -1.5632121562957764, - "step": 203 - }, - { - "epoch": 0.7359639233370913, - "grad_norm": 22.066580541452367, - "learning_rate": 9.874974364813593e-08, - "logits/chosen": -2.5739970207214355, - "logits/rejected": -2.56649112701416, - "logps/chosen": -176.37611389160156, - "logps/rejected": -194.35948181152344, - "loss": 0.6062, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2450966835021973, - "rewards/margins": 0.21306829154491425, - "rewards/rejected": -1.458164930343628, - "step": 204 - }, - { - "epoch": 0.7395715896279594, - "grad_norm": 17.813935130489558, - "learning_rate": 9.625038899551161e-08, - "logits/chosen": -2.5886383056640625, - "logits/rejected": -2.6257102489471436, - "logps/chosen": -173.45433044433594, - "logps/rejected": -205.86611938476562, - "loss": 0.6055, - "rewards/accuracies": 0.4375, - "rewards/chosen": -1.0949602127075195, - "rewards/margins": 0.24069912731647491, - "rewards/rejected": -1.3356592655181885, - "step": 205 - }, - { - "epoch": 0.7431792559188275, - "grad_norm": 19.070960742939054, - "learning_rate": 9.377550858859324e-08, - "logits/chosen": -2.8416402339935303, - "logits/rejected": -2.855118751525879, - "logps/chosen": -140.96128845214844, - "logps/rejected": -173.95872497558594, - "loss": 0.6047, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.955522894859314, - "rewards/margins": 0.33456897735595703, - "rewards/rejected": -1.2900919914245605, - "step": 206 - }, - { - "epoch": 0.7467869222096956, - "grad_norm": 17.504813327349456, - "learning_rate": 9.132549638500305e-08, - "logits/chosen": -2.6112301349639893, - "logits/rejected": -2.5324132442474365, - "logps/chosen": -115.29826354980469, - "logps/rejected": -130.9242401123047, - "loss": 0.6134, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.8085027933120728, - "rewards/margins": 0.18405520915985107, - "rewards/rejected": -0.9925580620765686, - "step": 207 - }, - { - "epoch": 0.7503945885005637, - "grad_norm": 25.033664433069895, - "learning_rate": 8.890074238378073e-08, - "logits/chosen": -2.6047165393829346, - "logits/rejected": -2.584367036819458, - "logps/chosen": -149.3617401123047, - "logps/rejected": -173.37771606445312, - "loss": 0.6565, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.9205259084701538, - "rewards/margins": 0.21372103691101074, - "rewards/rejected": -1.1342469453811646, - "step": 208 - }, - { - "epoch": 0.7540022547914318, - "grad_norm": 20.14222000285242, - "learning_rate": 8.65016325633027e-08, - "logits/chosen": -2.7320146560668945, - "logits/rejected": -2.6900644302368164, - "logps/chosen": -115.91548919677734, - "logps/rejected": -139.6302032470703, - "loss": 0.6304, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.7225445508956909, - "rewards/margins": 0.22064021229743958, - "rewards/rejected": -0.9431847929954529, - "step": 209 - }, - { - "epoch": 0.7576099210822999, - "grad_norm": 21.524766469753484, - "learning_rate": 8.412854881984056e-08, - "logits/chosen": -2.76995849609375, - "logits/rejected": -2.7241146564483643, - "logps/chosen": -101.65496063232422, - "logps/rejected": -127.55489349365234, - "loss": 0.6333, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.5967918634414673, - "rewards/margins": 0.2909960150718689, - "rewards/rejected": -0.8877878785133362, - "step": 210 - }, - { - "epoch": 0.761217587373168, - "grad_norm": 22.54441120529105, - "learning_rate": 8.178186890677027e-08, - "logits/chosen": -2.4587697982788086, - "logits/rejected": -2.4420981407165527, - "logps/chosen": -143.93771362304688, - "logps/rejected": -160.05174255371094, - "loss": 0.6396, - "rewards/accuracies": 0.375, - "rewards/chosen": -1.0443251132965088, - "rewards/margins": 0.15341287851333618, - "rewards/rejected": -1.1977380514144897, - "step": 211 - }, - { - "epoch": 0.7648252536640361, - "grad_norm": 23.394591255980846, - "learning_rate": 7.946196637444059e-08, - "logits/chosen": -2.868518352508545, - "logits/rejected": -2.829932689666748, - "logps/chosen": -188.8074951171875, - "logps/rejected": -203.76547241210938, - "loss": 0.5937, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.255654215812683, - "rewards/margins": 0.17242862284183502, - "rewards/rejected": -1.428082823753357, - "step": 212 - }, - { - "epoch": 0.7684329199549041, - "grad_norm": 24.244329717907355, - "learning_rate": 7.71692105107098e-08, - "logits/chosen": -2.5620954036712646, - "logits/rejected": -2.399686336517334, - "logps/chosen": -242.2209930419922, - "logps/rejected": -242.9703826904297, - "loss": 0.6491, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.6907013654708862, - "rewards/margins": 0.1445438414812088, - "rewards/rejected": -1.8352453708648682, - "step": 213 - }, - { - "epoch": 0.7720405862457723, - "grad_norm": 25.14734916648313, - "learning_rate": 7.490396628216236e-08, - "logits/chosen": -2.884876251220703, - "logits/rejected": -2.8430535793304443, - "logps/chosen": -114.62889862060547, - "logps/rejected": -127.26203918457031, - "loss": 0.627, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.680679976940155, - "rewards/margins": 0.1935320496559143, - "rewards/rejected": -0.8742119669914246, - "step": 214 - }, - { - "epoch": 0.7756482525366404, - "grad_norm": 34.73519323487288, - "learning_rate": 7.266659427601169e-08, - "logits/chosen": -2.6338040828704834, - "logits/rejected": -2.603044033050537, - "logps/chosen": -114.8099365234375, - "logps/rejected": -130.67709350585938, - "loss": 0.6364, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.7538653016090393, - "rewards/margins": 0.14267241954803467, - "rewards/rejected": -0.8965376615524292, - "step": 215 - }, - { - "epoch": 0.7792559188275084, - "grad_norm": 19.239379540435802, - "learning_rate": 7.045745064270186e-08, - "logits/chosen": -2.705063819885254, - "logits/rejected": -2.623157024383545, - "logps/chosen": -223.34844970703125, - "logps/rejected": -191.0408172607422, - "loss": 0.6123, - "rewards/accuracies": 0.3125, - "rewards/chosen": -1.5766035318374634, - "rewards/margins": -0.2664285898208618, - "rewards/rejected": -1.3101749420166016, - "step": 216 - }, - { - "epoch": 0.7828635851183765, - "grad_norm": 20.86519434536946, - "learning_rate": 6.827688703921405e-08, - "logits/chosen": -2.7665932178497314, - "logits/rejected": -2.757221221923828, - "logps/chosen": -173.21044921875, - "logps/rejected": -164.48175048828125, - "loss": 0.6443, - "rewards/accuracies": 0.1875, - "rewards/chosen": -1.13240647315979, - "rewards/margins": -0.054416049271821976, - "rewards/rejected": -1.0779904127120972, - "step": 217 - }, - { - "epoch": 0.7864712514092447, - "grad_norm": 25.54824467709053, - "learning_rate": 6.612525057308948e-08, - "logits/chosen": -2.6467349529266357, - "logits/rejected": -2.6698858737945557, - "logps/chosen": -159.49598693847656, - "logps/rejected": -165.4706268310547, - "loss": 0.6349, - "rewards/accuracies": 0.375, - "rewards/chosen": -1.1105802059173584, - "rewards/margins": 0.03332412242889404, - "rewards/rejected": -1.1439043283462524, - "step": 218 - }, - { - "epoch": 0.7900789177001127, - "grad_norm": 24.965873693771254, - "learning_rate": 6.400288374717533e-08, - "logits/chosen": -2.6835930347442627, - "logits/rejected": -2.636648654937744, - "logps/chosen": -190.44488525390625, - "logps/rejected": -217.7324676513672, - "loss": 0.613, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2087502479553223, - "rewards/margins": 0.31329116225242615, - "rewards/rejected": -1.5220413208007812, - "step": 219 - }, - { - "epoch": 0.7936865839909808, - "grad_norm": 18.92609703396816, - "learning_rate": 6.191012440510468e-08, - "logits/chosen": -2.4531712532043457, - "logits/rejected": -2.437631607055664, - "logps/chosen": -93.95329284667969, - "logps/rejected": -105.57788848876953, - "loss": 0.6127, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.5820232033729553, - "rewards/margins": 0.14143766462802887, - "rewards/rejected": -0.7234609127044678, - "step": 220 - }, - { - "epoch": 0.797294250281849, - "grad_norm": 19.39595153425221, - "learning_rate": 5.984730567751786e-08, - "logits/chosen": -2.817124128341675, - "logits/rejected": -2.7696752548217773, - "logps/chosen": -144.85052490234375, - "logps/rejected": -164.24566650390625, - "loss": 0.6078, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.8248751759529114, - "rewards/margins": 0.19056878983974457, - "rewards/rejected": -1.015444040298462, - "step": 221 - }, - { - "epoch": 0.800901916572717, - "grad_norm": 29.03642350629373, - "learning_rate": 5.781475592903337e-08, - "logits/chosen": -2.597087860107422, - "logits/rejected": -2.5627245903015137, - "logps/chosen": -95.23512268066406, - "logps/rejected": -116.90785217285156, - "loss": 0.6383, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.6514754295349121, - "rewards/margins": 0.18482735753059387, - "rewards/rejected": -0.8363026976585388, - "step": 222 - }, - { - "epoch": 0.8045095828635851, - "grad_norm": 27.415759162402487, - "learning_rate": 5.581279870597866e-08, - "logits/chosen": -2.7290802001953125, - "logits/rejected": -2.7410173416137695, - "logps/chosen": -108.07835388183594, - "logps/rejected": -132.58139038085938, - "loss": 0.6388, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.6878082156181335, - "rewards/margins": 0.18555019795894623, - "rewards/rejected": -0.8733583688735962, - "step": 223 - }, - { - "epoch": 0.8081172491544533, - "grad_norm": 22.02918123491063, - "learning_rate": 5.384175268488639e-08, - "logits/chosen": -2.5859334468841553, - "logits/rejected": -2.6506881713867188, - "logps/chosen": -184.10287475585938, - "logps/rejected": -199.52178955078125, - "loss": 0.6465, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.1006207466125488, - "rewards/margins": 0.11409949511289597, - "rewards/rejected": -1.2147202491760254, - "step": 224 - }, - { - "epoch": 0.8117249154453213, - "grad_norm": 23.585365761496814, - "learning_rate": 5.190193162176754e-08, - "logits/chosen": -2.6538848876953125, - "logits/rejected": -2.563683032989502, - "logps/chosen": -134.6385498046875, - "logps/rejected": -148.71571350097656, - "loss": 0.6383, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.766261875629425, - "rewards/margins": 0.15823136270046234, - "rewards/rejected": -0.9244933128356934, - "step": 225 - }, - { - "epoch": 0.8153325817361894, - "grad_norm": 17.21487770087475, - "learning_rate": 4.9993644302166374e-08, - "logits/chosen": -2.713024139404297, - "logits/rejected": -2.6652426719665527, - "logps/chosen": -91.6714096069336, - "logps/rejected": -97.30072784423828, - "loss": 0.6649, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.5152935981750488, - "rewards/margins": 0.05684986710548401, - "rewards/rejected": -0.5721434354782104, - "step": 226 - }, - { - "epoch": 0.8189402480270574, - "grad_norm": 19.881452560669455, - "learning_rate": 4.811719449200755e-08, - "logits/chosen": -2.5407063961029053, - "logits/rejected": -2.542264461517334, - "logps/chosen": -123.28820037841797, - "logps/rejected": -155.67604064941406, - "loss": 0.5878, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.811669647693634, - "rewards/margins": 0.32552799582481384, - "rewards/rejected": -1.137197732925415, - "step": 227 - }, - { - "epoch": 0.8225479143179256, - "grad_norm": 20.32296951543072, - "learning_rate": 4.6272880889241554e-08, - "logits/chosen": -2.538395881652832, - "logits/rejected": -2.5627975463867188, - "logps/chosen": -152.72386169433594, - "logps/rejected": -205.58055114746094, - "loss": 0.6115, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.0139594078063965, - "rewards/margins": 0.5598934888839722, - "rewards/rejected": -1.5738528966903687, - "step": 228 - }, - { - "epoch": 0.8261555806087937, - "grad_norm": 22.023214134054516, - "learning_rate": 4.44609970762975e-08, - "logits/chosen": -2.5238399505615234, - "logits/rejected": -2.4400908946990967, - "logps/chosen": -178.64308166503906, - "logps/rejected": -211.25213623046875, - "loss": 0.5885, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.994665265083313, - "rewards/margins": 0.3249795734882355, - "rewards/rejected": -1.319644808769226, - "step": 229 - }, - { - "epoch": 0.8297632468996617, - "grad_norm": 18.876829580363623, - "learning_rate": 4.268183147334989e-08, - "logits/chosen": -2.686662197113037, - "logits/rejected": -2.6410155296325684, - "logps/chosen": -171.8336944580078, - "logps/rejected": -187.07118225097656, - "loss": 0.6378, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.9352148771286011, - "rewards/margins": 0.21722210943698883, - "rewards/rejected": -1.1524369716644287, - "step": 230 - }, - { - "epoch": 0.8333709131905299, - "grad_norm": 24.27323371929423, - "learning_rate": 4.0935667292407076e-08, - "logits/chosen": -2.7488949298858643, - "logits/rejected": -2.705801010131836, - "logps/chosen": -123.69054412841797, - "logps/rejected": -151.88018798828125, - "loss": 0.5818, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.6984049677848816, - "rewards/margins": 0.3373234272003174, - "rewards/rejected": -1.0357283353805542, - "step": 231 - }, - { - "epoch": 0.836978579481398, - "grad_norm": 17.37050695259515, - "learning_rate": 3.9222782492228934e-08, - "logits/chosen": -2.834803819656372, - "logits/rejected": -2.8067216873168945, - "logps/chosen": -75.37928009033203, - "logps/rejected": -72.15511322021484, - "loss": 0.6433, - "rewards/accuracies": 0.1875, - "rewards/chosen": -0.4528418779373169, - "rewards/margins": -0.010631322860717773, - "rewards/rejected": -0.4422105848789215, - "step": 232 - }, - { - "epoch": 0.840586245772266, - "grad_norm": 19.540990616730227, - "learning_rate": 3.7543449734080635e-08, - "logits/chosen": -2.7087433338165283, - "logits/rejected": -2.6912291049957275, - "logps/chosen": -137.85569763183594, - "logps/rejected": -165.02366638183594, - "loss": 0.622, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.8620815873146057, - "rewards/margins": 0.2977851331233978, - "rewards/rejected": -1.1598666906356812, - "step": 233 - }, - { - "epoch": 0.8441939120631342, - "grad_norm": 22.674777823371233, - "learning_rate": 3.5897936338329994e-08, - "logits/chosen": -2.6290061473846436, - "logits/rejected": -2.6286020278930664, - "logps/chosen": -126.79898071289062, - "logps/rejected": -151.92877197265625, - "loss": 0.6005, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.8524315357208252, - "rewards/margins": 0.18302160501480103, - "rewards/rejected": -1.0354530811309814, - "step": 234 - }, - { - "epoch": 0.8478015783540023, - "grad_norm": 21.010230513873392, - "learning_rate": 3.4286504241894275e-08, - "logits/chosen": -2.6801297664642334, - "logits/rejected": -2.6489310264587402, - "logps/chosen": -154.31802368164062, - "logps/rejected": -186.81097412109375, - "loss": 0.6428, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.9396072626113892, - "rewards/margins": 0.30663400888442993, - "rewards/rejected": -1.2462412118911743, - "step": 235 - }, - { - "epoch": 0.8514092446448703, - "grad_norm": 20.148817142272478, - "learning_rate": 3.2709409956545056e-08, - "logits/chosen": -2.6412787437438965, - "logits/rejected": -2.6584792137145996, - "logps/chosen": -150.8040313720703, - "logps/rejected": -167.98403930664062, - "loss": 0.5901, - "rewards/accuracies": 0.4375, - "rewards/chosen": -1.0689404010772705, - "rewards/margins": 0.16337279975414276, - "rewards/rejected": -1.2323131561279297, - "step": 236 - }, - { - "epoch": 0.8550169109357384, - "grad_norm": 23.590032400439114, - "learning_rate": 3.1166904528075246e-08, - "logits/chosen": -2.669891834259033, - "logits/rejected": -2.630709409713745, - "logps/chosen": -118.471923828125, - "logps/rejected": -132.73631286621094, - "loss": 0.6188, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.806098222732544, - "rewards/margins": 0.15198391675949097, - "rewards/rejected": -0.9580821394920349, - "step": 237 - }, - { - "epoch": 0.8586245772266066, - "grad_norm": 20.41992515079147, - "learning_rate": 2.965923349633778e-08, - "logits/chosen": -2.947301149368286, - "logits/rejected": -2.938220262527466, - "logps/chosen": -84.75267791748047, - "logps/rejected": -101.88093566894531, - "loss": 0.6011, - "rewards/accuracies": 0.25, - "rewards/chosen": -0.548446774482727, - "rewards/margins": 0.14881302416324615, - "rewards/rejected": -0.6972598433494568, - "step": 238 - }, - { - "epoch": 0.8622322435174746, - "grad_norm": 21.845076901234233, - "learning_rate": 2.81866368561596e-08, - "logits/chosen": -2.65000319480896, - "logits/rejected": -2.6162397861480713, - "logps/chosen": -169.4011688232422, - "logps/rejected": -186.7139129638672, - "loss": 0.6505, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.1040401458740234, - "rewards/margins": 0.20935505628585815, - "rewards/rejected": -1.313395380973816, - "step": 239 - }, - { - "epoch": 0.8658399098083427, - "grad_norm": 18.984893937694327, - "learning_rate": 2.6749349019138583e-08, - "logits/chosen": -2.798875093460083, - "logits/rejected": -2.799243450164795, - "logps/chosen": -100.33058166503906, - "logps/rejected": -98.19447326660156, - "loss": 0.6319, - "rewards/accuracies": 0.1875, - "rewards/chosen": -0.6747909188270569, - "rewards/margins": -0.03361566737294197, - "rewards/rejected": -0.6411752700805664, - "step": 240 - }, - { - "epoch": 0.8694475760992109, - "grad_norm": 22.571089057976877, - "learning_rate": 2.53475987763295e-08, - "logits/chosen": -2.685279369354248, - "logits/rejected": -2.6782948970794678, - "logps/chosen": -168.28924560546875, - "logps/rejected": -178.58523559570312, - "loss": 0.6158, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.089237093925476, - "rewards/margins": 0.11458201706409454, - "rewards/rejected": -1.2038190364837646, - "step": 241 - }, - { - "epoch": 0.8730552423900789, - "grad_norm": 21.77782282020191, - "learning_rate": 2.3981609261824383e-08, - "logits/chosen": -2.654698610305786, - "logits/rejected": -2.6075949668884277, - "logps/chosen": -129.93067932128906, - "logps/rejected": -157.78756713867188, - "loss": 0.6102, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.7871225476264954, - "rewards/margins": 0.2907417416572571, - "rewards/rejected": -1.077864408493042, - "step": 242 - }, - { - "epoch": 0.876662908680947, - "grad_norm": 19.7945166469101, - "learning_rate": 2.2651597917233728e-08, - "logits/chosen": -2.6807045936584473, - "logits/rejected": -2.6718242168426514, - "logps/chosen": -137.94711303710938, - "logps/rejected": -164.97425842285156, - "loss": 0.6138, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.7404623031616211, - "rewards/margins": 0.24454745650291443, - "rewards/rejected": -0.9850097894668579, - "step": 243 - }, - { - "epoch": 0.8802705749718152, - "grad_norm": 17.077718697447573, - "learning_rate": 2.135777645707318e-08, - "logits/chosen": -2.539994716644287, - "logits/rejected": -2.5535757541656494, - "logps/chosen": -123.84488677978516, - "logps/rejected": -148.84173583984375, - "loss": 0.6394, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.8526497483253479, - "rewards/margins": 0.21240833401679993, - "rewards/rejected": -1.0650579929351807, - "step": 244 - }, - { - "epoch": 0.8838782412626832, - "grad_norm": 21.915856547028383, - "learning_rate": 2.01003508350627e-08, - "logits/chosen": -2.743135452270508, - "logits/rejected": -2.7024784088134766, - "logps/chosen": -144.93258666992188, - "logps/rejected": -186.56028747558594, - "loss": 0.6189, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.8176748752593994, - "rewards/margins": 0.4297330975532532, - "rewards/rejected": -1.2474079132080078, - "step": 245 - }, - { - "epoch": 0.8874859075535513, - "grad_norm": 31.594914335616036, - "learning_rate": 1.8879521211341954e-08, - "logits/chosen": -2.7780113220214844, - "logits/rejected": -2.754880428314209, - "logps/chosen": -109.37733459472656, - "logps/rejected": -117.86366271972656, - "loss": 0.6458, - "rewards/accuracies": 0.1875, - "rewards/chosen": -0.5510645508766174, - "rewards/margins": 0.06718574464321136, - "rewards/rejected": -0.61825031042099, - "step": 246 - }, - { - "epoch": 0.8910935738444193, - "grad_norm": 23.070206549747876, - "learning_rate": 1.7695481920608712e-08, - "logits/chosen": -2.7138006687164307, - "logits/rejected": -2.64084792137146, - "logps/chosen": -132.44696044921875, - "logps/rejected": -161.95892333984375, - "loss": 0.5965, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.7780744433403015, - "rewards/margins": 0.3132813572883606, - "rewards/rejected": -1.091355800628662, - "step": 247 - }, - { - "epoch": 0.8947012401352875, - "grad_norm": 25.230217954820528, - "learning_rate": 1.6548421441183872e-08, - "logits/chosen": -2.448786735534668, - "logits/rejected": -2.44523024559021, - "logps/chosen": -159.44529724121094, - "logps/rejected": -174.12574768066406, - "loss": 0.6301, - "rewards/accuracies": 0.4375, - "rewards/chosen": -1.1122872829437256, - "rewards/margins": 0.12879793345928192, - "rewards/rejected": -1.241085171699524, - "step": 248 - }, - { - "epoch": 0.8983089064261556, - "grad_norm": 20.32288376212083, - "learning_rate": 1.5438522365009254e-08, - "logits/chosen": -2.7866921424865723, - "logits/rejected": -2.7852888107299805, - "logps/chosen": -165.46640014648438, - "logps/rejected": -180.37371826171875, - "loss": 0.619, - "rewards/accuracies": 0.4375, - "rewards/chosen": -1.0974924564361572, - "rewards/margins": 0.14256468415260315, - "rewards/rejected": -1.240057110786438, - "step": 249 - }, - { - "epoch": 0.9019165727170236, - "grad_norm": 21.79553933837041, - "learning_rate": 1.4365961368581841e-08, - "logits/chosen": -2.3529977798461914, - "logits/rejected": -2.3767967224121094, - "logps/chosen": -108.47991943359375, - "logps/rejected": -122.56343078613281, - "loss": 0.6175, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.7406535744667053, - "rewards/margins": 0.16251938045024872, - "rewards/rejected": -0.9031729698181152, - "step": 250 - }, - { - "epoch": 0.9055242390078918, - "grad_norm": 17.22780317766937, - "learning_rate": 1.3330909184830263e-08, - "logits/chosen": -2.5356850624084473, - "logits/rejected": -2.568988800048828, - "logps/chosen": -57.6492805480957, - "logps/rejected": -100.61380004882812, - "loss": 0.5941, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.3544650375843048, - "rewards/margins": 0.4384561777114868, - "rewards/rejected": -0.7929211854934692, - "step": 251 - }, - { - "epoch": 0.9091319052987599, - "grad_norm": 24.957707988614608, - "learning_rate": 1.2333530575937029e-08, - "logits/chosen": -2.677913188934326, - "logits/rejected": -2.660062074661255, - "logps/chosen": -161.6991424560547, - "logps/rejected": -202.0394287109375, - "loss": 0.6245, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.0263457298278809, - "rewards/margins": 0.4307071566581726, - "rewards/rejected": -1.4570529460906982, - "step": 252 - }, - { - "epoch": 0.9127395715896279, - "grad_norm": 17.93477521694411, - "learning_rate": 1.1373984307111228e-08, - "logits/chosen": -2.5677995681762695, - "logits/rejected": -2.5746843814849854, - "logps/chosen": -221.53683471679688, - "logps/rejected": -254.741455078125, - "loss": 0.6098, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.3974796533584595, - "rewards/margins": 0.33317381143569946, - "rewards/rejected": -1.7306534051895142, - "step": 253 - }, - { - "epoch": 0.9163472378804961, - "grad_norm": 20.386487290720527, - "learning_rate": 1.0452423121315835e-08, - "logits/chosen": -2.532202959060669, - "logits/rejected": -2.525928497314453, - "logps/chosen": -147.84786987304688, - "logps/rejected": -169.5756072998047, - "loss": 0.6073, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.9036383628845215, - "rewards/margins": 0.2264326810836792, - "rewards/rejected": -1.1300709247589111, - "step": 254 - }, - { - "epoch": 0.9199549041713642, - "grad_norm": 21.284514746737607, - "learning_rate": 9.568993714953817e-09, - "logits/chosen": -2.5790908336639404, - "logits/rejected": -2.5699095726013184, - "logps/chosen": -141.00070190429688, - "logps/rejected": -180.79441833496094, - "loss": 0.6018, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.8232504725456238, - "rewards/margins": 0.4163794219493866, - "rewards/rejected": -1.239629864692688, - "step": 255 - }, - { - "epoch": 0.9235625704622322, - "grad_norm": 19.59754641325857, - "learning_rate": 8.72383671451668e-09, - "logits/chosen": -2.544541835784912, - "logits/rejected": -2.558549404144287, - "logps/chosen": -148.5290985107422, - "logps/rejected": -163.0647430419922, - "loss": 0.6074, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.9852321147918701, - "rewards/margins": 0.14888951182365417, - "rewards/rejected": -1.1341215372085571, - "step": 256 - }, - { - "epoch": 0.9271702367531003, - "grad_norm": 17.745609098186172, - "learning_rate": 7.91708665419899e-09, - "logits/chosen": -2.6848866939544678, - "logits/rejected": -2.6615242958068848, - "logps/chosen": -152.1393280029297, - "logps/rejected": -161.1221923828125, - "loss": 0.606, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.933432936668396, - "rewards/margins": 0.08910448849201202, - "rewards/rejected": -1.0225374698638916, - "step": 257 - }, - { - "epoch": 0.9307779030439685, - "grad_norm": 26.81306547450463, - "learning_rate": 7.1488719544831044e-09, - "logits/chosen": -2.7195591926574707, - "logits/rejected": -2.6725714206695557, - "logps/chosen": -126.39772033691406, - "logps/rejected": -157.39036560058594, - "loss": 0.6266, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.8361244201660156, - "rewards/margins": 0.32916060090065, - "rewards/rejected": -1.1652851104736328, - "step": 258 - }, - { - "epoch": 0.9343855693348365, - "grad_norm": 35.09030640810849, - "learning_rate": 6.4193149016966704e-09, - "logits/chosen": -2.7393150329589844, - "logits/rejected": -2.6686596870422363, - "logps/chosen": -154.73721313476562, - "logps/rejected": -186.57479858398438, - "loss": 0.6025, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.9817075729370117, - "rewards/margins": 0.36847811937332153, - "rewards/rejected": -1.350185751914978, - "step": 259 - }, - { - "epoch": 0.9379932356257046, - "grad_norm": 25.738974731302417, - "learning_rate": 5.728531628546945e-09, - "logits/chosen": -2.7480733394622803, - "logits/rejected": -2.7295284271240234, - "logps/chosen": -112.19759368896484, - "logps/rejected": -118.33749389648438, - "loss": 0.6689, - "rewards/accuracies": 0.25, - "rewards/chosen": -0.821943998336792, - "rewards/margins": 0.03218063712120056, - "rewards/rejected": -0.8541246056556702, - "step": 260 - }, - { - "epoch": 0.9416009019165728, - "grad_norm": 24.346530826675718, - "learning_rate": 5.076632095634525e-09, - "logits/chosen": -2.4755895137786865, - "logits/rejected": -2.496509075164795, - "logps/chosen": -99.9683837890625, - "logps/rejected": -117.90865325927734, - "loss": 0.6272, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.787111759185791, - "rewards/margins": 0.17271503806114197, - "rewards/rejected": -0.9598267674446106, - "step": 261 - }, - { - "epoch": 0.9452085682074408, - "grad_norm": 24.070680603637307, - "learning_rate": 4.463720073949351e-09, - "logits/chosen": -2.4826323986053467, - "logits/rejected": -2.4846949577331543, - "logps/chosen": -156.851318359375, - "logps/rejected": -183.58770751953125, - "loss": 0.6595, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.0629929304122925, - "rewards/margins": 0.24074754118919373, - "rewards/rejected": -1.3037405014038086, - "step": 262 - }, - { - "epoch": 0.9488162344983089, - "grad_norm": 17.887668286746894, - "learning_rate": 3.889893128352334e-09, - "logits/chosen": -2.541724920272827, - "logits/rejected": -2.5193233489990234, - "logps/chosen": -129.2103729248047, - "logps/rejected": -159.85585021972656, - "loss": 0.6144, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.7874460816383362, - "rewards/margins": 0.320261687040329, - "rewards/rejected": -1.1077077388763428, - "step": 263 - }, - { - "epoch": 0.952423900789177, - "grad_norm": 32.765577031425956, - "learning_rate": 3.3552426020447277e-09, - "logits/chosen": -2.727958917617798, - "logits/rejected": -2.687831401824951, - "logps/chosen": -140.69236755371094, - "logps/rejected": -175.51039123535156, - "loss": 0.6057, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.8520128726959229, - "rewards/margins": 0.35299837589263916, - "rewards/rejected": -1.2050111293792725, - "step": 264 - }, - { - "epoch": 0.9560315670800451, - "grad_norm": 18.996572808638184, - "learning_rate": 2.8598536020278673e-09, - "logits/chosen": -2.5644102096557617, - "logits/rejected": -2.558840274810791, - "logps/chosen": -118.72767639160156, - "logps/rejected": -116.29133605957031, - "loss": 0.6396, - "rewards/accuracies": 0.25, - "rewards/chosen": -0.8477441668510437, - "rewards/margins": -0.020649347454309464, - "rewards/rejected": -0.8270947337150574, - "step": 265 - }, - { - "epoch": 0.9596392333709132, - "grad_norm": 19.32088542485326, - "learning_rate": 2.4038049855556185e-09, - "logits/chosen": -2.538405656814575, - "logits/rejected": -2.4929141998291016, - "logps/chosen": -147.37872314453125, - "logps/rejected": -193.4884033203125, - "loss": 0.6089, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.8216827511787415, - "rewards/margins": 0.4939315617084503, - "rewards/rejected": -1.3156142234802246, - "step": 266 - }, - { - "epoch": 0.9632468996617812, - "grad_norm": 20.99364192374622, - "learning_rate": 1.9871693475816966e-09, - "logits/chosen": -2.340470790863037, - "logits/rejected": -2.35207200050354, - "logps/chosen": -172.65672302246094, - "logps/rejected": -195.99395751953125, - "loss": 0.6357, - "rewards/accuracies": 0.4375, - "rewards/chosen": -1.1873472929000854, - "rewards/margins": 0.2546396553516388, - "rewards/rejected": -1.4419869184494019, - "step": 267 - }, - { - "epoch": 0.9668545659526494, - "grad_norm": 26.700062977805967, - "learning_rate": 1.61001300920377e-09, - "logits/chosen": -2.62088680267334, - "logits/rejected": -2.560683488845825, - "logps/chosen": -98.74684143066406, - "logps/rejected": -116.84949493408203, - "loss": 0.6302, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.6034468412399292, - "rewards/margins": 0.20899976789951324, - "rewards/rejected": -0.8124465942382812, - "step": 268 - }, - { - "epoch": 0.9704622322435175, - "grad_norm": 20.606861926741473, - "learning_rate": 1.2723960071064354e-09, - "logits/chosen": -2.5746419429779053, - "logits/rejected": -2.5584211349487305, - "logps/chosen": -159.9401092529297, - "logps/rejected": -183.1726531982422, - "loss": 0.6103, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.1296398639678955, - "rewards/margins": 0.28286370635032654, - "rewards/rejected": -1.4125034809112549, - "step": 269 - }, - { - "epoch": 0.9740698985343855, - "grad_norm": 17.264778853231086, - "learning_rate": 9.743720840042492e-10, - "logits/chosen": -2.7372989654541016, - "logits/rejected": -2.6783485412597656, - "logps/chosen": -114.9549789428711, - "logps/rejected": -155.42819213867188, - "loss": 0.6018, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.848701000213623, - "rewards/margins": 0.40935003757476807, - "rewards/rejected": -1.2580510377883911, - "step": 270 - }, - { - "epoch": 0.9776775648252537, - "grad_norm": 25.582516593462486, - "learning_rate": 7.159886800869874e-10, - "logits/chosen": -2.468820095062256, - "logits/rejected": -2.411522626876831, - "logps/chosen": -104.02911376953125, - "logps/rejected": -127.3437271118164, - "loss": 0.6198, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.5929229259490967, - "rewards/margins": 0.22650952637195587, - "rewards/rejected": -0.8194324374198914, - "step": 271 - }, - { - "epoch": 0.9812852311161218, - "grad_norm": 17.75046437178384, - "learning_rate": 4.972869254679102e-10, - "logits/chosen": -2.5962610244750977, - "logits/rejected": -2.6354269981384277, - "logps/chosen": -112.93864440917969, - "logps/rejected": -131.3079833984375, - "loss": 0.5985, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.7198734879493713, - "rewards/margins": 0.1204710528254509, - "rewards/rejected": -0.8403445482254028, - "step": 272 - }, - { - "epoch": 0.9848928974069898, - "grad_norm": 21.569623268615665, - "learning_rate": 3.1830163363655294e-10, - "logits/chosen": -2.635051727294922, - "logits/rejected": -2.641101598739624, - "logps/chosen": -156.79541015625, - "logps/rejected": -167.7721405029297, - "loss": 0.6375, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.0239731073379517, - "rewards/margins": 0.08604252338409424, - "rewards/rejected": -1.110015630722046, - "step": 273 - }, - { - "epoch": 0.988500563697858, - "grad_norm": 21.827016064838958, - "learning_rate": 1.7906129591713227e-10, - "logits/chosen": -2.6349761486053467, - "logits/rejected": -2.6192984580993652, - "logps/chosen": -153.7158203125, - "logps/rejected": -168.363525390625, - "loss": 0.6502, - "rewards/accuracies": 0.4375, - "rewards/chosen": -1.008318543434143, - "rewards/margins": 0.15721425414085388, - "rewards/rejected": -1.1655328273773193, - "step": 274 - }, - { - "epoch": 0.992108229988726, - "grad_norm": 28.495869362816705, - "learning_rate": 7.958807693311809e-11, - "logits/chosen": -2.4659669399261475, - "logits/rejected": -2.5054712295532227, - "logps/chosen": -132.99371337890625, - "logps/rejected": -171.11233520507812, - "loss": 0.636, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.8174401521682739, - "rewards/margins": 0.3683163523674011, - "rewards/rejected": -1.1857565641403198, - "step": 275 - }, - { - "epoch": 0.9957158962795941, - "grad_norm": 26.978019252673445, - "learning_rate": 1.989781107905597e-11, - "logits/chosen": -2.7785186767578125, - "logits/rejected": -2.788677215576172, - "logps/chosen": -177.7690887451172, - "logps/rejected": -201.444580078125, - "loss": 0.6135, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.1071832180023193, - "rewards/margins": 0.20052564144134521, - "rewards/rejected": -1.3077088594436646, - "step": 276 - }, - { - "epoch": 0.9993235625704623, - "grad_norm": 24.68644117362378, - "learning_rate": 0.0, - "logits/chosen": -2.6498212814331055, - "logits/rejected": -2.6731107234954834, - "logps/chosen": -178.29954528808594, - "logps/rejected": -204.15992736816406, - "loss": 0.631, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.2166697978973389, - "rewards/margins": 0.28650936484336853, - "rewards/rejected": -1.5031790733337402, - "step": 277 - }, - { - "epoch": 0.9993235625704623, - "step": 277, + "epoch": 0.9935483870967742, + "step": 77, "total_flos": 0.0, - "train_loss": 0.6513021779835009, - "train_runtime": 4537.7195, - "train_samples_per_second": 7.818, - "train_steps_per_second": 0.061 + "train_loss": 0.6444497665801605, + "train_runtime": 1967.4135, + "train_samples_per_second": 5.042, + "train_steps_per_second": 0.039 } ], "logging_steps": 1, - "max_steps": 277, + "max_steps": 77, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100,